Пример #1
0
    def __init__(self,
                 provisioner_client,
                 engineclass,
                 controller_name,
                 conf=None,
                 state=None,
                 store=None):
        """Wire up controller state, health monitoring, control and engine.

        :param provisioner_client: client object; stored on the instance and
            handed to ControllerCoreControl
        :param engineclass: engine identifier passed to EngineLoader().load()
        :param controller_name: name forwarded to ControllerCoreControl
        :param conf: optional mapping of configuration values. The value
            under PROVISIONER_VARS_KEY (if present) is forwarded to
            ControllerCoreControl. A truthy value under MONITOR_HEALTH_KEY
            enables a HealthMonitor; HEALTH_BOOT_KEY, HEALTH_MISSING_KEY and
            HEALTH_ZOMBIE_KEY optionally supply its boot_seconds,
            missing_seconds and zombie_seconds arguments.
        :param state: optional state object, used as-is when truthy
        :param store: optional store used to build a ControllerCoreState
            when no state is given; a fresh ControllerStore() is created
            when this is falsy as well
        """
        if state:
            self.state = state
        else:
            self.state = ControllerCoreState(store or ControllerStore())

        prov_vars = None
        health_kwargs = None
        if conf:
            # Membership test via ``in`` rather than has_key(): has_key() was
            # removed in Python 3 and the checks below already use ``in``.
            if PROVISIONER_VARS_KEY in conf:
                prov_vars = conf[PROVISIONER_VARS_KEY]

            if conf.get(MONITOR_HEALTH_KEY):
                # Only thresholds explicitly present in conf are passed on;
                # the rest are left to HealthMonitor's own defaults.
                health_kwargs = {}
                threshold_keys = (
                    (HEALTH_BOOT_KEY, 'boot_seconds'),
                    (HEALTH_MISSING_KEY, 'missing_seconds'),
                    (HEALTH_ZOMBIE_KEY, 'zombie_seconds'),
                )
                for conf_key, kwarg_name in threshold_keys:
                    if conf_key in conf:
                        health_kwargs[kwarg_name] = conf[conf_key]
        self.conf = conf

        # health_kwargs stays None unless monitoring was switched on in conf;
        # an empty dict still means "monitor with default thresholds".
        if health_kwargs is not None:
            self.health_monitor = HealthMonitor(self.state, **health_kwargs)
        else:
            self.health_monitor = None

        # There can only ever be one 'reconfigure' or 'decide' engine call run
        # at ANY time.  The 'decide' call is triggered via timed looping call
        # and 'reconfigure' is triggered asynchronously at any moment.
        self.busy = defer.DeferredSemaphore(1)

        self.provisioner_client = provisioner_client

        health_not_checked = self.health_monitor is None
        self.control = ControllerCoreControl(
            provisioner_client,
            self.state,
            prov_vars,
            controller_name,
            health_not_checked=health_not_checked)
        self.engine = EngineLoader().load(engineclass)

        # No control loop is created here; the attribute starts out unset.
        self.control_loop = None
Пример #2
0
    def test_process_error(self):
        # A heartbeat that reports a failed process must put the node into
        # PROCESS_ERROR, and the recorded error must still carry its stderr
        # text after a later heartbeat that omits it.
        thresholds = dict(boot_seconds=10,
                          missing_seconds=5,
                          zombie_seconds=10,
                          init_time=0)
        self.monitor = HealthMonitor(self.state, **thresholds)

        node_id = str(uuid.uuid4())

        # t=1: instance is RUNNING and sends a healthy heartbeat
        tick = 1
        self.state.new_fake_instance_state(node_id, InstanceStates.RUNNING,
                                           tick)
        yield self.ok_heartbeat(node_id, tick)
        yield self.monitor.update(tick)
        self.assertNodeState(InstanceHealthState.OK, node_id)

        # t=5: heartbeat carrying a single failed process
        tick = 5
        failed_procs = [
            {
                'name': 'proc1',
                'stderr': 'faaaaaail',
                'state': 100,
                'exitcode': -1,
                'stop_timestamp': 25242,
            },
        ]
        yield self.err_heartbeat(node_id, tick, failed_procs)
        yield self.monitor.update(tick)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node_id)
        recorded = self.state.instances[node_id].errors
        self.assertEqual(len(recorded), 1)
        self.assertEqual(recorded[0]['stderr'], 'faaaaaail')

        # The next heartbeat reports the same process without stderr
        failed_procs[0].pop('stderr')

        # t=8: state and the previously recorded stderr are unchanged
        tick = 8
        yield self.err_heartbeat(node_id, tick, failed_procs)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node_id)
        recorded = self.state.instances[node_id].errors
        self.assertEqual(len(recorded), 1)
        self.assertEqual(recorded[0]['stderr'], 'faaaaaail')
Пример #3
0
    def test_error(self):
        # An error heartbeat without process details must put the node into
        # MONITOR_ERROR, and a subsequent monitor update must leave it there.
        thresholds = dict(boot_seconds=10,
                          missing_seconds=5,
                          zombie_seconds=10,
                          init_time=0)
        self.monitor = HealthMonitor(self.state, **thresholds)

        node_id = str(uuid.uuid4())

        # t=1: instance is RUNNING and sends a healthy heartbeat
        tick = 1
        self.state.new_fake_instance_state(node_id, InstanceStates.RUNNING,
                                           tick)
        yield self.ok_heartbeat(node_id, tick)
        yield self.monitor.update(tick)
        self.assertNodeState(InstanceHealthState.OK, node_id)

        # t=5: error heartbeat; state is checked before any monitor update
        tick = 5
        yield self.err_heartbeat(node_id, tick)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node_id)
        recorded = self.state.instances[node_id].errors
        self.assertEqual(len(recorded), 1)
        self.assertEqual(recorded[0], 'faiiiill')

        # Running the monitor at the same timestamp keeps the error state
        yield self.monitor.update(tick)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node_id)
Пример #4
0
    def test_recovery(self):
        # Exercises a monitor that starts (init_time=100) with instances whose
        # IaaS state was recorded before that time, and checks how their
        # health state evolves over updates at t=100, 105, 106, 110 and 111.

        self.monitor = HealthMonitor(self.state,
                                     boot_seconds=10,
                                     missing_seconds=5,
                                     zombie_seconds=10,
                                     init_time=100)
        nodes = ["n" + str(i + 1) for i in range(7)]
        n1, n2, n3, n4, n5, n6, n7 = nodes

        # set up some instances that reached their iaas_state before the
        # init time (100)

        # this one has been running for well longer than the missing timeout
        # and we will have not received a heartbeat. It shouldn't be marked
        # MISSING until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceStates.RUNNING, 50,
                                           InstanceHealthState.OK)

        # this has been running for 10 seconds before the init time but we
        # have never received a heartbeat. It should be marked as MISSING
        # after the boot timeout expires, starting from the init time.
        self.state.new_fake_instance_state(n2, InstanceStates.RUNNING, 90,
                                           InstanceHealthState.UNKNOWN)

        # is terminated and nothing should happen
        self.state.new_fake_instance_state(n3, InstanceStates.TERMINATED, 90,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 110, just before it would be
        # marked MISSING
        self.state.new_fake_instance_state(n4, InstanceStates.RUNNING, 95,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 105, just before it would be
        # marked MISSING
        self.state.new_fake_instance_state(n5, InstanceStates.RUNNING, 95,
                                           InstanceHealthState.OK)

        # this instance was already marked as errored before the recovery
        self.state.new_fake_instance_state(n6, InstanceStates.RUNNING, 95,
                                           InstanceHealthState.PROCESS_ERROR)

        # this instance was a ZOMBIE, it should be initially marked back as
        # UNKNOWN and then if a heartbeat arrives it should be ZOMBIE again
        self.state.new_fake_instance_state(n7, InstanceStates.TERMINATED, 80,
                                           InstanceHealthState.ZOMBIE)

        yield self.monitor.update(100)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        yield self.monitor.update(105)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        # Heartbeat results are yielded, as in the other tests, so that each
        # heartbeat has been processed (and any failure surfaces here) before
        # the next monitor update runs.
        yield self.ok_heartbeat(n5, 105)
        yield self.ok_heartbeat(n7, 105)  # this one will be relabeled as a zombie

        yield self.err_heartbeat(n6, 105, procs=['a'])
        yield self.monitor.update(106)
        self.assertNodeState(InstanceHealthState.OK, n5)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        yield self.ok_heartbeat(n5, 110)
        yield self.monitor.update(110)
        self.assertNodeState(InstanceHealthState.OK, n5)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        yield self.ok_heartbeat(n4, 110)
        yield self.err_heartbeat(n6, 110, procs=['a'])
        yield self.monitor.update(111)
        self.assertNodeState(InstanceHealthState.OK, n5, n4)
        self.assertNodeState(InstanceHealthState.MISSING, n1, n2)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n3)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)
Пример #5
0
    def test_basic(self):
        # Walks three instances through the health states asserted below:
        # UNKNOWN -> OK / MISSING while RUNNING, then OK -> ZOMBIE -> UNKNOWN
        # for the first node after all are TERMINATED.
        # Timestamps are small fake integers, not real clock values.
        thresholds = dict(boot_seconds=10,
                          missing_seconds=5,
                          zombie_seconds=10,
                          init_time=0)
        self.monitor = HealthMonitor(self.state, **thresholds)

        health = InstanceHealthState
        node_ids = [str(uuid.uuid4()) for _ in range(3)]
        first, second, third = node_ids

        # t=0: everything is running, nothing has been heard from yet
        clock = 0
        for node_id in node_ids:
            self.state.new_fake_instance_state(node_id,
                                               InstanceStates.RUNNING, clock)

        self.assertNodeState(health.UNKNOWN, *node_ids)
        yield self.monitor.update(clock)
        self.assertNodeState(health.UNKNOWN, *node_ids)

        # t=5: still silent, still UNKNOWN
        clock = 5
        yield self.monitor.update(clock)
        self.assertNodeState(health.UNKNOWN, *node_ids)

        # the first node sends its first heartbeat
        yield self.ok_heartbeat(first, clock)
        self.assertNodeState(health.OK, first)

        # t=10
        clock = 10
        yield self.monitor.update(clock)

        self.assertNodeState(health.OK, first)
        self.assertNodeState(health.UNKNOWN, second, third)

        # first node reports again just in time; second reports for the
        # first time
        yield self.ok_heartbeat(first, clock)
        yield self.ok_heartbeat(second, clock)

        # t=11: the third node has never reported and is now MISSING
        clock = 11
        yield self.monitor.update(clock)
        self.assertNodeState(health.OK, first, second)
        self.assertNodeState(health.MISSING, third)

        yield self.ok_heartbeat(third, clock)
        self.assertNodeState(health.OK, *node_ids)

        # t=13: only the first node reports; the second stays silent and
        # is expected to go MISSING
        clock = 13
        yield self.ok_heartbeat(first, clock)

        # t=16
        clock = 16
        yield self.monitor.update(clock)
        self.assertNodeState(health.OK, first, third)
        self.assertNodeState(health.MISSING, second)

        yield self.ok_heartbeat(second, clock)
        self.assertNodeState(health.OK, *node_ids)

        # t=20: every node becomes TERMINATED in IaaS
        clock = 20
        for node_id in node_ids:
            self.state.new_fake_instance_state(node_id,
                                               InstanceStates.TERMINATED,
                                               clock)

        # the missing window has elapsed for the first node, but that must
        # not matter for a terminated instance
        yield self.monitor.update(clock)
        self.assertNodeState(health.OK, *node_ids)

        # t=30: a heartbeat from the terminated first node is not yet
        # treated as a zombie
        clock = 30
        yield self.ok_heartbeat(first, clock)
        yield self.monitor.update(clock)
        self.assertNodeState(health.OK, *node_ids)

        # t=31
        clock = 31
        yield self.monitor.update(clock)
        self.assertNodeState(health.OK, first)

        # another heartbeat at t=31 flips the first node to ZOMBIE
        yield self.ok_heartbeat(first, clock)
        yield self.monitor.update(clock)
        self.assertNodeState(health.ZOMBIE, first)

        # t=42: with no further heartbeats it falls back to UNKNOWN
        clock = 42
        yield self.monitor.update(clock)
        self.assertNodeState(health.UNKNOWN, first)