예제 #1
0
class EPUManagementService(object):
    """EPU management service interface.

    Builds an ``EPUManagement`` instance from the "service" and "epumanagement"
    configuration and exposes its operations as dashi handlers.

    See: https://confluence.oceanobservatories.org/display/syseng/CIAD+CEI+OV+Elastic+Computing
    """

    def __init__(self):
        """Load configuration, connect to dashi and assemble the EPUManagement core."""
        configs = ["service", "epumanagement"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)

        self.dashi = bootstrap.dashi_connect(self.CFG.epumanagement.service_name, self.CFG)

        # May be None; the default_user property raises on read in that case.
        self.default_user = self.CFG.epumanagement.get('default_user')

        # TODO: create ION class here or depend on epuagent repo as a dep
        ou_client = MockOUAgentClient()

        statsd_cfg = self.CFG.get('statsd')

        if self.CFG.epumanagement.get('mock_provisioner'):
            prov_client = MockProvisionerClient()
        else:
            provisioner_topic = self.CFG.epumanagement.provisioner_service_name
            prov_client = ProvisionerClient(self.dashi, topic=provisioner_topic, statsd_cfg=statsd_cfg,
                                            client_name="epumanagement")

        self.service_name = self.CFG.epumanagement.get(EPUM_INITIALCONF_SERVICE_NAME, EPUM_DEFAULT_SERVICE_NAME)
        self.proc_name = self.CFG.epumanagement.get(EPUM_INITIALCONF_PROC_NAME, None)

        self.store = get_epum_store(self.CFG, service_name=self.service_name,
            proc_name=self.proc_name)
        self.store.initialize()

        dtrs_client = DTRSClient(self.dashi, statsd_cfg=statsd_cfg, client_name=self.CFG.epumanagement.service_name)

        self.epumanagement = EPUManagement(self.CFG.epumanagement, SubscriberNotifier(self.dashi), prov_client,
                                           ou_client, dtrs_client, store=self.store, statsd_cfg=statsd_cfg)

        # hack to inject epum reference for mock prov client
        if isinstance(prov_client, MockProvisionerClient):
            prov_client._set_epum(self.epumanagement)

    def start(self):
        """Register the dashi operations, load boot-time config and consume messages.

        Blocks until ``dashi.cancel()`` is called.
        """
        epu.dashiproc.link_dashi_exceptions(self.dashi)

        operations = (
            self.subscribe_domain,
            self.unsubscribe_domain,
            self.add_domain,
            self.remove_domain,
            self.list_domains,
            self.describe_domain,
            self.reconfigure_domain,
            self.add_domain_definition,
            self.remove_domain_definition,
            self.list_domain_definitions,
            self.describe_domain_definition,
            self.update_domain_definition,
            self.ou_heartbeat,
            self.instance_info,
        )
        for operation in operations:
            self.dashi.handle(operation)

        # this may spawn some background threads
        self.epumanagement.initialize()

        self._load_initial_definitions()
        self._load_initial_domains()

        # blocks til dashi.cancel() is called
        self.dashi.consume()

    def _load_initial_definitions(self):
        """Best-effort load of the domain definitions listed in the configuration."""
        # hack to load some domain definitions at boot. later this should be client driven.
        initial_definitions = self.CFG.epumanagement.initial_definitions
        for definition_id, definition in initial_definitions.items():
            log.info("Loading Domain Definition %s", definition_id)
            try:
                self.epumanagement.msg_add_domain_definition(definition_id, definition)
            except WriteConflictError:
                log.warning("Conflict while loading domain definition %s. It probably exists.",
                            definition_id, exc_info=True)
            except Exception:
                log.exception("Failed to load Domain Definition %s", definition_id)

    def _load_initial_domains(self):
        """Best-effort load of the domains listed in the configuration."""
        # hack to load some domains at boot. later this should be client driven.
        initial_domains = self.CFG.epumanagement.initial_domains
        for domain_id, params in initial_domains.items():
            log.info("Loading Domain %s", domain_id)
            try:
                # Key lookups live inside the try so one malformed entry is
                # logged and skipped instead of aborting service start.
                definition_id = params['definition']
                config = params['config']
                self.epumanagement.msg_add_domain(self.default_user, domain_id, definition_id, config)
            except WriteConflictError:
                log.warning("Conflict while loading domain %s. It probably exists.",
                            domain_id, exc_info=True)
            except Exception:
                log.exception("Failed to load Domain %s", domain_id)

    @property
    def default_user(self):
        """User substituted when an operation is called without a caller.

        Raises UserNotPermittedError when no default user is configured.
        """
        if not self._default_user:
            msg = "Operation called for the default user, but none is defined."
            raise UserNotPermittedError(msg)
        return self._default_user

    @default_user.setter  # noqa
    def default_user(self, default_user):
        self._default_user = default_user

    def subscribe_domain(self, domain_id, subscriber_name, subscriber_op, caller=None):
        """Subscribe subscriber_name/subscriber_op to a domain; caller falls back to the default user."""
        caller = caller or self.default_user

        self.epumanagement.msg_subscribe_domain(caller, domain_id,
            subscriber_name, subscriber_op)

    def unsubscribe_domain(self, domain_id, subscriber_name, caller=None):
        """Remove a subscriber from a domain; caller falls back to the default user."""
        caller = caller or self.default_user

        self.epumanagement.msg_unsubscribe_domain(caller, domain_id, subscriber_name)

    def list_domains(self, caller=None):
        """Return a list of domains in the system
        """
        caller = caller or self.default_user
        return self.epumanagement.msg_list_domains(caller=caller)

    def describe_domain(self, domain_id, caller=None):
        """Return a state structure for a domain, or None
        """
        caller = caller or self.default_user
        return self.epumanagement.msg_describe_domain(caller, domain_id)

    def add_domain(self, domain_id, definition_id, config, subscriber_name=None,
                subscriber_op=None, caller=None):
        """Create a domain from a definition, optionally with an initial subscriber."""
        caller = caller or self.default_user
        self.epumanagement.msg_add_domain(caller, domain_id, definition_id, config,
            subscriber_name=subscriber_name, subscriber_op=subscriber_op)

    def remove_domain(self, domain_id, caller=None):
        """Remove a domain; caller falls back to the default user."""
        caller = caller or self.default_user
        self.epumanagement.msg_remove_domain(caller, domain_id)

    def reconfigure_domain(self, domain_id, config, caller=None):
        """Apply a new config to a domain; caller falls back to the default user."""
        caller = caller or self.default_user
        self.epumanagement.msg_reconfigure_domain(caller, domain_id, config)

    def list_domain_definitions(self):
        """Return whatever EPUManagement reports as the known domain definitions."""
        return self.epumanagement.msg_list_domain_definitions()

    def describe_domain_definition(self, definition_id):
        """Return EPUManagement's description of one domain definition."""
        return self.epumanagement.msg_describe_domain_definition(definition_id)

    def add_domain_definition(self, definition_id, definition):
        """Register a new domain definition."""
        self.epumanagement.msg_add_domain_definition(definition_id, definition)

    def remove_domain_definition(self, definition_id):
        """Remove a domain definition."""
        self.epumanagement.msg_remove_domain_definition(definition_id)

    def update_domain_definition(self, definition_id, definition):
        """Replace the content of an existing domain definition."""
        self.epumanagement.msg_update_domain_definition(definition_id, definition)

    def ou_heartbeat(self, heartbeat):
        """Forward an OU agent heartbeat to EPUManagement (no caller is passed)."""
        self.epumanagement.msg_heartbeat(None, heartbeat)  # epum parses

    def instance_info(self, record):
        """Forward an instance state record to EPUManagement (no caller is passed)."""
        self.epumanagement.msg_instance_info(None, record)  # epum parses
예제 #2
0
class SubscriberTests(unittest.TestCase):
    """Tests of the notifications EPUManagement sends to domain subscribers.

    Everything runs against mock clients and a LocalEPUMStore; instance state
    changes are injected by calling ``msg_instance_info`` directly.
    """

    # (subscriber_name, subscriber_op) pairs shared by the tests
    _SUBSCRIBER_1 = ("subscriber01_name", "subscriber01_op")
    _SUBSCRIBER_2 = ("subscriber02_name", "subscriber02_op")
    _SUBSCRIBER_3 = ("subscriber03_name", "subscriber03_op")

    def setUp(self):
        # Mock mode:
        initial_conf = {EPUM_INITIALCONF_EXTERNAL_DECIDE: True}
        self.notifier = MockSubscriberNotifier()
        self.provisioner_client = MockProvisionerClient()
        self.dtrs_client = MockDTRSClient()
        self.ou_client = MockOUAgentClient()
        self.epum_store = LocalEPUMStore(EPUM_DEFAULT_SERVICE_NAME)
        self.epum_store.initialize()
        self.epum = EPUManagement(
            initial_conf, self.notifier, self.provisioner_client, self.ou_client,
            self.dtrs_client, store=self.epum_store)

        # For instance-state changes "from the provisioner"
        self.provisioner_client._set_epum(self.epum)

        # For heartbeats "from the OU instance"
        self.ou_client._set_epum(self.epum)

    def _get_simplest_domain_definition(self):
        """Domain definition using SimplestEngine with health monitoring off."""
        engine_class = "epu.decisionengine.impls.simplest.SimplestEngine"
        general = {EPUM_CONF_ENGINE_CLASS: engine_class}
        health = {EPUM_CONF_HEALTH_MONITOR: False}
        return {EPUM_CONF_GENERAL: general, EPUM_CONF_HEALTH: health}

    def _config_simplest_domainconf(self, n_preserving, dt="00_dt_id"):
        """Get 'simplest' domain conf with specified NPreserving policy
        """
        engine = {CONF_PRESERVE_N: n_preserving, "epuworker_type": dt}
        return {EPUM_CONF_ENGINE: engine}

    def _reset(self):
        """Clear everything the mock notifier has recorded so far."""
        self.notifier.notify_by_name_called = 0
        self.notifier.receiver_names = []
        self.notifier.operations = []
        self.notifier.messages = []

    def _mock_checks(self, num_called, idx_check, subscriber_name, subscriber_op, expected_state, expected_domain):
        """Assert the notifier saw num_called notifications and that entry idx_check matches."""
        self.assertEqual(self.notifier.notify_by_name_called, num_called)
        self.assertEqual(len(self.notifier.receiver_names), num_called)
        self.assertEqual(len(self.notifier.operations), num_called)
        self.assertEqual(len(self.notifier.messages), num_called)
        self.assertEqual(self.notifier.receiver_names[idx_check], subscriber_name)
        self.assertEqual(self.notifier.operations[idx_check], subscriber_op)
        self.assertIn("state", self.notifier.messages[idx_check])
        self.assertEqual(self.notifier.messages[idx_check]["state"], expected_state)
        self.assertEqual(self.notifier.messages[idx_check]["domain_id"], expected_domain)

    def _send_instance_state(self, launch_index, state, **extra):
        """Deliver an instance record for the launch_index-th launched instance.

        Any keyword arguments (e.g. public_ip, update_counter) are added to
        the record before it is handed to ``msg_instance_info``.
        """
        content = {"node_id": self.provisioner_client.launched_instance_ids[launch_index],
                   "state": state}
        content.update(extra)
        self.epum.msg_instance_info(None, content)

    def _launch_domain1(self, subscribers=()):
        """Start EPUM, add "domain1" with the given subscribers and check one launch.

        subscribers is a sequence of (subscriber_name, subscriber_op) pairs,
        subscribed in order before the decision cycle runs.
        """
        self._reset()
        self.epum.initialize()
        self.epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 0)

        definition_id = "definition1"
        definition = self._get_simplest_domain_definition()
        self.epum.msg_add_domain_definition(definition_id, definition)
        self.epum.msg_add_domain("owner", "domain1", definition_id, self._config_simplest_domainconf(1))
        for subscriber_name, subscriber_op in subscribers:
            self.epum.msg_subscribe_domain("owner", "domain1", subscriber_name, subscriber_op)

        self.epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 1)
        self.assertEqual(len(self.provisioner_client.launched_instance_ids), 1)
        self.assertEqual(len(self.provisioner_client.deployable_types_launched), 1)
        self.assertEqual(self.provisioner_client.deployable_types_launched[0], "00_dt_id")

        # Launching alone must not notify anyone
        self.assertEqual(self.notifier.notify_by_name_called, 0)

    def _start_then_run(self, launch_index=0):
        """Report STARTED (expecting no notification) and then RUNNING for one instance."""
        # Simulate provisioner
        self._send_instance_state(launch_index, InstanceState.STARTED)
        self.assertEqual(self.notifier.notify_by_name_called, 0)

        # Running signal should be first notification
        self._send_instance_state(launch_index, InstanceState.RUNNING)

    def test_ignore_subscriber(self):
        self._launch_domain1()

        # Simulate provisioner
        self._send_instance_state(0, InstanceState.RUNNING)

        # Nobody subscribed, so nobody is notified
        self.assertEqual(self.notifier.notify_by_name_called, 0)

    def test_one_subscriber(self):
        subscriber_name, subscriber_op = self._SUBSCRIBER_1

        self._launch_domain1([self._SUBSCRIBER_1])
        self._start_then_run()

        self._mock_checks(1, 0, subscriber_name, subscriber_op, InstanceState.RUNNING, "domain1")

    def test_multiple_subscribers(self):
        subscribers = [self._SUBSCRIBER_1, self._SUBSCRIBER_2, self._SUBSCRIBER_3]

        self._launch_domain1(subscribers)
        self._start_then_run()

        for idx, (subscriber_name, subscriber_op) in enumerate(subscribers):
            self._mock_checks(3, idx, subscriber_name, subscriber_op, InstanceState.RUNNING, "domain1")

    def test_multiple_subscribers_multiple_domains(self):
        """Three subscribers, two for one domain, one for another.  One VM for each domain.
        """
        subscriber_name, subscriber_op = self._SUBSCRIBER_1
        subscriber2_name, subscriber2_op = self._SUBSCRIBER_2
        subscriber3_name, subscriber3_op = self._SUBSCRIBER_3

        self._reset()
        self.epum.initialize()
        self.epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 0)

        definition_id = "definition1"
        definition = self._get_simplest_domain_definition()
        self.epum.msg_add_domain_definition(definition_id, definition)
        self.epum.msg_add_domain("owner", "domain1", definition_id, self._config_simplest_domainconf(1))
        self.epum.msg_subscribe_domain("owner", "domain1", subscriber_name, subscriber_op)
        self.epum.msg_subscribe_domain("owner", "domain1", subscriber2_name, subscriber2_op)

        # Subscriber 3 is for a different domain
        self.epum.msg_add_domain("owner", "domain2", definition_id, self._config_simplest_domainconf(1, dt="01_dt_id"))
        self.epum.msg_subscribe_domain("owner", "domain2", subscriber3_name, subscriber3_op)

        self.epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 2)
        self.assertEqual(len(self.provisioner_client.launched_instance_ids), 2)
        self.assertEqual(len(self.provisioner_client.deployable_types_launched), 2)

        # Find out which order these were launched ...
        subscriber3_index = -1
        for i, dt_id in enumerate(self.provisioner_client.deployable_types_launched):
            if dt_id == "01_dt_id":
                subscriber3_index = i
        self.assertNotEqual(subscriber3_index, -1)

        # Now we know which was provisioned first... give opposite index to other one
        subscriber1and2_index = 0 if subscriber3_index else 1

        self.assertEqual(self.provisioner_client.deployable_types_launched[subscriber1and2_index], "00_dt_id")
        self.assertEqual(self.provisioner_client.deployable_types_launched[subscriber3_index], "01_dt_id")

        # No notifications until RUNNING
        self.assertEqual(self.notifier.notify_by_name_called, 0)

        # Simulate provisioner update for BOTH VMs launched
        self._send_instance_state(subscriber1and2_index, InstanceState.STARTED)
        self._send_instance_state(subscriber3_index, InstanceState.STARTED)
        self.assertEqual(self.notifier.notify_by_name_called, 0)

        # Running signal should be first notification, send RUNNING just for 01_dt_id instance (subscriber 3)
        self._send_instance_state(subscriber3_index, InstanceState.RUNNING)
        self._mock_checks(1, 0, subscriber3_name, subscriber3_op, InstanceState.RUNNING, "domain2")

        # Now for 00_dt_id instance (subscribers 1 and 2)
        self._send_instance_state(subscriber1and2_index, InstanceState.RUNNING)
        self._mock_checks(3, 1, subscriber_name, subscriber_op, InstanceState.RUNNING, "domain1")
        self._mock_checks(3, 2, subscriber2_name, subscriber2_op, InstanceState.RUNNING, "domain1")

    def _fail_setup(self):
        """Bring domain1's single instance to RUNNING with subscriber 1 attached."""
        self._launch_domain1([self._SUBSCRIBER_1])
        self._start_then_run()

    def _check_failed_notification(self, state):
        """Send `state` after RUNNING and expect a second, FAILED notification."""
        subscriber_name, subscriber_op = self._SUBSCRIBER_1
        self._fail_setup()
        self._mock_checks(1, 0, subscriber_name, subscriber_op, InstanceState.RUNNING, "domain1")

        # Failing
        self._send_instance_state(0, state)

        # All non-RUNNING notifications should be FAILED
        self._mock_checks(2, 1, subscriber_name, subscriber_op, InstanceState.FAILED, "domain1")

    # The "test_fail*" methods are for checking on notifications after RUNNING.  If the provisioner
    # doesn't 'increase' states, EPUM throws them out, no need to test that scenario.

    def test_fail_650(self):
        self._check_failed_notification(InstanceState.RUNNING_FAILED)

    def test_fail_700(self):
        self._check_failed_notification(InstanceState.TERMINATING)

    def test_fail_800(self):
        self._check_failed_notification(InstanceState.TERMINATED)

    def test_fail_900(self):
        self._check_failed_notification(InstanceState.FAILED)

    def test_updated_node_ip(self):
        subscriber_name, subscriber_op = self._SUBSCRIBER_1

        self._launch_domain1([self._SUBSCRIBER_1])

        domain = self.epum_store.get_domain("owner", "domain1")

        self._send_instance_state(0, InstanceState.STARTED, update_counter=1)

        # (state, public_ip, update_counter, expected notification count, expected stored ip)
        updates = [
            # first RUNNING record triggers the first notification
            (InstanceState.RUNNING, "vm-1234", 2, 1, "vm-1234"),
            # same state with a new IP is stored and notified again
            (InstanceState.RUNNING, "1.2.3.4", 3, 2, "1.2.3.4"),
            # Check that sequential update_counter is respected
            (InstanceState.RUNNING, "localhost", 2, 2, "1.2.3.4"),
            # A state going backwards should not happen, but double-check
            (InstanceState.STARTED, "localhost", 4, 2, "1.2.3.4"),
        ]
        for state, public_ip, update_counter, num_notified, stored_ip in updates:
            self._send_instance_state(0, state, public_ip=public_ip, update_counter=update_counter)

            self._mock_checks(num_notified, 0, subscriber_name, subscriber_op, InstanceState.RUNNING, "domain1")
            instance = domain.get_instance(self.provisioner_client.launched_instance_ids[0])
            self.assertEqual(instance.public_ip, stored_ip)
예제 #3
0
class EPUManagementBasicTests(unittest.TestCase):
    """
    Tests that cover basic things like running a decision engine cycle and making sure a VM
    is requested, etc.
    """

    def setUp(self):
        """Build an EPUManagement wired to mock clients and a local store."""
        # Mock mode:
        epum_conf = {EPUM_INITIALCONF_EXTERNAL_DECIDE: True}

        self.notifier = MockSubscriberNotifier()
        self.provisioner_client = MockProvisionerClient()
        self.ou_client = MockOUAgentClient()
        self.dtrs_client = MockDTRSClient()

        self.epum_store = LocalEPUMStore(EPUM_DEFAULT_SERVICE_NAME)
        self.epum_store.initialize()

        self.epum = EPUManagement(
            epum_conf,
            self.notifier,
            self.provisioner_client,
            self.ou_client,
            self.dtrs_client,
            store=self.epum_store)

        # The provisioner mock reports instance-state changes back to EPUM,
        # the OU agent mock reports heartbeats; both need the EPUM reference.
        for mock_client in (self.provisioner_client, self.ou_client):
            mock_client._set_epum(self.epum)

    def _config_mock1(self):
        """Domain config whose engine section sets CONF_PRESERVE_N to 1.

        Original note: "Keeps increment count".
        """
        return {EPUM_CONF_ENGINE: {CONF_PRESERVE_N: 1}}

    def _definition_mock1(self):
        """Domain definition using MockDecisionEngine01 with health monitoring off."""
        engine_class = MOCK_PKG + ".MockDecisionEngine01"
        return {
            EPUM_CONF_GENERAL: {EPUM_CONF_ENGINE_CLASS: engine_class},
            EPUM_CONF_HEALTH: {EPUM_CONF_HEALTH_MONITOR: False},
        }

    def _definition_mock2(self):
        """Like _definition_mock1 but with MockDecisionEngine02.

        Original note: decide and reconfigure fail.
        """
        mock2 = self._definition_mock1()
        engine_class = MOCK_PKG + ".MockDecisionEngine02"
        mock2[EPUM_CONF_GENERAL] = {EPUM_CONF_ENGINE_CLASS: engine_class}
        return mock2

    def _definition_mock3(self):
        """Like _definition_mock1 but with MockDecisionEngine03.

        Original note: uses Deferred.
        """
        mock3 = self._definition_mock1()
        engine_class = MOCK_PKG + ".MockDecisionEngine03"
        mock3[EPUM_CONF_GENERAL] = {EPUM_CONF_ENGINE_CLASS: engine_class}
        return mock3

    def _get_simplest_domain_definition(self):
        """Domain definition using SimplestEngine with health monitoring off."""
        return {
            EPUM_CONF_GENERAL: {
                EPUM_CONF_ENGINE_CLASS: "epu.decisionengine.impls.simplest.SimplestEngine",
            },
            EPUM_CONF_HEALTH: {EPUM_CONF_HEALTH_MONITOR: False},
        }

    def _config_simplest_domainconf(self, n_preserving):
        """Return a 'simplest' domain conf whose engine preserves n_preserving instances."""
        return {EPUM_CONF_ENGINE: {CONF_PRESERVE_N: n_preserving}}

    def _config_simplest_chef_domainconf(self, n_preserving, chef_credential):
        """Return a 'simplest' domain conf that also names a chef credential.

        The engine section carries the NPreserving value; the general section
        carries chef_credential under EPUM_CONF_CHEF_CREDENTIAL.
        """
        domain_conf = {}
        domain_conf[EPUM_CONF_ENGINE] = {CONF_PRESERVE_N: n_preserving}
        domain_conf[EPUM_CONF_GENERAL] = {EPUM_CONF_CHEF_CREDENTIAL: chef_credential}
        return domain_conf

    def _get_sensor_domain_definition(self):
        """Domain definition using SensorEngine with health monitoring off."""
        return {
            EPUM_CONF_GENERAL: {
                EPUM_CONF_ENGINE_CLASS: "epu.decisionengine.impls.sensor.SensorEngine",
            },
            EPUM_CONF_HEALTH: {EPUM_CONF_HEALTH_MONITOR: False},
        }

    def _config_sensor_domainconf(self, minimum_n):
        """Return a 'sensor' domain conf using the mock aggregator.

        minimum_n is stored as the engine's 'minimum_vms'; the IaaS site,
        allocation and deployable type are placeholder values.
        """
        sensor_engine_conf = {
            CONF_SENSOR_TYPE: 'mockcloudwatch',
            CONF_IAAS_SITE: 'fake',
            CONF_IAAS_ALLOCATION: 'also.fake',
            'deployable_type': 'fake',
            'minimum_vms': minimum_n,
            'metric': 'load',
            'monitor_sensors': ['load'],
            'monitor_domain_sensors': ['queuelen'],
            'sample_function': 'Average',
        }
        return {EPUM_CONF_ENGINE: sensor_engine_conf}

    def test_engine_decide(self):
        """
        Each decision cycle calls the engine's decide exactly once, after a single initialize
        """
        self.epum.initialize()

        owner, domain_id, definition_id = "owner1", "testing123", "def123"
        self.epum.msg_add_domain_definition(definition_id, self._definition_mock1())
        self.epum.msg_add_domain(owner, domain_id, definition_id, self._config_mock1())

        # first cycle
        self.epum._run_decisions()

        # digging into internal structure to get engine instances
        engine_key = (owner, domain_id)
        mock_engine = self.epum.decider.engines[engine_key]
        self.assertNotEqual(mock_engine, None)
        self.assertEqual(mock_engine.initialize_count, 1)
        self.assertEqual(mock_engine.initialize_conf[CONF_PRESERVE_N], 1)
        self.assertEqual(mock_engine.decide_count, 1)

        # second cycle
        self.epum._run_decisions()
        self.assertEqual(mock_engine.decide_count, 2)

    def _compare_configs(self, c1, c2):
        """Assert c1 and c2 have the same keys and equal general, health and engine sections."""
        self.assertEqual(set(c1.keys()), set(c2.keys()))
        for section in (EPUM_CONF_GENERAL, EPUM_CONF_HEALTH, EPUM_CONF_ENGINE):
            self.assertEqual(c1[section], c2[section])

    def test_domain_query(self):
        """Listing and describing domains reflects what was added and launched
        """
        self.epum.initialize()
        caller = "asterix"

        first_def_name, first_name = "onedomaindef", "onedomain"
        first_def = self._definition_mock1()
        first_conf = self._config_mock1()

        second_def_name, second_name = "twodomaindef", "twodomain"
        second_def = self._get_simplest_domain_definition()
        second_conf = self._config_simplest_domainconf(1)

        # nothing registered yet
        self.assertEqual(self.epum.msg_list_domains(caller), [])

        # first domain: mock engine, launches nothing
        self.epum.msg_add_domain_definition(first_def_name, first_def)
        self.epum.msg_add_domain(caller, first_name, first_def_name, first_conf)
        self.assertEqual(self.epum.msg_list_domains(caller), [first_name])

        first_desc = self.epum.msg_describe_domain(caller, first_name)
        self.assertEqual(first_desc['name'], first_name)
        log.debug("domain1 desc: %s", first_desc)
        expected = copy.copy(first_def)
        expected.update(first_conf)
        self._compare_configs(expected, first_desc['config'])
        self.assertEqual(first_desc['instances'], [])

        # second domain: simplest engine
        self.epum.msg_add_domain_definition(second_def_name, second_def)
        self.epum.msg_add_domain(caller, second_name, second_def_name, second_conf)
        listed = self.epum.msg_list_domains(caller)
        self.assertEqual(set(listed), {first_name, second_name})

        # this will cause domain2 to launch an instance
        self.epum._run_decisions()

        second_desc = self.epum.msg_describe_domain(caller, second_name)
        self.assertEqual(second_desc['name'], second_name)
        expected = copy.copy(second_def)
        expected.update(second_conf)
        self._compare_configs(expected, second_desc['config'])
        self.assertEqual(len(second_desc['instances']), 1)

        # just make sure it looks roughly like a real instance
        launched = second_desc['instances'][0]
        for field in ("instance_id", "state"):
            self.assertIn(field, launched)

    def test_sensor_data(self):
        """Verify sensor data shows up in a domain description.

        After the first decision cycle the launched instance has no
        "sensor_data"; after a second cycle both the domain and the instance
        descriptions carry it.
        """
        epum = self.epum
        epum.initialize()
        caller = "asterix"
        definition_name = "twodomaindef"
        definition = self._get_sensor_domain_definition()
        config = self._config_sensor_domainconf(1)
        domain_name = "twodomain"

        def describe():
            # fetch the description and check it names the right domain
            desc = epum.msg_describe_domain(caller, domain_name)
            self.assertEqual(desc['name'], domain_name)
            return desc

        def check_config(desc):
            # expected config is the definition overlaid with the domain config
            expected = copy.copy(definition)
            expected.update(config)
            self._compare_configs(expected, desc['config'])

        self.assertEqual(epum.msg_list_domains(caller), [])

        epum.msg_add_domain_definition(definition_name, definition)
        epum.msg_add_domain(caller, domain_name, definition_name, config)
        self.assertEqual(epum.msg_list_domains(caller), [domain_name])

        described = describe()
        log.debug("domain desc: %s", described)
        check_config(described)
        self.assertEqual(described['instances'], [])

        # this decision cycle makes the domain launch an instance
        epum._run_decisions()

        described = describe()
        check_config(described)
        self.assertEqual(len(described['instances']), 1)

        # the instance looks real but carries no sensor data yet
        launched = described['instances'][0]
        self.assertIn("instance_id", launched)
        self.assertIn("state", launched)
        self.assertNotIn("sensor_data", launched)
        epum._run_decisions()

        described = describe()
        check_config(described)
        self.assertEqual(len(described['instances']), 1)

        # domain-level sensor data is now present
        self.assertIn("sensor_data", described)
        self.assertIn("queuelen", described['sensor_data'])
        self.assertIn(Statistics.SERIES, described['sensor_data']['queuelen'])

        # ...and so is instance-level sensor data
        launched = described['instances'][0]
        self.assertIn("instance_id", launched)
        self.assertIn("state", launched)
        self.assertIn("sensor_data", launched)
        self.assertIn("load", launched['sensor_data'])
        self.assertIn(Statistics.SERIES, launched['sensor_data']['load'])

    def test_engine_reconfigure(self):
        """
        Check that an engine's reconfigure count goes up exactly once, on the
        decision cycle following a domain reconfigure, and only for the
        reconfigured domain.
        """
        epum = self.epum
        epum.initialize()
        definition = self._definition_mock1()
        initial_conf = self._config_mock1()
        owner = "emily"
        definition_id = "def123"
        changed_domain, untouched_domain = "testing123", "testing789"
        epum.msg_add_domain_definition(definition_id, definition)
        for domain in (changed_domain, untouched_domain):
            epum.msg_add_domain(owner, domain, definition_id, initial_conf)
        epum._run_decisions()

        # reach into the decider internals for the per-domain engine instances
        engines = epum.decider.engines
        changed_engine = engines[(owner, changed_domain)]
        untouched_engine = engines[(owner, untouched_domain)]
        self.assertEqual(changed_engine.decide_count, 1)
        self.assertEqual(untouched_engine.decide_count, 1)

        def check_reconfigures(changed, untouched):
            self.assertEqual(changed_engine.reconfigure_count, changed)
            self.assertEqual(untouched_engine.reconfigure_count, untouched)

        # no reconfigure has been requested yet
        check_reconfigures(0, 0)
        new_conf = {EPUM_CONF_ENGINE: {CONF_PRESERVE_N: 2}}
        epum.msg_reconfigure_domain(owner, changed_domain, new_conf)

        # the request alone must not reach the engine; that waits for a decision cycle
        check_reconfigures(0, 0)

        epum._run_decisions()

        # after one cycle only the reconfigured domain's engine has seen it
        check_reconfigures(1, 0)

        # a further cycle must not reconfigure again
        epum._run_decisions()
        check_reconfigures(1, 0)

    def test_basic_npreserving(self):
        """
        A single domain configured to preserve N=2 must cause exactly two
        provision requests on its first decision cycle.
        """
        epum = self.epum
        epum.initialize()
        preserve_two = self._config_simplest_domainconf(2)
        empty_definition = {}
        epum.msg_add_domain_definition("definition1", empty_definition)
        epum.msg_add_domain("owner1", "testing123", "definition1", preserve_two)
        epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 2)

    def test_basic_chef_domain(self):
        """
        A chef-configured domain preserving N=2 must launch two instances,
        each carrying the chef credential name in its provisioner vars.
        """
        epum = self.epum
        provisioner = self.provisioner_client
        epum.initialize()
        chef_conf = self._config_simplest_chef_domainconf(2, "chef1")
        empty_definition = {}
        epum.msg_add_domain_definition("definition1", empty_definition)
        epum.msg_add_domain("owner1", "testing123", "definition1", chef_conf)
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 2)
        # the credential name has to be passed through on both launches
        for index in (0, 1):
            self.assertEqual(provisioner.launches[index]['vars']['chef_credential'], 'chef1')

    def test_reconfigure_npreserving(self):
        """
        Start one domain at NPreserving=2 and check two provisions happen.
        Reconfigure to NPreserving=4 and check two more provisions follow.
        Reconfigure to NPreserving=0 and check all four are terminated.
        Each change is expected to take effect on the next decision cycle.
        """
        epum = self.epum
        provisioner = self.provisioner_client
        epum.initialize()
        owner = "opwner1"
        definition_id = "def123"
        definition = self._get_simplest_domain_definition()
        domain_name = "testing123"
        starting_conf = self._config_simplest_domainconf(2)

        epum.msg_add_domain_definition(definition_id, definition)
        epum.msg_add_domain(owner, domain_name, definition_id, starting_conf)
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 2)
        self.assertEqual(provisioner.terminate_node_count, 0)

        # (new preserve_n, cumulative provisions, cumulative terminations)
        steps = ((4, 4, 0), (0, 4, 4))
        for preserve_n, provisions, terminations in steps:
            next_conf = self._config_simplest_domainconf(preserve_n)
            epum.msg_reconfigure_domain(owner, domain_name, next_conf)
            epum._run_decisions()
            self.assertEqual(provisioner.provision_count, provisions)
            self.assertEqual(provisioner.terminate_node_count, terminations)

    def test_decider_leader_disable(self):
        """
        Start one domain at NPreserving=2; two instances launch on the first cycle.
        Drop to NPreserving=1; one instance is terminated on the second cycle.
        Disable the leader through epum internals.
        Raise to NPreserving=4; nothing must happen while disabled.
        Re-enable the leader through epum internals.
        The pending reconfiguration is then picked up.

        This only holds for the in-memory setup: otherwise another EPUM worker
        would become the decider and act on the reconfiguration.
        """
        epum = self.epum
        provisioner = self.provisioner_client

        def check_counts(provisions, terminations):
            self.assertEqual(provisioner.provision_count, provisions)
            self.assertEqual(provisioner.terminate_node_count, terminations)

        def reconfigure_and_decide(preserve_n):
            conf = self._config_simplest_domainconf(preserve_n)
            epum.msg_reconfigure_domain(owner, domain_name, conf)
            epum._run_decisions()

        epum.initialize()
        definition_name = "def123"
        definition = self._get_simplest_domain_definition()
        owner = "opwner1"
        domain_name = "testing123"
        starting_conf = self._config_simplest_domainconf(2)

        epum.msg_add_domain_definition(definition_name, definition)
        epum.msg_add_domain(owner, domain_name, definition_name, starting_conf)
        epum._run_decisions()
        check_counts(2, 0)

        reconfigure_and_decide(1)
        check_counts(2, 1)

        # reach into the store internals to switch the leader off
        epum.epum_store._change_decider(False)

        # with no leader the counts must stay at provision=2, terminate=1
        reconfigure_and_decide(4)
        check_counts(2, 1)

        # reach into the store internals to switch the leader back on
        epum.epum_store._change_decider(True)

        # the earlier preserve-4 reconfigure should be acted on by this cycle
        epum._run_decisions()

        # going from N=1 to N=4 takes 3 more provisions, 5 in total
        check_counts(5, 1)

    def test_instance_lookup(self):
        """
        Run NPreserving=1 in each of two domains, then look domains up by
        instance_id and check the owning domain comes back. Some incoming
        service messages, such as heartbeats, carry only the instance_id and
        not the domain it belongs to.
        """
        epum = self.epum
        provisioner = self.provisioner_client
        epum.initialize()
        definition_id = "definition1"
        definition = self._get_simplest_domain_definition()
        preserve_one = self._config_simplest_domainconf(1)
        owner = "owner1"
        first_domain, second_domain = "domain1", "domain2"
        epum.msg_add_domain_definition(definition_id, definition)

        epum.msg_add_domain(owner, first_domain, definition_id, preserve_one)
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 1)
        self.assertEqual(len(provisioner.launched_instance_ids), 1)
        first_instance_id = provisioner.launched_instance_ids[0]

        epum.msg_add_domain(owner, second_domain, definition_id, preserve_one)
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 2)
        self.assertEqual(len(provisioner.launched_instance_ids), 2)
        second_instance_id = provisioner.launched_instance_ids[1]

        # each instance id has to resolve to the domain that launched it
        found_first = epum.epum_store.get_domain_for_instance_id(first_instance_id)
        found_second = epum.epum_store.get_domain_for_instance_id(second_instance_id)

        self.assertEqual(found_first.domain_id, first_domain)
        self.assertEqual(found_second.domain_id, second_domain)

    def test_decider_retries(self):
        """
        Check that provision and terminate requests are re-sent once the
        retry interval has passed, and only for the instance whose state was
        not reported back.
        """
        epum = self.epum
        provisioner = self.provisioner_client
        epum.initialize()
        definition_id = "definition1"
        definition = self._get_simplest_domain_definition()
        preserve_two = self._config_simplest_domainconf(2)
        owner = "owner1"
        domain_name = "domain1"
        epum.msg_add_domain_definition(definition_id, definition)
        epum.msg_add_domain(owner, domain_name, definition_id, preserve_two)
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 2)
        self.assertEqual(len(provisioner.launched_instance_ids), 2)

        # shorten the retry interval inside the decider so the test runs quickly
        for control in epum.decider.controls.values():
            control._retry_seconds = 0.5

        # an immediate second cycle must not retry anything
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 2)
        self.assertEqual(len(provisioner.launched_instance_ids), 2)

        # report REQUESTED for the first instance; it should not be retried
        provisioner.report_node_state(
            InstanceState.REQUESTED, provisioner.launched_instance_ids[0])

        # sleep past the retry interval; the second instance is launched again
        time.sleep(0.6)
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 3)
        self.assertEqual(len(set(provisioner.launched_instance_ids)), 2)
        self.assertEqual(provisioner.launched_instance_ids[1],
                         provisioner.launched_instance_ids[2])

        # now scale to zero so both instances get terminated
        preserve_none = self._config_simplest_domainconf(0)
        epum.msg_reconfigure_domain(owner, domain_name, preserve_none)
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 3)
        self.assertEqual(provisioner.terminate_node_count, 2)
        self.assertEqual(len(provisioner.terminated_instance_ids), 2)

        # an immediate extra cycle must not retry the terminations
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 3)
        self.assertEqual(provisioner.terminate_node_count, 2)
        self.assertEqual(len(provisioner.terminated_instance_ids), 2)

        # report TERMINATED for the first instance; it should not be retried
        provisioner.report_node_state(
            InstanceState.TERMINATED, provisioner.terminated_instance_ids[0])

        # sleep past the retry interval; the second termination is re-sent once
        time.sleep(0.6)
        epum._run_decisions()
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 3)
        self.assertEqual(provisioner.terminate_node_count, 3)
        self.assertEqual(len(provisioner.terminated_instance_ids), 3)
        self.assertEqual(provisioner.terminated_instance_ids[1],
                         provisioner.terminated_instance_ids[2])

    def test_failing_engine_decide(self):
        """An exception raised during a decide cycle should not affect EPUM.
        """
        epum = self.epum
        epum.initialize()
        failing_definition = self._definition_mock2()
        failing_definition_id = "fail_definition"
        domain_conf = self._config_mock1()
        epum.msg_add_domain_definition(failing_definition_id, failing_definition)
        epum.msg_add_domain("joeowner", "fail_domain", failing_definition_id, domain_conf)
        epum._run_decisions()
        # reach into the decider internals for the engine instance
        engine = epum.decider.engines[("joeowner", "fail_domain")]
        self.assertEqual(engine.decide_count, 1)

    def test_failing_engine_reconfigure(self):
        """An exception raised during engine reconfigure should not affect EPUM.
        """
        epum = self.epum
        epum.initialize()
        failing_definition = self._definition_mock2()
        failing_definition_id = "fail_definition"
        domain_conf = self._config_mock1()
        epum.msg_add_domain_definition(failing_definition_id, failing_definition)
        epum.msg_add_domain("owner", "fail_domain", failing_definition_id, domain_conf)
        epum._run_decisions()

        # reach into the decider internals for the engine instance
        engine = epum.decider.engines[("owner", "fail_domain")]
        self.assertEqual(engine.decide_count, 1)
        self.assertEqual(engine.reconfigure_count, 0)

        # the next cycle should count both a reconfigure and another decide
        changed_conf = {EPUM_CONF_ENGINE: {CONF_PRESERVE_N: 2}}
        epum.msg_reconfigure_domain("owner", "fail_domain", changed_conf)
        epum._run_decisions()
        self.assertEqual(engine.decide_count, 2)
        self.assertEqual(engine.reconfigure_count, 1)

    def test_remove_domain(self):
        """
        Removing a domain must terminate the instances it launched.
        """
        epum = self.epum
        provisioner = self.provisioner_client
        epum.initialize()
        preserve_two = self._config_simplest_domainconf(2)
        definition_id = "def123"
        definition = self._get_simplest_domain_definition()
        epum.msg_add_domain_definition(definition_id, definition)
        epum.msg_add_domain("owner1", "testing123", definition_id, preserve_two)
        epum._run_decisions()
        self.assertEqual(provisioner.provision_count, 2)

        # the removal is acted on by the following decision cycle
        epum.msg_remove_domain("owner1", "testing123")
        epum._run_decisions()
        self.assertEqual(provisioner.terminate_node_count, 2)

    def test_multiuser(self):
        """Ensure that multiuser checks are working.

        A domain is created by one user. For a different user, describe,
        reconfigure and remove must each raise NotFoundError and the domain
        list must be empty, while the owner can perform all of them.
        """
        # The two users must be distinct; with identical names the
        # "disallowed" calls below would be made by the owner itself.
        permitted_user = "asterix"
        disallowed_user = "cacaphonix"

        self.epum.initialize()

        # TODO: test adding with a dt that user doesn't own
        definition_id = "def123"
        definition = self._definition_mock1()
        domain_config = self._config_mock1()
        domain_name = "testing123"
        self.epum.msg_add_domain_definition(definition_id, definition)
        self.epum.msg_add_domain(permitted_user, domain_name, definition_id, domain_config)

        # Test describe
        try:
            self.epum.msg_describe_domain(disallowed_user, domain_name)
        except NotFoundError:
            pass
        else:
            self.fail("Non-permitted user was able to describe an domain he didn't own!")

        self.epum.msg_describe_domain(permitted_user, domain_name)

        # Test list
        disallowed_domains = self.epum.msg_list_domains(disallowed_user)
        self.assertEqual(len(disallowed_domains), 0)

        permitted_domains = self.epum.msg_list_domains(permitted_user)
        self.assertEqual(len(permitted_domains), 1)

        # Test reconfigure
        new_config = {}
        try:
            self.epum.msg_reconfigure_domain(disallowed_user, domain_name, new_config)
        except NotFoundError:
            pass
        else:
            self.fail("Non-permitted user was able to reconfigure an domain he didn't own!")

        self.epum.msg_reconfigure_domain(permitted_user, domain_name, new_config)

        # Test Remove
        try:
            self.epum.msg_remove_domain(disallowed_user, domain_name)
        except NotFoundError:
            pass
        else:
            self.fail("Non-permitted user was able to remove an domain he didn't own!")

        self.epum.msg_remove_domain(permitted_user, domain_name)

    def test_definitions(self):
        """Exercise add, describe, list, remove and update of domain definitions."""
        epum = self.epum
        epum.initialize()

        first_name, first_body = "definition1", self._definition_mock1()
        second_name, second_body = "definition2", self._definition_mock2()

        epum.msg_add_domain_definition(first_name, first_body)

        # Re-using an existing definition name has to be rejected
        try:
            epum.msg_add_domain_definition(first_name, second_body)
        except WriteConflictError:
            pass
        else:
            self.fail("expected WriteConflictError")

        epum.msg_add_domain_definition(second_name, second_body)

        # each definition is described under its own name with its own body
        for name, body in ((first_name, first_body), (second_name, second_body)):
            described = epum.msg_describe_domain_definition(name)
            self.assertEqual(described['name'], name)
            self.assertEqual(described['definition'], body)

        listed = epum.msg_list_domain_definitions()
        self.assertEqual(len(listed), 2)
        for name in (first_name, second_name):
            self.assertIn(name, listed)

        # once removed, a definition can be neither described nor removed again
        epum.msg_remove_domain_definition(first_name)
        try:
            epum.msg_describe_domain_definition(first_name)
        except NotFoundError:
            pass
        else:
            self.fail("expected NotFoundError")

        try:
            epum.msg_remove_domain_definition(first_name)
        except NotFoundError:
            pass
        else:
            self.fail("expected NotFoundError")

        # updating swaps the stored body but keeps the name
        epum.msg_update_domain_definition(second_name, first_body)
        updated = epum.msg_describe_domain_definition(second_name)
        self.assertEqual(updated['name'], second_name)
        self.assertEqual(updated['definition'], first_body)

    def test_config_validation(self):
        """Adding a domain with an empty engine config must raise ValueError,
        while a complete config for the same domain name is accepted.
        """
        caller = "asterix"
        epum = self.epum
        epum.initialize()

        definition_name = "def123"
        definition = self._get_simplest_domain_definition()

        incomplete_config = {EPUM_CONF_ENGINE: {}}
        complete_config = self._config_simplest_domainconf(1)

        epum.msg_add_domain_definition(definition_name, definition)

        # A config missing required engine parameters has to be rejected
        try:
            epum.msg_add_domain(caller, "domain", definition_name, incomplete_config)
        except ValueError:
            pass
        else:
            self.fail("expected ValueError")

        epum.msg_add_domain(caller, "domain", definition_name, complete_config)

    def test_engine_config_doc(self):
        """A described domain definition must include a "documentation" entry."""
        self.epum.initialize()

        definition_name = "def123"
        definition = self._get_simplest_domain_definition()

        self.epum.msg_add_domain_definition(definition_name, definition)
        desc = self.epum.msg_describe_domain_definition(definition_name)
        # assertIn reports the container contents on failure, unlike assertTrue
        self.assertIn("documentation", desc)

    def test_reaper(self):
        """
        Check the reaper loop drops instance records that are both in a
        terminal state and older than the default max age, and keeps the rest.
        """
        self.epum.initialize()
        config = self._config_mock1()
        owner = "owner1"
        domain_id = "testing123"

        # put the fake domain store in place directly, bypassing msg_add_domain()
        self.state = FakeDomainStore(owner, domain_id, config)
        self.epum.epum_store.domains[(owner, domain_id)] = self.state

        now = time.time()
        outdated = now - EPUM_RECORD_REAPING_DEFAULT_MAX_AGE - 1
        recent = now - EPUM_RECORD_REAPING_DEFAULT_MAX_AGE + 60

        fixtures = (
            # one running instance with an outdated timestamp
            ("n1", InstanceState.RUNNING, outdated),
            # three terminal-state instances that are outdated
            ("n2", InstanceState.TERMINATED, outdated),
            ("n3", InstanceState.REJECTED, outdated),
            ("n4", InstanceState.FAILED, outdated),
            # three terminal-state instances that are not outdated yet
            ("n5", InstanceState.TERMINATED, recent),
            ("n6", InstanceState.REJECTED, recent),
            ("n7", InstanceState.FAILED, recent),
        )
        for node_id, node_state, stamp in fixtures:
            self.state.new_fake_instance_state(node_id, node_state, stamp)

        self.epum._run_reaper_loop()
        remaining = self.state.get_instance_ids()
        self.assertEqual(len(remaining), 4)
        for survivor in ("n1", "n5", "n6", "n7"):
            self.assertIn(survivor, remaining)

    def test_instance_update_conflict_1(self):
        """Competing update where the sneaked-in state is PENDING.

        A PENDING update is slipped in just before the requested STARTED
        update is written; the instance must end up in the STARTED state.
        """

        self.epum.initialize()
        domain_config = self._config_simplest_domainconf(1)
        definition = {}
        self.epum.msg_add_domain_definition("definition1", definition)
        self.epum.msg_add_domain("owner1", "testing123", "definition1", domain_config)
        self.epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 1)

        domain = self.epum_store.get_domain("owner1", "testing123")

        instance_id = self.provisioner_client.launched_instance_ids[0]
        # the recorded launch is expected to carry a launch id
        self.assertIn('launch_id', self.provisioner_client.launches[0])

        sneaky_msg = dict(node_id=instance_id, state=InstanceState.PENDING)

        # patch in a function that sneaks in an instance record update just
        # before a requested update. This simulates the case where two EPUM
        # workers are competing to update the same instance.
        original_new_instance_state = domain.new_instance_state

        patch_called = threading.Event()

        def patched_new_instance_state(content, timestamp=None, previous=None):
            patch_called.set()

            # unpatch ourself first so we don't recurse forever
            domain.new_instance_state = original_new_instance_state

            domain.new_instance_state(sneaky_msg, previous=previous)
            return domain.new_instance_state(content, timestamp=timestamp, previous=previous)
        domain.new_instance_state = patched_new_instance_state

        # send our "real" update. should get a conflict
        msg = dict(node_id=instance_id, state=InstanceState.STARTED)

        self.epum.msg_instance_info("owner1", msg)

        # use a unittest assertion: a bare assert is stripped under python -O
        self.assertTrue(patch_called.is_set())

        # the real update (STARTED) is what must end up recorded
        instance = domain.get_instance(instance_id)
        self.assertEqual(instance.state, InstanceState.STARTED)

    def test_instance_update_conflict_2(self):
        """Competing update where the sneaked-in state is STARTED.

        A STARTED update is slipped in just before the requested PENDING
        update is written; the instance must end up in the STARTED state.
        """

        self.epum.initialize()
        domain_config = self._config_simplest_domainconf(1)
        definition = {}
        self.epum.msg_add_domain_definition("definition1", definition)
        self.epum.msg_add_domain("owner1", "testing123", "definition1", domain_config)
        self.epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 1)

        domain = self.epum_store.get_domain("owner1", "testing123")

        instance_id = self.provisioner_client.launched_instance_ids[0]
        # the recorded launch is expected to carry a launch id
        self.assertIn('launch_id', self.provisioner_client.launches[0])

        sneaky_msg = dict(node_id=instance_id, state=InstanceState.STARTED)

        # patch in a function that sneaks in an instance record update just
        # before a requested update. This simulates the case where two EPUM
        # workers are competing to update the same instance.
        original_new_instance_state = domain.new_instance_state

        patch_called = threading.Event()

        def patched_new_instance_state(content, timestamp=None, previous=None):
            patch_called.set()

            # unpatch ourself first so we don't recurse forever
            domain.new_instance_state = original_new_instance_state

            domain.new_instance_state(sneaky_msg, previous=previous)
            return domain.new_instance_state(content, timestamp=timestamp, previous=previous)
        domain.new_instance_state = patched_new_instance_state

        # send our "real" update. should get a conflict
        msg = dict(node_id=instance_id, state=InstanceState.PENDING)

        self.epum.msg_instance_info(None, msg)

        # use a unittest assertion: a bare assert is stripped under python -O
        self.assertTrue(patch_called.is_set())

        # in this case the sneaky message (STARTED) should win because it is
        # the later state
        instance = domain.get_instance(instance_id)
        self.assertEqual(instance.state, InstanceState.STARTED)
예제 #4
0
class HeartbeatMonitorTests(unittest.TestCase):
    def setUp(self):
        """Create an EPUManagement wired to mock clients and a fake domain.

        The fake domain store is registered under ``self.domain_key`` in the
        EPUM store directly, bypassing msg_add_domain().
        """
        self.domain_owner = "david"
        self.domain_name = "epuX"
        self.domain_key = (self.domain_owner, self.domain_name)
        self.state = FakeDomainStore(
            self.domain_owner, self.domain_name,
            self._dom_config(health_init_time=100))

        # collaborators handed to EPUManagement
        self.notifier = MockSubscriberNotifier()
        self.provisioner_client = MockProvisionerClient()
        self.dtrs_client = MockDTRSClient()
        self.ou_client = MockOUAgentClient()
        self.epum_store = LocalEPUMStore(EPUM_DEFAULT_SERVICE_NAME)
        self.epum_store.initialize()

        self.epum = EPUManagement(
            {EPUM_INITIALCONF_EXTERNAL_DECIDE: True},
            self.notifier, self.provisioner_client,
            self.ou_client, self.dtrs_client,
            store=self.epum_store)

        # hand the EPUM reference to the mock clients
        for mock_client in (self.provisioner_client, self.ou_client):
            mock_client._set_epum(self.epum)

        # inject the FakeState instance directly instead of using msg_add_domain()
        self.epum.epum_store.domains[self.domain_key] = self.state

    def _dom_config(self, health_init_time=0):
        """Return a domain config dict with general, engine and health sections.

        ``health_init_time`` is stored under TESTCONF_HEALTH_INIT_TIME in the
        health section; every other value is fixed.
        """
        health_section = {
            EPUM_CONF_HEALTH_MONITOR: True,
            EPUM_CONF_HEALTH_BOOT: 10,
            EPUM_CONF_HEALTH_MISSING: 5,
            EPUM_CONF_HEALTH_ZOMBIE: 10,
            EPUM_CONF_HEALTH_REALLY_MISSING: 3,
            TESTCONF_HEALTH_INIT_TIME: health_init_time,
        }
        return {
            EPUM_CONF_GENERAL: {
                EPUM_CONF_ENGINE_CLASS: MOCK_PKG + ".MockDecisionEngine01",
            },
            EPUM_CONF_ENGINE: {CONF_PRESERVE_N: 1},
            EPUM_CONF_HEALTH: health_section,
        }

    def test_recovery(self):
        """Check health tracking for instances that predate the init time.

        Seven fake instances are created with timestamps before the health
        init time (100). The doctor is then run at times 100, 105, 106, 110
        and 111, with heartbeats sent in between, and the health state of
        every node is asserted after each run.
        """
        self.epum.initialize()
        dom_config = self._dom_config(health_init_time=100)
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, dom_config)

        nodes = ["n" + str(i + 1) for i in range(7)]
        n1, n2, n3, n4, n5, n6, n7 = nodes

        # set up some instances that reached their iaas_state before the
        # init time (100)

        # this one has been running for well longer than the missing timeout
        # and we will have not received a heartbeat. It shouldn't be marked
        # OUT_OF_CONTACT until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceState.RUNNING, 50,
                                           InstanceHealthState.OK)

        # this has been running for 10 seconds before the init time but we
        # have never received a heartbeat. It should be marked as OUT_OF_CONTACT
        # after the boot timeout expires, starting from the init time.
        self.state.new_fake_instance_state(n2, InstanceState.RUNNING, 90,
                                           InstanceHealthState.UNKNOWN)

        # is terminated and nothing should happen
        self.state.new_fake_instance_state(n3, InstanceState.TERMINATED, 90,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 110, just before it would be
        # marked OUT_OF_CONTACT
        self.state.new_fake_instance_state(n4, InstanceState.RUNNING, 95,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 105, just before it would be
        # marked OUT_OF_CONTACT
        self.state.new_fake_instance_state(n5, InstanceState.RUNNING, 95,
                                           InstanceHealthState.OK)

        # this instance was already marked as errored before the recovery
        self.state.new_fake_instance_state(n6, InstanceState.RUNNING, 95,
                                           InstanceHealthState.PROCESS_ERROR)

        # this instance was a ZOMBIE, it should be initially marked back as
        # UNKNOWN and then if a heartbeat arrives it should be ZOMBIE again
        self.state.new_fake_instance_state(n7, InstanceState.TERMINATED, 80,
                                           InstanceHealthState.ZOMBIE)

        self.epum._doctor_appt(100)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        # nothing should have changed by time 105
        self.epum._doctor_appt(105)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        self.ok_heartbeat(n5, 105)
        self.ok_heartbeat(n7, 105)  # this one will be relabeled as a zombie

        self.err_heartbeat(n6, 105, procs=['a'])
        self.epum._doctor_appt(106)
        self.assertNodeState(InstanceHealthState.OK, n5)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n5, 110)
        self.epum._doctor_appt(110)
        self.assertNodeState(InstanceHealthState.OK, n5)

        # n1 has now been "out of contact" too long and is past the "really missing"
        # threshold, so it should now be MISSING
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n4, 110)
        self.err_heartbeat(n6, 110, procs=['a'])
        self.epum._doctor_appt(111)
        self.assertNodeState(InstanceHealthState.OK, n5, n4)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n2)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n3)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

    def test_basic(self):
        """Walk three nodes through the basic health state transitions.

        Uses small integer timestamps instead of real time. The nodes start
        UNKNOWN, become OK on heartbeat, go OUT_OF_CONTACT when heartbeats
        stop, and after termination n1 is checked for ZOMBIE and then UNKNOWN.
        """
        self.epum.initialize()
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, self._dom_config())

        nodes = [str(uuid.uuid4()) for _ in range(3)]
        n1, n2, n3 = nodes

        # not using real timestamps
        now = 0

        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceState.RUNNING, now)

        # all nodes are running but haven't been heard from
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)
        self.epum._doctor_appt(now)
        self.assertEqual(0, self.epum.doctor.monitors[self.domain_key].init_time)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        now = 5
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        # first heartbeat to n1
        self.ok_heartbeat(n1, now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        now = 10
        self.epum._doctor_appt(now)

        self.assertNodeState(InstanceHealthState.OK, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3)

        self.ok_heartbeat(n1, now)  # n1 makes it in under the wire
        self.ok_heartbeat(n2, now)
        now = 11
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n2)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n3)

        self.ok_heartbeat(n3, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        # ok don't hear from n2 for a while, should go missing
        now = 13
        self.ok_heartbeat(n1, now)

        now = 16
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n3)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n2)

        self.ok_heartbeat(n2, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 20

        # roll all nodes to terminated in IaaS
        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceState.TERMINATED, now)

        # been longer than missing window for n1 but shouldn't matter
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 30
        self.ok_heartbeat(n1, now)
        self.epum._doctor_appt(now)
        # not a zombie yet
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 31
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        # a heartbeat from the terminated n1 at this point makes it a zombie
        self.ok_heartbeat(n1, now)
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n1)

        now = 42
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n1)

    def test_error(self):
        """A heartbeat without failed processes marks the node MONITOR_ERROR.

        The error text sent by err_heartbeat() must be recorded on the
        instance, and the state must survive a following doctor run.
        """
        self.epum.initialize()
        self.epum.msg_reconfigure_domain(
            self.domain_owner, self.domain_name, self._dom_config())

        node_id = str(uuid.uuid4())

        # start from a healthy, running node
        t_ok = 1
        self.state.new_fake_instance_state(node_id, InstanceState.RUNNING, t_ok)
        self.ok_heartbeat(node_id, t_ok)
        self.epum._doctor_appt(t_ok)
        self.assertNodeState(InstanceHealthState.OK, node_id)

        # the error heartbeat takes effect before the doctor runs again
        t_err = 5
        self.err_heartbeat(node_id, t_err)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node_id)
        recorded = self.state.instances[node_id].errors
        self.assertEqual(len(recorded), 1)
        self.assertEqual(recorded[0], 'faiiiill')

        self.epum._doctor_appt(t_err)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node_id)

    def test_process_error(self):
        """A heartbeat listing failed processes marks the node PROCESS_ERROR.

        A second error heartbeat for the same process, sent without its
        'stderr' field, must leave exactly one recorded error that still
        carries the original stderr text.
        """
        self.epum.initialize()
        self.epum.msg_reconfigure_domain(
            self.domain_owner, self.domain_name, self._dom_config())

        node_id = str(uuid.uuid4())

        # start from a healthy, running node
        t_ok = 1
        self.state.new_fake_instance_state(node_id, InstanceState.RUNNING, t_ok)
        self.ok_heartbeat(node_id, t_ok)
        self.epum._doctor_appt(t_ok)
        self.assertNodeState(InstanceHealthState.OK, node_id)

        failed_proc = {'name': 'proc1', 'stderr': 'faaaaaail', 'state': 100,
                       'exitcode': -1, 'stop_timestamp': 25242}
        failed_procs = [failed_proc]

        t_first = 5
        self.err_heartbeat(node_id, t_first, failed_procs)
        self.epum._doctor_appt(t_first)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node_id)
        recorded = self.state.instances[node_id].errors
        self.assertEqual(len(recorded), 1)
        self.assertEqual(recorded[0]['stderr'], 'faaaaaail')

        # resend the same failure, this time without the stderr text
        failed_proc.pop('stderr')

        t_second = 8
        self.err_heartbeat(node_id, t_second, failed_procs)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node_id)
        recorded = self.state.instances[node_id].errors
        self.assertEqual(len(recorded), 1)
        self.assertEqual(recorded[0]['stderr'], 'faaaaaail')

    def test_defibulator(self):
        """An overdue node stays OK when the OU agent answers dump_state.

        The mock OU client is set to respond to dump_state. After the doctor
        run at 106 the test expects one dump_state call, one heartbeat sent,
        and the node still OK.
        """
        self.epum.initialize()
        dom_config = self._dom_config(health_init_time=100)
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, dom_config)

        self.ou_client.dump_state_called = 0
        self.ou_client.heartbeats_sent = 0
        self.ou_client.respond_to_dump_state = True

        # set up an instance that reached its iaas_state before the init time (100)
        n1 = "n1"

        # has been running for well longer than the missing timeout and we will
        # have not received a heartbeat. It shouldn't be marked OUT_OF_CONTACT
        # until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceState.RUNNING, 50,
                                           InstanceHealthState.OK)

        self.epum._doctor_appt(100)
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.epum._doctor_appt(105)
        self.assertNodeState(InstanceHealthState.OK, n1)

        # no dump_state request has been made up to this point
        self.assertEqual(0, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)
        self.epum._doctor_appt(106)
        # back to OK
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.assertEqual(1, self.ou_client.dump_state_called)
        self.assertEqual(1, self.ou_client.heartbeats_sent)

    def test_defibulator_failure(self):
        """An overdue node goes OUT_OF_CONTACT, then MISSING, with no reply.

        The mock OU client is set not to respond to dump_state. The test
        expects exactly one dump_state call and no heartbeats sent, with the
        node OUT_OF_CONTACT after the run at 106 and MISSING after 110.
        """
        self.epum.initialize()
        dom_config = self._dom_config(health_init_time=100)
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, dom_config)

        self.ou_client.dump_state_called = 0
        self.ou_client.heartbeats_sent = 0
        self.ou_client.respond_to_dump_state = False  # i.e., the node is really gone

        # set up an instance that reached its iaas_state before the init time (100)
        n1 = "Poor Yorick"

        # has been running for well longer than the missing timeout and we will
        # have not received a heartbeat. It shouldn't be marked OUT_OF_CONTACT
        # until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceState.RUNNING, 50,
                                           InstanceHealthState.OK)

        self.epum._doctor_appt(100)
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.epum._doctor_appt(105)
        self.assertNodeState(InstanceHealthState.OK, n1)

        # no dump_state request has been made up to this point
        self.assertEqual(0, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)
        self.epum._doctor_appt(106)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n1)
        self.assertEqual(1, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)

        # dump_state is not requested a second time
        self.epum._doctor_appt(110)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertEqual(1, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)

    # ----------------------------------------------------------------------------------

    def assertNodeState(self, state, *node_ids):
        for n in node_ids:
            self.assertEqual(state, self.state.instances[n].health)

    def ok_heartbeat(self, node_id, timestamp):
        """Deliver a heartbeat with state OK for ``node_id`` to the EPUM."""
        heartbeat = dict(node_id=node_id, timestamp=timestamp,
                         state=InstanceHealthState.OK)
        self.epum.msg_heartbeat(None, heartbeat, timestamp=timestamp)

    def err_heartbeat(self, node_id, timestamp, procs=None):
        """Deliver an error heartbeat for ``node_id`` to the EPUM.

        With a non-empty ``procs`` the heartbeat has state PROCESS_ERROR and
        carries them as 'failed_processes'. Otherwise it has state
        MONITOR_ERROR and a fixed 'error' string.
        """
        heartbeat = dict(node_id=node_id, timestamp=timestamp)
        if not procs:
            heartbeat['state'] = InstanceHealthState.MONITOR_ERROR
            heartbeat['error'] = 'faiiiill'
        else:
            heartbeat['state'] = InstanceHealthState.PROCESS_ERROR
            heartbeat['failed_processes'] = procs

        self.epum.msg_heartbeat(None, heartbeat, timestamp=timestamp)