class EPUManagementService(object):
    """EPU management service interface

    Wires an ``EPUManagement`` instance to a dashi connection: every public
    operation below is registered as a dashi handler in ``start()`` and
    forwards to the matching ``EPUManagement.msg_*`` method.

    See: https://confluence.oceanobservatories.org/display/syseng/CIAD+CEI+OV+Elastic+Computing
    """

    def __init__(self):
        """Load configuration, connect to dashi and build the EPUManagement core."""
        configs = ["service", "epumanagement"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)

        self.dashi = bootstrap.dashi_connect(self.CFG.epumanagement.service_name, self.CFG)

        # Goes through the property setter below; may legitimately be None.
        self.default_user = self.CFG.epumanagement.get('default_user')

        # TODO: create ION class here or depend on epuagent repo as a dep
        ou_client = MockOUAgentClient()

        statsd_cfg = self.CFG.get('statsd')

        if 'mock_provisioner' in self.CFG.epumanagement and \
                self.CFG.epumanagement['mock_provisioner']:
            prov_client = MockProvisionerClient()
        else:
            provisioner_topic = self.CFG.epumanagement.provisioner_service_name
            prov_client = ProvisionerClient(self.dashi, topic=provisioner_topic,
                                            statsd_cfg=statsd_cfg,
                                            client_name="epumanagement")

        self.service_name = self.CFG.epumanagement.get(EPUM_INITIALCONF_SERVICE_NAME,
                                                       EPUM_DEFAULT_SERVICE_NAME)
        self.proc_name = self.CFG.epumanagement.get(EPUM_INITIALCONF_PROC_NAME, None)

        self.store = get_epum_store(self.CFG, service_name=self.service_name,
                                    proc_name=self.proc_name)
        self.store.initialize()

        dtrs_client = DTRSClient(self.dashi, statsd_cfg=statsd_cfg,
                                 client_name=self.CFG.epumanagement.service_name)

        self.epumanagement = EPUManagement(self.CFG.epumanagement,
                                           SubscriberNotifier(self.dashi),
                                           prov_client, ou_client, dtrs_client,
                                           store=self.store, statsd_cfg=statsd_cfg)

        # hack to inject epum reference for mock prov client
        if isinstance(prov_client, MockProvisionerClient):
            prov_client._set_epum(self.epumanagement)

    def start(self):
        """Register the dashi handlers, load boot-time config and consume messages.

        Failures while loading an individual initial definition or domain are
        logged and do not abort startup. Blocks until ``dashi.cancel()`` is
        called.
        """
        epu.dashiproc.link_dashi_exceptions(self.dashi)

        # Registration order is kept identical to the historical explicit calls.
        operations = (
            self.subscribe_domain,
            self.unsubscribe_domain,
            self.add_domain,
            self.remove_domain,
            self.list_domains,
            self.describe_domain,
            self.reconfigure_domain,
            self.add_domain_definition,
            self.remove_domain_definition,
            self.list_domain_definitions,
            self.describe_domain_definition,
            self.update_domain_definition,
            self.ou_heartbeat,
            self.instance_info,
        )
        for operation in operations:
            self.dashi.handle(operation)

        # this may spawn some background threads
        self.epumanagement.initialize()

        # hack to load some domain definitions at boot. later this should be client driven.
        initial_definitions = self.CFG.epumanagement.initial_definitions
        for definition_id, definition in initial_definitions.items():
            log.info("Loading Domain Definition %s", definition_id)
            try:
                self.epumanagement.msg_add_domain_definition(definition_id, definition)
            except WriteConflictError:
                log.warning("Conflict while loading domain definition. It probably exists.",
                            exc_info=True)
            except Exception:
                log.exception("Failed to load Domain Definition %s", definition_id)

        # hack to load some domains at boot. later this should be client driven.
        initial_domains = self.CFG.epumanagement.initial_domains
        for domain_id, params in initial_domains.items():
            log.info("Loading Domain %s", domain_id)
            definition_id = params['definition']
            config = params['config']
            try:
                self.epumanagement.msg_add_domain(self.default_user, domain_id,
                                                  definition_id, config)
            except WriteConflictError:
                # Report the domain (not a definition) and say which one conflicted.
                log.warning("Conflict while loading domain %s. It probably exists.",
                            domain_id, exc_info=True)
            except Exception:
                log.exception("Failed to load Domain %s", domain_id)

        # blocks til dashi.cancel() is called
        self.dashi.consume()

    @property
    def default_user(self):
        """User that operations fall back to when no ``caller`` is supplied.

        Raises:
            UserNotPermittedError: if no (truthy) default user is configured.
        """
        if not self._default_user:
            msg = "Operation called for the default user, but none is defined."
            raise UserNotPermittedError(msg)
        else:
            return self._default_user

    @default_user.setter  # noqa
    def default_user(self, default_user):
        self._default_user = default_user

    def subscribe_domain(self, domain_id, subscriber_name, subscriber_op, caller=None):
        """Subscribe ``subscriber_name``/``subscriber_op`` to a domain on behalf of ``caller``."""
        caller = caller or self.default_user
        self.epumanagement.msg_subscribe_domain(caller, domain_id,
                                                subscriber_name, subscriber_op)

    def unsubscribe_domain(self, domain_id, subscriber_name, caller=None):
        """Remove ``subscriber_name`` from a domain on behalf of ``caller``."""
        caller = caller or self.default_user
        self.epumanagement.msg_unsubscribe_domain(caller, domain_id, subscriber_name)

    def list_domains(self, caller=None):
        """Return a list of domains in the system
        """
        caller = caller or self.default_user
        return self.epumanagement.msg_list_domains(caller=caller)

    def describe_domain(self, domain_id, caller=None):
        """Return a state structure for a domain, or None
        """
        caller = caller or self.default_user
        return self.epumanagement.msg_describe_domain(caller, domain_id)

    def add_domain(self, domain_id, definition_id, config, subscriber_name=None,
                   subscriber_op=None, caller=None):
        """Create a domain from ``definition_id`` and ``config``, optionally with a subscriber."""
        caller = caller or self.default_user
        self.epumanagement.msg_add_domain(caller, domain_id, definition_id, config,
                                          subscriber_name=subscriber_name,
                                          subscriber_op=subscriber_op)

    def remove_domain(self, domain_id, caller=None):
        """Remove a domain on behalf of ``caller``."""
        caller = caller or self.default_user
        self.epumanagement.msg_remove_domain(caller, domain_id)

    def reconfigure_domain(self, domain_id, config, caller=None):
        """Forward a new ``config`` for an existing domain on behalf of ``caller``."""
        caller = caller or self.default_user
        self.epumanagement.msg_reconfigure_domain(caller, domain_id, config)

    # Domain-definition operations take no caller; they are forwarded as-is.

    def list_domain_definitions(self):
        """Return whatever ``msg_list_domain_definitions`` reports."""
        return self.epumanagement.msg_list_domain_definitions()

    def describe_domain_definition(self, definition_id):
        """Return whatever ``msg_describe_domain_definition`` reports for ``definition_id``."""
        return self.epumanagement.msg_describe_domain_definition(definition_id)

    def add_domain_definition(self, definition_id, definition):
        """Forward a new domain definition to EPUManagement."""
        self.epumanagement.msg_add_domain_definition(definition_id, definition)

    def remove_domain_definition(self, definition_id):
        """Forward removal of a domain definition to EPUManagement."""
        self.epumanagement.msg_remove_domain_definition(definition_id)

    def update_domain_definition(self, definition_id, definition):
        """Forward an updated domain definition to EPUManagement."""
        self.epumanagement.msg_update_domain_definition(definition_id, definition)

    def ou_heartbeat(self, heartbeat):
        """Forward an OU agent heartbeat; no caller is passed."""
        self.epumanagement.msg_heartbeat(None, heartbeat)  # epum parses

    def instance_info(self, record):
        """Forward an instance state record; no caller is passed."""
        self.epumanagement.msg_instance_info(None, record)  # epum parses
class SubscriberTests(unittest.TestCase):
    """Checks which instance state changes EPUManagement reports to domain subscribers.

    All collaborators are in-memory mocks. Decision cycles are driven
    explicitly through ``self.epum._run_decisions()`` and provisioner updates
    are injected through ``self.epum.msg_instance_info()``.
    """

    # (subscriber_name, subscriber_op) pairs shared by the tests below.
    SUBSCRIBER_1 = ("subscriber01_name", "subscriber01_op")
    SUBSCRIBER_2 = ("subscriber02_name", "subscriber02_op")
    SUBSCRIBER_3 = ("subscriber03_name", "subscriber03_op")

    def setUp(self):
        # Mock mode:
        initial_conf = {EPUM_INITIALCONF_EXTERNAL_DECIDE: True}
        self.notifier = MockSubscriberNotifier()
        self.provisioner_client = MockProvisionerClient()
        self.dtrs_client = MockDTRSClient()
        self.ou_client = MockOUAgentClient()
        self.epum_store = LocalEPUMStore(EPUM_DEFAULT_SERVICE_NAME)
        self.epum_store.initialize()
        self.epum = EPUManagement(
            initial_conf, self.notifier, self.provisioner_client, self.ou_client,
            self.dtrs_client, store=self.epum_store)

        # For instance-state changes "from the provisioner"
        self.provisioner_client._set_epum(self.epum)

        # For heartbeats "from the OU instance"
        self.ou_client._set_epum(self.epum)

    # ------------------------------------------------------------------
    # Fixtures and shared steps
    # ------------------------------------------------------------------

    def _get_simplest_domain_definition(self):
        """Return a domain definition using SimplestEngine with health monitoring off."""
        engine_class = "epu.decisionengine.impls.simplest.SimplestEngine"
        general = {EPUM_CONF_ENGINE_CLASS: engine_class}
        health = {EPUM_CONF_HEALTH_MONITOR: False}
        return {EPUM_CONF_GENERAL: general, EPUM_CONF_HEALTH: health}

    def _config_simplest_domainconf(self, n_preserving, dt="00_dt_id"):
        """Get 'simplest' domain conf with specified NPreserving policy
        """
        engine = {CONF_PRESERVE_N: n_preserving, "epuworker_type": dt}
        return {EPUM_CONF_ENGINE: engine}

    def _reset(self):
        """Clear everything the mock notifier has recorded so far."""
        self.notifier.notify_by_name_called = 0
        self.notifier.receiver_names = []
        self.notifier.operations = []
        self.notifier.messages = []

    def _mock_checks(self, num_called, idx_check, subscriber_name, subscriber_op,
                     expected_state, expected_domain):
        """Verify the notifier's totals and the content of one recorded notification.

        ``num_called`` is the expected total number of notifications so far;
        ``idx_check`` selects which recorded notification is compared against
        the expected subscriber, state and domain.
        """
        self.assertEqual(self.notifier.notify_by_name_called, num_called)
        self.assertEqual(len(self.notifier.receiver_names), num_called)
        self.assertEqual(len(self.notifier.operations), num_called)
        self.assertEqual(len(self.notifier.messages), num_called)
        self.assertEqual(self.notifier.receiver_names[idx_check], subscriber_name)
        self.assertEqual(self.notifier.operations[idx_check], subscriber_op)
        message = self.notifier.messages[idx_check]
        self.assertIn("state", message)
        self.assertEqual(message["state"], expected_state)
        self.assertEqual(message["domain_id"], expected_domain)

    def _boot(self):
        """Reset the notifier, initialize EPUM and check an empty cycle launches nothing."""
        self._reset()
        self.epum.initialize()
        self.epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 0)

    def _add_definition(self):
        """Register the simplest-engine definition under the id "definition1"."""
        definition = self._get_simplest_domain_definition()
        self.epum.msg_add_domain_definition("definition1", definition)

    def _add_domain(self, domain_id, subscribers=(), dt="00_dt_id"):
        """Add a preserve-one domain owned by "owner" and subscribe each (name, op) pair."""
        self.epum.msg_add_domain("owner", domain_id, "definition1",
                                 self._config_simplest_domainconf(1, dt=dt))
        for name, op in subscribers:
            self.epum.msg_subscribe_domain("owner", domain_id, name, op)

    def _launch_domain1(self, subscribers=()):
        """Bring up "domain1" and verify one "00_dt_id" launch with no notification yet."""
        self._boot()
        self._add_definition()
        self._add_domain("domain1", subscribers)
        self.epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 1)
        self.assertEqual(len(self.provisioner_client.launched_instance_ids), 1)
        self.assertEqual(len(self.provisioner_client.deployable_types_launched), 1)
        self.assertEqual(self.provisioner_client.deployable_types_launched[0], "00_dt_id")
        self.assertEqual(self.notifier.notify_by_name_called, 0)

    def _send_state(self, state, index=0, **extra):
        """Simulate a provisioner update for the ``index``-th launched instance.

        ``extra`` entries (e.g. ``public_ip``, ``update_counter``) are added to
        the record next to ``node_id`` and ``state``.
        """
        content = {"node_id": self.provisioner_client.launched_instance_ids[index],
                   "state": state}
        content.update(extra)
        self.epum.msg_instance_info(None, content)

    def _assert_public_ip(self, domain, expected_ip):
        """Check the public_ip stored for the first launched instance."""
        node_id = self.provisioner_client.launched_instance_ids[0]
        self.assertEqual(domain.get_instance(node_id).public_ip, expected_ip)

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_ignore_subscriber(self):
        """A domain without subscribers produces no notification, even on RUNNING."""
        self._launch_domain1()

        # Simulate provisioner
        self._send_state(InstanceState.RUNNING)
        self.assertEqual(self.notifier.notify_by_name_called, 0)

    def test_one_subscriber(self):
        """The single subscriber is first notified when the instance reaches RUNNING."""
        subscriber_name, subscriber_op = self.SUBSCRIBER_1
        self._launch_domain1([self.SUBSCRIBER_1])

        # Simulate provisioner
        self._send_state(InstanceState.STARTED)
        self.assertEqual(self.notifier.notify_by_name_called, 0)

        # Running signal should be first notification
        self._send_state(InstanceState.RUNNING)
        self._mock_checks(1, 0, subscriber_name, subscriber_op,
                          InstanceState.RUNNING, "domain1")

    def test_multiple_subscribers(self):
        """All three subscribers of one domain are notified, in subscription order."""
        subscribers = [self.SUBSCRIBER_1, self.SUBSCRIBER_2, self.SUBSCRIBER_3]
        self._launch_domain1(subscribers)

        # Simulate provisioner
        self._send_state(InstanceState.STARTED)
        self.assertEqual(self.notifier.notify_by_name_called, 0)

        # Running signal should be first notification
        self._send_state(InstanceState.RUNNING)
        for idx, (name, op) in enumerate(subscribers):
            self._mock_checks(3, idx, name, op, InstanceState.RUNNING, "domain1")

    def test_multiple_subscribers_multiple_domains(self):
        """Three subscribers, two for one domain, one for another. One VM for each domain.
        """
        subscriber_name, subscriber_op = self.SUBSCRIBER_1
        subscriber2_name, subscriber2_op = self.SUBSCRIBER_2
        subscriber3_name, subscriber3_op = self.SUBSCRIBER_3

        self._boot()
        self._add_definition()
        self._add_domain("domain1", [self.SUBSCRIBER_1, self.SUBSCRIBER_2])

        # Subscriber 3 is for a different domain
        self._add_domain("domain2", [self.SUBSCRIBER_3], dt="01_dt_id")

        self.epum._run_decisions()
        self.assertEqual(self.provisioner_client.provision_count, 2)
        self.assertEqual(len(self.provisioner_client.launched_instance_ids), 2)
        self.assertEqual(len(self.provisioner_client.deployable_types_launched), 2)

        # Find out which order these were launched ...
        launched_types = self.provisioner_client.deployable_types_launched
        subscriber3_index = -1
        for i, dt_id in enumerate(launched_types):
            if dt_id == "01_dt_id":
                subscriber3_index = i
        self.assertNotEqual(subscriber3_index, -1)

        # Now we know which was provisioned first... give opposite index to other one
        subscriber1and2_index = 0 if subscriber3_index else 1

        self.assertEqual(launched_types[subscriber1and2_index], "00_dt_id")
        self.assertEqual(launched_types[subscriber3_index], "01_dt_id")

        # No notifications until RUNNING
        self.assertEqual(self.notifier.notify_by_name_called, 0)

        # Simulate provisioner update for BOTH VMs launched
        self._send_state(InstanceState.STARTED, index=subscriber1and2_index)
        self._send_state(InstanceState.STARTED, index=subscriber3_index)
        self.assertEqual(self.notifier.notify_by_name_called, 0)

        # Running signal should be first notification, send RUNNING just for
        # 01_dt_id instance (subscriber 3)
        self._send_state(InstanceState.RUNNING, index=subscriber3_index)
        self._mock_checks(1, 0, subscriber3_name, subscriber3_op,
                          InstanceState.RUNNING, "domain2")

        # Now for 00_dt_id instance (subscribers 1 and 2)
        self._send_state(InstanceState.RUNNING, index=subscriber1and2_index)
        self._mock_checks(3, 1, subscriber_name, subscriber_op,
                          InstanceState.RUNNING, "domain1")
        self._mock_checks(3, 2, subscriber2_name, subscriber2_op,
                          InstanceState.RUNNING, "domain1")

    def _fail_setup(self):
        """Bring "domain1" with one subscriber up to RUNNING (one notification sent)."""
        self._launch_domain1([self.SUBSCRIBER_1])

        # Simulate provisioner
        self._send_state(InstanceState.STARTED)
        self.assertEqual(self.notifier.notify_by_name_called, 0)

        # Running signal should be first notification
        self._send_state(InstanceState.RUNNING)

    # The "test_fail*" methods are for checking on notifications after RUNNING. If the provisioner
    # doesn't 'increase' states, EPUM throws them out, no need to test that scenario.

    def _check_failure_notification(self, state):
        """After RUNNING, report ``state`` and expect a second, FAILED notification."""
        subscriber_name, subscriber_op = self.SUBSCRIBER_1
        self._fail_setup()
        self._mock_checks(1, 0, subscriber_name, subscriber_op,
                          InstanceState.RUNNING, "domain1")

        # Failing
        self._send_state(state)

        # All non-RUNNING notifications should be FAILED
        self._mock_checks(2, 1, subscriber_name, subscriber_op,
                          InstanceState.FAILED, "domain1")

    def test_fail_650(self):
        self._check_failure_notification(InstanceState.RUNNING_FAILED)

    def test_fail_700(self):
        self._check_failure_notification(InstanceState.TERMINATING)

    def test_fail_800(self):
        self._check_failure_notification(InstanceState.TERMINATED)

    def test_fail_900(self):
        self._check_failure_notification(InstanceState.FAILED)

    def test_updated_node_ip(self):
        """A changed public_ip on a RUNNING instance is stored and re-notified.

        Updates with a stale ``update_counter`` or a state going backwards must
        neither notify again nor overwrite the stored IP.
        """
        subscriber_name, subscriber_op = self.SUBSCRIBER_1
        self._launch_domain1([self.SUBSCRIBER_1])

        domain = self.epum_store.get_domain("owner", "domain1")

        self._send_state(InstanceState.STARTED, update_counter=1)

        self._send_state(InstanceState.RUNNING, public_ip="vm-1234", update_counter=2)
        self._mock_checks(1, 0, subscriber_name, subscriber_op,
                          InstanceState.RUNNING, "domain1")
        self._assert_public_ip(domain, "vm-1234")

        self._send_state(InstanceState.RUNNING, public_ip="1.2.3.4", update_counter=3)
        self._mock_checks(2, 0, subscriber_name, subscriber_op,
                          InstanceState.RUNNING, "domain1")
        self._assert_public_ip(domain, "1.2.3.4")

        # Check that sequential update_counter is respected
        self._send_state(InstanceState.RUNNING, public_ip="localhost", update_counter=2)
        self._mock_checks(2, 0, subscriber_name, subscriber_op,
                          InstanceState.RUNNING, "domain1")
        self._assert_public_ip(domain, "1.2.3.4")

        # A state going backwards should not happen, but double-check
        self._send_state(InstanceState.STARTED, public_ip="localhost", update_counter=4)
        self._mock_checks(2, 0, subscriber_name, subscriber_op,
                          InstanceState.RUNNING, "domain1")
        self._assert_public_ip(domain, "1.2.3.4")
class EPUManagementBasicTests(unittest.TestCase): """ Tests that cover basic things like running a decision engine cycle and making sure a VM is requested, etc. """ def setUp(self): # Mock mode: initial_conf = {EPUM_INITIALCONF_EXTERNAL_DECIDE: True} self.notifier = MockSubscriberNotifier() self.provisioner_client = MockProvisionerClient() self.ou_client = MockOUAgentClient() self.dtrs_client = MockDTRSClient() self.epum_store = LocalEPUMStore(EPUM_DEFAULT_SERVICE_NAME) self.epum_store.initialize() self.epum = EPUManagement( initial_conf, self.notifier, self.provisioner_client, self.ou_client, self.dtrs_client, store=self.epum_store) # For instance-state changes "from the provisioner" self.provisioner_client._set_epum(self.epum) # For heartbeats "from the OU instance" self.ou_client._set_epum(self.epum) def _config_mock1(self): """Keeps increment count """ engine = {CONF_PRESERVE_N: 1} return {EPUM_CONF_ENGINE: engine} def _definition_mock1(self): general = {EPUM_CONF_ENGINE_CLASS: MOCK_PKG + ".MockDecisionEngine01"} health = {EPUM_CONF_HEALTH_MONITOR: False} return {EPUM_CONF_GENERAL: general, EPUM_CONF_HEALTH: health} def _definition_mock2(self): """decide and reconfigure fail """ definition = self._definition_mock1() definition[EPUM_CONF_GENERAL] = {EPUM_CONF_ENGINE_CLASS: MOCK_PKG + ".MockDecisionEngine02"} return definition def _definition_mock3(self): """uses Deferred """ definition = self._definition_mock1() definition[EPUM_CONF_GENERAL] = {EPUM_CONF_ENGINE_CLASS: MOCK_PKG + ".MockDecisionEngine03"} return definition def _get_simplest_domain_definition(self): engine_class = "epu.decisionengine.impls.simplest.SimplestEngine" general = {EPUM_CONF_ENGINE_CLASS: engine_class} health = {EPUM_CONF_HEALTH_MONITOR: False} return {EPUM_CONF_GENERAL: general, EPUM_CONF_HEALTH: health} def _config_simplest_domainconf(self, n_preserving): """Get 'simplest' domain conf with specified NPreserving policy """ engine = {CONF_PRESERVE_N: n_preserving} return 
{EPUM_CONF_ENGINE: engine} def _config_simplest_chef_domainconf(self, n_preserving, chef_credential): """Get 'simplest' domain conf with specified NPreserving policy """ engine = {CONF_PRESERVE_N: n_preserving} general = {EPUM_CONF_CHEF_CREDENTIAL: chef_credential} return {EPUM_CONF_ENGINE: engine, EPUM_CONF_GENERAL: general} def _get_sensor_domain_definition(self): engine_class = "epu.decisionengine.impls.sensor.SensorEngine" general = {EPUM_CONF_ENGINE_CLASS: engine_class} health = {EPUM_CONF_HEALTH_MONITOR: False} return {EPUM_CONF_GENERAL: general, EPUM_CONF_HEALTH: health} def _config_sensor_domainconf(self, minimum_n): """Get 'sensor' domain conf with mock aggregator """ engine = {CONF_SENSOR_TYPE: 'mockcloudwatch', CONF_IAAS_SITE: 'fake', CONF_IAAS_ALLOCATION: 'also.fake', 'deployable_type': 'fake', 'minimum_vms': minimum_n, 'metric': 'load', 'monitor_sensors': ['load', ], 'monitor_domain_sensors': ['queuelen', ], 'sample_function': 'Average'} return {EPUM_CONF_ENGINE: engine} def test_engine_decide(self): """ Verify decide is called at expected time """ self.epum.initialize() definition = self._definition_mock1() config = self._config_mock1() owner = "owner1" domain_id = "testing123" definition_id = "def123" self.epum.msg_add_domain_definition(definition_id, definition) self.epum.msg_add_domain(owner, domain_id, definition_id, config) self.epum._run_decisions() # digging into internal structure to get engine instances engine = self.epum.decider.engines[(owner, domain_id)] self.assertNotEqual(engine, None) self.assertEqual(engine.initialize_count, 1) self.assertEqual(engine.initialize_conf[CONF_PRESERVE_N], 1) self.assertEqual(engine.decide_count, 1) self.epum._run_decisions() self.assertEqual(engine.decide_count, 2) def _compare_configs(self, c1, c2): self.assertEqual(set(c1.keys()), set(c2.keys())) self.assertEqual(c1[EPUM_CONF_GENERAL], c2[EPUM_CONF_GENERAL]) self.assertEqual(c1[EPUM_CONF_HEALTH], c2[EPUM_CONF_HEALTH]) 
self.assertEqual(c1[EPUM_CONF_ENGINE], c2[EPUM_CONF_ENGINE]) def test_domain_query(self): """Verify domain query operations work """ self.epum.initialize() caller = "asterix" domain1_definition_name = "onedomaindef" domain1_definition = self._definition_mock1() domain1_config = self._config_mock1() domain1_name = "onedomain" domain2_definition_name = "twodomaindef" domain2_definition = self._get_simplest_domain_definition() domain2_config = self._config_simplest_domainconf(1) domain2_name = "twodomain" domains = self.epum.msg_list_domains(caller) self.assertEqual(domains, []) self.epum.msg_add_domain_definition(domain1_definition_name, domain1_definition) self.epum.msg_add_domain(caller, domain1_name, domain1_definition_name, domain1_config) domains = self.epum.msg_list_domains(caller) self.assertEqual(domains, [domain1_name]) domain1_desc = self.epum.msg_describe_domain(caller, domain1_name) self.assertEqual(domain1_desc['name'], domain1_name) log.debug("domain1 desc: %s", domain1_desc) merged_config = copy.copy(domain1_definition) merged_config.update(domain1_config) self._compare_configs(merged_config, domain1_desc['config']) self.assertEqual(domain1_desc['instances'], []) self.epum.msg_add_domain_definition(domain2_definition_name, domain2_definition) self.epum.msg_add_domain(caller, domain2_name, domain2_definition_name, domain2_config) domains = self.epum.msg_list_domains(caller) self.assertEqual(set(domains), set([domain1_name, domain2_name])) # this will cause domain2 to launch an instance self.epum._run_decisions() domain2_desc = self.epum.msg_describe_domain(caller, domain2_name) self.assertEqual(domain2_desc['name'], domain2_name) merged_config = copy.copy(domain2_definition) merged_config.update(domain2_config) self._compare_configs(merged_config, domain2_desc['config']) self.assertEqual(len(domain2_desc['instances']), 1) # just make sure it looks roughly like a real instance instance = domain2_desc['instances'][0] self.assertIn("instance_id", instance) 
self.assertIn("state", instance) def test_sensor_data(self): self.epum.initialize() caller = "asterix" domain_definition_name = "twodomaindef" domain_definition = self._get_sensor_domain_definition() domain_config = self._config_sensor_domainconf(1) domain_name = "twodomain" domains = self.epum.msg_list_domains(caller) self.assertEqual(domains, []) self.epum.msg_add_domain_definition(domain_definition_name, domain_definition) self.epum.msg_add_domain(caller, domain_name, domain_definition_name, domain_config) domains = self.epum.msg_list_domains(caller) self.assertEqual(domains, [domain_name]) domain_desc = self.epum.msg_describe_domain(caller, domain_name) self.assertEqual(domain_desc['name'], domain_name) log.debug("domain desc: %s", domain_desc) merged_config = copy.copy(domain_definition) merged_config.update(domain_config) self._compare_configs(merged_config, domain_desc['config']) self.assertEqual(domain_desc['instances'], []) # this will cause domain to launch an instance self.epum._run_decisions() domain_desc = self.epum.msg_describe_domain(caller, domain_name) self.assertEqual(domain_desc['name'], domain_name) merged_config = copy.copy(domain_definition) merged_config.update(domain_config) self._compare_configs(merged_config, domain_desc['config']) self.assertEqual(len(domain_desc['instances']), 1) # just make sure it looks roughly like a real instance instance = domain_desc['instances'][0] self.assertIn("instance_id", instance) self.assertIn("state", instance) self.assertNotIn("sensor_data", instance) self.epum._run_decisions() domain_desc = self.epum.msg_describe_domain(caller, domain_name) self.assertEqual(domain_desc['name'], domain_name) merged_config = copy.copy(domain_definition) merged_config.update(domain_config) self._compare_configs(merged_config, domain_desc['config']) self.assertEqual(len(domain_desc['instances']), 1) # just make sure it now has sensor_data self.assertIn("sensor_data", domain_desc) self.assertIn("queuelen", 
domain_desc['sensor_data']) self.assertIn(Statistics.SERIES, domain_desc['sensor_data']['queuelen']) instance = domain_desc['instances'][0] self.assertIn("instance_id", instance) self.assertIn("state", instance) self.assertIn("sensor_data", instance) self.assertIn("load", instance['sensor_data']) self.assertIn(Statistics.SERIES, instance['sensor_data']['load']) def test_engine_reconfigure(self): """ Verify reconfigure is called after a 'worker' alters the domain config """ self.epum.initialize() domain_definition = self._definition_mock1() domain_config = self._config_mock1() owner = "emily" definition_id = "def123" domain_name1 = "testing123" domain_name2 = "testing789" self.epum.msg_add_domain_definition(definition_id, domain_definition) self.epum.msg_add_domain(owner, domain_name1, definition_id, domain_config) self.epum.msg_add_domain(owner, domain_name2, definition_id, domain_config) self.epum._run_decisions() # digging into internal structure to get engine instances domain_engine1 = self.epum.decider.engines[(owner, domain_name1)] domain_engine2 = self.epum.decider.engines[(owner, domain_name2)] self.assertEqual(domain_engine1.decide_count, 1) self.assertEqual(domain_engine2.decide_count, 1) # reconfigure test self.assertEqual(domain_engine1.reconfigure_count, 0) self.assertEqual(domain_engine2.reconfigure_count, 0) domain_config2 = {EPUM_CONF_ENGINE: {CONF_PRESERVE_N: 2}} self.epum.msg_reconfigure_domain(owner, domain_name1, domain_config2) # should not take effect immediately, a reconfigure is external msg handled by reactor worker self.assertEqual(domain_engine1.reconfigure_count, 0) self.assertEqual(domain_engine2.reconfigure_count, 0) self.epum._run_decisions() # now it should have happened, after a decision cycle, but only to domain_name1 self.assertEqual(domain_engine1.reconfigure_count, 1) self.assertEqual(domain_engine2.reconfigure_count, 0) # should not happen again self.epum._run_decisions() self.assertEqual(domain_engine1.reconfigure_count, 1) 
self.assertEqual(domain_engine2.reconfigure_count, 0) def test_basic_npreserving(self): """ Create one domain with NPreserving=2 policy. Verify two instances are launched on the first decision cycle. """ self.epum.initialize() domain_config = self._config_simplest_domainconf(2) definition = {} self.epum.msg_add_domain_definition("definition1", definition) self.epum.msg_add_domain("owner1", "testing123", "definition1", domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) def test_basic_chef_domain(self): self.epum.initialize() domain_config = self._config_simplest_chef_domainconf(2, "chef1") definition = {} self.epum.msg_add_domain_definition("definition1", definition) self.epum.msg_add_domain("owner1", "testing123", "definition1", domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) # ensure chef credential name is passed through in provisioner vars self.assertEqual(self.provisioner_client.launches[0]['vars']['chef_credential'], 'chef1') self.assertEqual(self.provisioner_client.launches[1]['vars']['chef_credential'], 'chef1') def test_reconfigure_npreserving(self): """ Create one domain with NPreserving=2 policy. Verify two instances are launched on the first decision cycle. Reconfigure with NPreserving=4 policy. Verify two more instances are launched on next decision cycle. Reconfigure with NPreserving=0 policy. Verify four instances are terminated on next decision cycle. 
""" self.epum.initialize() owner = "opwner1" definition_id = "def123" definition = self._get_simplest_domain_definition() domain_name = "testing123" domain_config = self._config_simplest_domainconf(2) self.epum.msg_add_domain_definition(definition_id, definition) self.epum.msg_add_domain(owner, domain_name, definition_id, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) self.assertEqual(self.provisioner_client.terminate_node_count, 0) domain_config = self._config_simplest_domainconf(4) self.epum.msg_reconfigure_domain(owner, domain_name, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 4) self.assertEqual(self.provisioner_client.terminate_node_count, 0) domain_config = self._config_simplest_domainconf(0) self.epum.msg_reconfigure_domain(owner, domain_name, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 4) self.assertEqual(self.provisioner_client.terminate_node_count, 4) def test_decider_leader_disable(self): """ Create one domain with NPreserving=2 policy. Verify two instances are launched on the first decision cycle. Change to NPreserving=1, verify that one is terminated on second decision cycle Disable leader via epum internals Change to NPreserving=4, verify that nothing happened. Enable leader via epum internals Previous reconfiguration will be recognized This will only work in this in-memory situation, otherwise another EPUM worker becomes the decider and will respond to reconfigurations. 
""" self.epum.initialize() definition_name = "def123" domain_definition = self._get_simplest_domain_definition() owner = "opwner1" domain_name = "testing123" domain_config = self._config_simplest_domainconf(2) self.epum.msg_add_domain_definition(definition_name, domain_definition) self.epum.msg_add_domain(owner, domain_name, definition_name, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) self.assertEqual(self.provisioner_client.terminate_node_count, 0) domain_config = self._config_simplest_domainconf(1) self.epum.msg_reconfigure_domain(owner, domain_name, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) self.assertEqual(self.provisioner_client.terminate_node_count, 1) # digging into internal structure to disable leader self.epum.epum_store._change_decider(False) # nothing should happen now, should stay provision=2, terminate=1 domain_config = self._config_simplest_domainconf(4) self.epum.msg_reconfigure_domain(owner, domain_name, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) self.assertEqual(self.provisioner_client.terminate_node_count, 1) # digging into internal structure to enable leader self.epum.epum_store._change_decider(True) # previous reconfiguration (preserve 4) should be recognized if decision cycle runs self.epum._run_decisions() # 3 more provisions to take from N=1 to N=4 (making 5 total provisions) self.assertEqual(self.provisioner_client.provision_count, 5) self.assertEqual(self.provisioner_client.terminate_node_count, 1) def test_instance_lookup(self): """ Create two domains, run NPreserving=1 in each of them. Lookup by instance_id and make sure the right domain is returned to the caller. Some incoming service messages, like heartbeats, only have the instance_id to go on (not which domain it belongs to). 
""" self.epum.initialize() definition_id = "definition1" definition = self._get_simplest_domain_definition() domain_config = self._config_simplest_domainconf(1) owner = "owner1" domain_name1 = "domain1" domain_name2 = "domain2" self.epum.msg_add_domain_definition(definition_id, definition) self.epum.msg_add_domain(owner, domain_name1, definition_id, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 1) self.assertEqual(len(self.provisioner_client.launched_instance_ids), 1) via_domain1 = self.provisioner_client.launched_instance_ids[0] self.epum.msg_add_domain(owner, domain_name2, definition_id, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) self.assertEqual(len(self.provisioner_client.launched_instance_ids), 2) via_domain2 = self.provisioner_client.launched_instance_ids[1] domain1 = self.epum.epum_store.get_domain_for_instance_id(via_domain1) domain2 = self.epum.epum_store.get_domain_for_instance_id(via_domain2) self.assertEqual(domain1.domain_id, domain_name1) self.assertEqual(domain2.domain_id, domain_name2) def test_decider_retries(self): self.epum.initialize() definition_id = "definition1" definition = self._get_simplest_domain_definition() domain_config = self._config_simplest_domainconf(2) owner = "owner1" domain_name = "domain1" self.epum.msg_add_domain_definition(definition_id, definition) self.epum.msg_add_domain(owner, domain_name, definition_id, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) self.assertEqual(len(self.provisioner_client.launched_instance_ids), 2) # sneak into decider internals and patch out retry interval, to speed test for controls in self.epum.decider.controls.values(): controls._retry_seconds = 0.5 # rerun decisions. 
no retries should happen self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) self.assertEqual(len(self.provisioner_client.launched_instance_ids), 2) # provide REQUESTED state for first instance. should not retried self.provisioner_client.report_node_state( InstanceState.REQUESTED, self.provisioner_client.launched_instance_ids[0]) # wait until a retry should be expected time.sleep(0.6) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 3) self.assertEqual(len(set(self.provisioner_client.launched_instance_ids)), 2) self.assertEqual(self.provisioner_client.launched_instance_ids[1], self.provisioner_client.launched_instance_ids[2]) # now kill the instances. domain_config = self._config_simplest_domainconf(0) self.epum.msg_reconfigure_domain(owner, domain_name, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 3) self.assertEqual(self.provisioner_client.terminate_node_count, 2) self.assertEqual(len(self.provisioner_client.terminated_instance_ids), 2) # should be no retries immediately self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 3) self.assertEqual(self.provisioner_client.terminate_node_count, 2) self.assertEqual(len(self.provisioner_client.terminated_instance_ids), 2) # provide TERMINATED state for first instance. 
should not retried self.provisioner_client.report_node_state( InstanceState.TERMINATED, self.provisioner_client.terminated_instance_ids[0]) # wait until a retry should be expected time.sleep(0.6) self.epum._run_decisions() self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 3) self.assertEqual(self.provisioner_client.terminate_node_count, 3) self.assertEqual(len(self.provisioner_client.terminated_instance_ids), 3) self.assertEqual(self.provisioner_client.terminated_instance_ids[1], self.provisioner_client.terminated_instance_ids[2]) def test_failing_engine_decide(self): """Exceptions during decide cycle should not affect EPUM. """ self.epum.initialize() fail_definition = self._definition_mock2() fail_definition_id = "fail_definition" config = self._config_mock1() self.epum.msg_add_domain_definition(fail_definition_id, fail_definition) self.epum.msg_add_domain("joeowner", "fail_domain", fail_definition_id, config) self.epum._run_decisions() # digging into internal structure to get engine instance domain_engine = self.epum.decider.engines[("joeowner", "fail_domain")] self.assertEqual(domain_engine.decide_count, 1) def test_failing_engine_reconfigure(self): """Exceptions during engine reconfigure should not affect EPUM. 
""" self.epum.initialize() fail_definition = self._definition_mock2() fail_definition_id = "fail_definition" config = self._config_mock1() self.epum.msg_add_domain_definition(fail_definition_id, fail_definition) self.epum.msg_add_domain("owner", "fail_domain", fail_definition_id, config) self.epum._run_decisions() # digging into internal structure to get engine instance domain_engine = self.epum.decider.engines[("owner", "fail_domain")] self.assertEqual(domain_engine.decide_count, 1) self.assertEqual(domain_engine.reconfigure_count, 0) config2 = {EPUM_CONF_ENGINE: {CONF_PRESERVE_N: 2}} self.epum.msg_reconfigure_domain("owner", "fail_domain", config2) self.epum._run_decisions() self.assertEqual(domain_engine.decide_count, 2) self.assertEqual(domain_engine.reconfigure_count, 1) def test_remove_domain(self): """ Ensure instances are killed when domain is removed """ self.epum.initialize() domain_config = self._config_simplest_domainconf(2) definition_id = "def123" definition = self._get_simplest_domain_definition() self.epum.msg_add_domain_definition(definition_id, definition) self.epum.msg_add_domain("owner1", "testing123", definition_id, domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 2) self.epum.msg_remove_domain("owner1", "testing123") self.epum._run_decisions() self.assertEqual(self.provisioner_client.terminate_node_count, 2) def test_multiuser(self): """Ensure that multiuser checks are working """ permitted_user = "******" disallowed_user = "******" self.epum.initialize() # TODO: test adding with a dt that user doesn't own definition_id = "def123" definition = self._definition_mock1() domain_config = self._config_mock1() domain_name = "testing123" self.epum.msg_add_domain_definition(definition_id, definition) self.epum.msg_add_domain(permitted_user, domain_name, definition_id, domain_config) # Test describe not_found_error = False try: self.epum.msg_describe_domain(disallowed_user, domain_name) except 
NotFoundError: not_found_error = True msg = "Non-permitted user was able to describe an domain he didn't own!" self.assertTrue(not_found_error, msg) self.epum.msg_describe_domain(permitted_user, domain_name) # Test list disallowed_domains = self.epum.msg_list_domains(disallowed_user) self.assertEqual(len(disallowed_domains), 0) permitted_domains = self.epum.msg_list_domains(permitted_user) self.assertEqual(len(permitted_domains), 1) # Test reconfigure new_config = {} not_found_error = False try: self.epum.msg_reconfigure_domain(disallowed_user, domain_name, new_config) except NotFoundError: not_found_error = True msg = "Non-permitted user was able to reconfigure an domain he didn't own!" self.assertTrue(not_found_error, msg) self.epum.msg_reconfigure_domain(permitted_user, domain_name, new_config) # TODO: test adding with a dt that user doesn't own # Test Remove not_found_error = False try: self.epum.msg_remove_domain(disallowed_user, domain_name) except NotFoundError: not_found_error = True msg = "Non-permitted user was able to remove an domain he didn't own!" 
self.assertTrue(not_found_error, msg) self.epum.msg_remove_domain(permitted_user, domain_name) def test_definitions(self): self.epum.initialize() definition1_name = "definition1" definition1 = self._definition_mock1() definition2_name = "definition2" definition2 = self._definition_mock2() self.epum.msg_add_domain_definition(definition1_name, definition1) # Trying to add a domain definition with the same name should raise an # exception try: self.epum.msg_add_domain_definition(definition1_name, definition2) except WriteConflictError: pass else: self.fail("expected WriteConflictError") self.epum.msg_add_domain_definition(definition2_name, definition2) definition_one = self.epum.msg_describe_domain_definition(definition1_name) self.assertEqual(definition_one['name'], definition1_name) self.assertEqual(definition_one['definition'], definition1) definition_two = self.epum.msg_describe_domain_definition(definition2_name) self.assertEqual(definition_two['name'], definition2_name) self.assertEqual(definition_two['definition'], definition2) definitions = self.epum.msg_list_domain_definitions() self.assertEqual(len(definitions), 2) self.assertIn(definition1_name, definitions) self.assertIn(definition2_name, definitions) self.epum.msg_remove_domain_definition(definition1_name) try: self.epum.msg_describe_domain_definition(definition1_name) except NotFoundError: pass else: self.fail("expected NotFoundError") try: self.epum.msg_remove_domain_definition(definition1_name) except NotFoundError: pass else: self.fail("expected NotFoundError") self.epum.msg_update_domain_definition(definition2_name, definition1) definition_two = self.epum.msg_describe_domain_definition(definition2_name) self.assertEqual(definition_two['name'], definition2_name) self.assertEqual(definition_two['definition'], definition1) def test_config_validation(self): caller = "asterix" self.epum.initialize() definition_name = "def123" definition = self._get_simplest_domain_definition() wrong_config = 
{EPUM_CONF_ENGINE: {}} ok_config = self._config_simplest_domainconf(1) self.epum.msg_add_domain_definition(definition_name, definition) # Trying to add a domain using a config with missing parameters should # raise an exception try: self.epum.msg_add_domain(caller, "domain", definition_name, wrong_config) except ValueError: pass else: self.fail("expected ValueError") self.epum.msg_add_domain(caller, "domain", definition_name, ok_config) def test_engine_config_doc(self): self.epum.initialize() definition_name = "def123" definition = self._get_simplest_domain_definition() self.epum.msg_add_domain_definition(definition_name, definition) desc = self.epum.msg_describe_domain_definition(definition_name) self.assertTrue("documentation" in desc) def test_reaper(self): self.epum.initialize() config = self._config_mock1() owner = "owner1" domain_id = "testing123" # inject the FakeState instance directly instead of using msg_add_domain() self.state = FakeDomainStore(owner, domain_id, config) self.epum.epum_store.domains[(owner, domain_id)] = self.state now = time.time() # One running self.state.new_fake_instance_state("n1", InstanceState.RUNNING, now - EPUM_RECORD_REAPING_DEFAULT_MAX_AGE - 1) # Three in terminal state and outdated self.state.new_fake_instance_state( "n2", InstanceState.TERMINATED, now - EPUM_RECORD_REAPING_DEFAULT_MAX_AGE - 1) self.state.new_fake_instance_state("n3", InstanceState.REJECTED, now - EPUM_RECORD_REAPING_DEFAULT_MAX_AGE - 1) self.state.new_fake_instance_state("n4", InstanceState.FAILED, now - EPUM_RECORD_REAPING_DEFAULT_MAX_AGE - 1) # Three in terminal state and not yet outdated self.state.new_fake_instance_state( "n5", InstanceState.TERMINATED, now - EPUM_RECORD_REAPING_DEFAULT_MAX_AGE + 60) self.state.new_fake_instance_state("n6", InstanceState.REJECTED, now - EPUM_RECORD_REAPING_DEFAULT_MAX_AGE + 60) self.state.new_fake_instance_state("n7", InstanceState.FAILED, now - EPUM_RECORD_REAPING_DEFAULT_MAX_AGE + 60) self.epum._run_reaper_loop() 
instances = self.state.get_instance_ids() self.assertEqual(len(instances), 4) self.assertIn("n1", instances) self.assertIn("n5", instances) self.assertIn("n6", instances) self.assertIn("n7", instances) def test_instance_update_conflict_1(self): self.epum.initialize() domain_config = self._config_simplest_domainconf(1) definition = {} self.epum.msg_add_domain_definition("definition1", definition) self.epum.msg_add_domain("owner1", "testing123", "definition1", domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 1) domain = self.epum_store.get_domain("owner1", "testing123") instance_id = self.provisioner_client.launched_instance_ids[0] self.provisioner_client.launches[0]['launch_id'] sneaky_msg = dict(node_id=instance_id, state=InstanceState.PENDING) # patch in a function that sneaks in an instance record update just # before a requested update. This simulates the case where two EPUM # workers are competing to update the same instance. original_new_instance_state = domain.new_instance_state patch_called = threading.Event() def patched_new_instance_state(content, timestamp=None, previous=None): patch_called.set() # unpatch ourself first so we don't recurse forever domain.new_instance_state = original_new_instance_state domain.new_instance_state(sneaky_msg, previous=previous) return domain.new_instance_state(content, timestamp=timestamp, previous=previous) domain.new_instance_state = patched_new_instance_state # send our "real" update. 
should get a conflict msg = dict(node_id=instance_id, state=InstanceState.STARTED) self.epum.msg_instance_info("owner1", msg) assert patch_called.is_set() instance = domain.get_instance(instance_id) self.assertEqual(instance.state, InstanceState.STARTED) def test_instance_update_conflict_2(self): self.epum.initialize() domain_config = self._config_simplest_domainconf(1) definition = {} self.epum.msg_add_domain_definition("definition1", definition) self.epum.msg_add_domain("owner1", "testing123", "definition1", domain_config) self.epum._run_decisions() self.assertEqual(self.provisioner_client.provision_count, 1) domain = self.epum_store.get_domain("owner1", "testing123") instance_id = self.provisioner_client.launched_instance_ids[0] self.provisioner_client.launches[0]['launch_id'] sneaky_msg = dict(node_id=instance_id, state=InstanceState.STARTED) # patch in a function that sneaks in an instance record update just # before a requested update. This simulates the case where two EPUM # workers are competing to update the same instance. original_new_instance_state = domain.new_instance_state patch_called = threading.Event() def patched_new_instance_state(content, timestamp=None, previous=None): patch_called.set() # unpatch ourself first so we don't recurse forever domain.new_instance_state = original_new_instance_state domain.new_instance_state(sneaky_msg, previous=previous) return domain.new_instance_state(content, timestamp=timestamp, previous=previous) domain.new_instance_state = patched_new_instance_state # send our "real" update. should get a conflict msg = dict(node_id=instance_id, state=InstanceState.PENDING) self.epum.msg_instance_info(None, msg) assert patch_called.is_set() # in this case the sneaky message (STARTED) should win because it is # the later state instance = domain.get_instance(instance_id) self.assertEqual(instance.state, InstanceState.STARTED)
class HeartbeatMonitorTests(unittest.TestCase):
    """Health-monitoring tests driven by fake heartbeats and fake timestamps.

    Each test injects instance records into a FakeDomainStore, feeds
    heartbeats through ``msg_heartbeat`` and advances the health check by
    calling ``_doctor_appt`` with explicit timestamps.
    """

    def setUp(self):
        """Build an EPUManagement wired to mock clients and one fake domain."""
        self.domain_name = "epuX"
        self.domain_owner = "david"
        self.domain_key = (self.domain_owner, self.domain_name)
        config = self._dom_config(health_init_time=100)
        self.state = FakeDomainStore(self.domain_owner, self.domain_name, config)

        initial_conf = {EPUM_INITIALCONF_EXTERNAL_DECIDE: True}
        self.notifier = MockSubscriberNotifier()
        self.provisioner_client = MockProvisionerClient()
        self.dtrs_client = MockDTRSClient()
        self.ou_client = MockOUAgentClient()
        self.epum_store = LocalEPUMStore(EPUM_DEFAULT_SERVICE_NAME)
        self.epum_store.initialize()
        self.epum = EPUManagement(
            initial_conf, self.notifier, self.provisioner_client, self.ou_client,
            self.dtrs_client, store=self.epum_store)
        self.provisioner_client._set_epum(self.epum)
        self.ou_client._set_epum(self.epum)

        # inject the FakeState instance directly instead of using msg_add_domain()
        self.epum.epum_store.domains[self.domain_key] = self.state

    def _dom_config(self, health_init_time=0):
        """Return a domain config with health monitoring enabled.

        :param health_init_time: value stored under TESTCONF_HEALTH_INIT_TIME
        :returns: dict with the general, engine and health config sections
        """
        general = {EPUM_CONF_ENGINE_CLASS: MOCK_PKG + ".MockDecisionEngine01"}
        health = {EPUM_CONF_HEALTH_MONITOR: True,
                  EPUM_CONF_HEALTH_BOOT: 10,
                  EPUM_CONF_HEALTH_MISSING: 5,
                  EPUM_CONF_HEALTH_ZOMBIE: 10,
                  EPUM_CONF_HEALTH_REALLY_MISSING: 3,
                  TESTCONF_HEALTH_INIT_TIME: health_init_time}
        engine = {CONF_PRESERVE_N: 1}
        return {EPUM_CONF_GENERAL: general,
                EPUM_CONF_ENGINE: engine,
                EPUM_CONF_HEALTH: health}

    def test_recovery(self):
        """Health states of pre-existing instances after a monitor (re)start at t=100."""
        self.epum.initialize()
        dom_config = self._dom_config(health_init_time=100)
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, dom_config)

        nodes = ["n" + str(i + 1) for i in range(7)]
        n1, n2, n3, n4, n5, n6, n7 = nodes

        # set up some instances that reached their iaas_state before the
        # init time (100)

        # this one has been running for well longer than the missing timeout
        # and we will have not received a heartbeat. It shouldn't be marked
        # OUT_OF_CONTACT until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceState.RUNNING, 50,
                                           InstanceHealthState.OK)

        # this has been running for 10 seconds before the init time but we
        # have never received a heartbeat. It should be marked as OUT_OF_CONTACT
        # after the boot timeout expires, starting from the init time.
        self.state.new_fake_instance_state(n2, InstanceState.RUNNING, 90,
                                           InstanceHealthState.UNKNOWN)

        # is terminated and nothing should happen
        self.state.new_fake_instance_state(n3, InstanceState.TERMINATED, 90,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 110, just before it would be
        # marked OUT_OF_CONTACT
        self.state.new_fake_instance_state(n4, InstanceState.RUNNING, 95,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 105, just before it would be
        # marked OUT_OF_CONTACT
        self.state.new_fake_instance_state(n5, InstanceState.RUNNING, 95,
                                           InstanceHealthState.OK)

        # this instance was already marked as errored before the recovery
        self.state.new_fake_instance_state(n6, InstanceState.RUNNING, 95,
                                           InstanceHealthState.PROCESS_ERROR)

        # this instance was a ZOMBIE, it should be initially marked back as
        # UNKNOWN and then if a heartbeat arrives it should be ZOMBIE again
        self.state.new_fake_instance_state(n7, InstanceState.TERMINATED, 80,
                                           InstanceHealthState.ZOMBIE)

        self.epum._doctor_appt(100)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        self.epum._doctor_appt(105)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        self.ok_heartbeat(n5, 105)
        self.ok_heartbeat(n7, 105)  # this one will be relabeled as a zombie
        self.err_heartbeat(n6, 105, procs=['a'])
        self.epum._doctor_appt(106)
        self.assertNodeState(InstanceHealthState.OK, n5)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n5, 110)
        self.epum._doctor_appt(110)
        self.assertNodeState(InstanceHealthState.OK, n5)

        # n1 has now been "out of contact" too long and is past the "really missing"
        # threshold, so it should now be MISSING
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n4, 110)
        self.err_heartbeat(n6, 110, procs=['a'])
        self.epum._doctor_appt(111)
        self.assertNodeState(InstanceHealthState.OK, n5, n4)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n2)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n3)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

    def test_basic(self):
        """Walk three nodes through UNKNOWN, OK, OUT_OF_CONTACT and ZOMBIE."""
        self.epum.initialize()
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, self._dom_config())

        nodes = [str(uuid.uuid4()) for _ in range(3)]
        n1, n2, n3 = nodes

        # not using real timestamps
        now = 0

        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceState.RUNNING, now)

        # all nodes are running but haven't been heard from
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)
        self.epum._doctor_appt(now)
        self.assertEqual(0, self.epum.doctor.monitors[self.domain_key].init_time)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        now = 5
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        # first heartbeat to n1
        self.ok_heartbeat(n1, now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        now = 10
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3)

        self.ok_heartbeat(n1, now)  # n1 makes it in under the wire
        self.ok_heartbeat(n2, now)
        now = 11
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n2)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n3)

        self.ok_heartbeat(n3, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        # ok don't hear from n2 for a while, should go missing
        now = 13
        self.ok_heartbeat(n1, now)

        now = 16
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n3)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n2)

        self.ok_heartbeat(n2, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 20

        # roll all nodes to terminated in IaaS
        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceState.TERMINATED, now)

        # been longer than missing window for n1 but shouldn't matter
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 30
        self.ok_heartbeat(n1, now)
        self.epum._doctor_appt(now)

        # not a zombie yet
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 31
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        self.ok_heartbeat(n1, now)
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n1)

        now = 42
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n1)

    def test_error(self):
        """A MONITOR_ERROR heartbeat marks the node and records its error."""
        self.epum.initialize()
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, self._dom_config())

        node = str(uuid.uuid4())

        now = 1
        self.state.new_fake_instance_state(node, InstanceState.RUNNING, now)
        self.ok_heartbeat(node, now)
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, node)

        now = 5
        self.err_heartbeat(node, now)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0], 'faiiiill')

        # the error state must survive the next health check
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node)

    def test_process_error(self):
        """A PROCESS_ERROR heartbeat records the failed process and its stderr."""
        self.epum.initialize()
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, self._dom_config())

        node = str(uuid.uuid4())

        now = 1
        self.state.new_fake_instance_state(node, InstanceState.RUNNING, now)
        self.ok_heartbeat(node, now)
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, node)

        now = 5
        procs = [{'name': 'proc1', 'stderr': 'faaaaaail', 'state': 100,
                  'exitcode': -1, 'stop_timestamp': 25242}]
        self.err_heartbeat(node, now, procs)
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0]['stderr'], 'faaaaaail')

        # a repeat of the same failure without stderr must keep the stderr
        # that was recorded the first time
        procs[0].pop('stderr')

        now = 8
        self.err_heartbeat(node, now, procs)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0]['stderr'], 'faaaaaail')

    def test_defibulator(self):
        """A silent node whose OU agent answers dump_state stays OK."""
        self.epum.initialize()
        dom_config = self._dom_config(health_init_time=100)
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, dom_config)

        self.ou_client.dump_state_called = 0
        self.ou_client.heartbeats_sent = 0
        self.ou_client.respond_to_dump_state = True

        # set up an instance that reached its iaas_state before the init time (100)
        n1 = "n1"

        # has been running for well longer than the missing timeout and we will
        # have not received a heartbeat. It shouldn't be marked OUT_OF_CONTACT
        # until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceState.RUNNING, 50,
                                           InstanceHealthState.OK)

        self.epum._doctor_appt(100)
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.epum._doctor_appt(105)
        self.assertNodeState(InstanceHealthState.OK, n1)

        self.assertEqual(0, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)
        self.epum._doctor_appt(106)
        # back to OK
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.assertEqual(1, self.ou_client.dump_state_called)
        self.assertEqual(1, self.ou_client.heartbeats_sent)

    def test_defibulator_failure(self):
        """A silent node whose OU agent ignores dump_state goes OUT_OF_CONTACT, then MISSING."""
        self.epum.initialize()
        dom_config = self._dom_config(health_init_time=100)
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, dom_config)

        self.ou_client.dump_state_called = 0
        self.ou_client.heartbeats_sent = 0
        self.ou_client.respond_to_dump_state = False  # i.e., the node is really gone

        # set up an instance that reached its iaas_state before the init time (100)
        n1 = "Poor Yorick"

        # has been running for well longer than the missing timeout and we will
        # have not received a heartbeat. It shouldn't be marked OUT_OF_CONTACT
        # until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceState.RUNNING, 50,
                                           InstanceHealthState.OK)

        self.epum._doctor_appt(100)
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.epum._doctor_appt(105)
        self.assertNodeState(InstanceHealthState.OK, n1)

        self.assertEqual(0, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)
        self.epum._doctor_appt(106)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n1)
        self.assertEqual(1, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)

        self.epum._doctor_appt(110)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertEqual(1, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)

    # ----------------------------------------------------------------------------------

    def assertNodeState(self, state, *node_ids):
        """Assert that every node in ``node_ids`` has health ``state`` in the fake store."""
        for n in node_ids:
            self.assertEqual(state, self.state.instances[n].health)

    def ok_heartbeat(self, node_id, timestamp):
        """Send an OK heartbeat for ``node_id`` stamped with ``timestamp``."""
        msg = {'node_id': node_id, 'timestamp': timestamp,
               'state': InstanceHealthState.OK}
        self.epum.msg_heartbeat(None, msg, timestamp=timestamp)

    def err_heartbeat(self, node_id, timestamp, procs=None):
        """Send an error heartbeat for ``node_id``.

        With a non-empty ``procs`` the heartbeat is a PROCESS_ERROR carrying
        them as ``failed_processes``; otherwise it is a MONITOR_ERROR with
        the error text 'faiiiill'.
        """
        msg = {'node_id': node_id, 'timestamp': timestamp}
        if procs:
            msg['state'] = InstanceHealthState.PROCESS_ERROR
            msg['failed_processes'] = procs
        else:
            msg['state'] = InstanceHealthState.MONITOR_ERROR
            msg['error'] = 'faiiiill'

        self.epum.msg_heartbeat(None, msg, timestamp=timestamp)