class PDDoctorTests(unittest.TestCase, StoreTestMixin):
    """Tests for PDDoctor start-up/recovery and the ExecutionResourceMonitor.

    Each test prepares the store in a particular combination of
    "initialized" and "system boot" flags, starts the doctor in a background
    thread and checks the resulting Process Dispatcher state and process
    states.
    """

    # heartbeat thresholds (in seconds) used by test_resource_monitor
    engine_conf = {
        'engine1': {
            'slots': 4,
            'heartbeat_period': 5,
            'heartbeat_warning': 10,
            'heartbeat_missing': 20,
        }
    }

    def setUp(self):
        """Build a core and a doctor on top of a fresh store and mock clients."""
        self.store = self.setup_store()
        self.registry = EngineRegistry.from_config(self.engine_conf)
        self.resource_client = MockResourceClient()
        self.notifier = MockNotifier()
        self.core = ProcessDispatcherCore(self.store, self.registry,
                                          self.resource_client, self.notifier)
        self.doctor = PDDoctor(self.core, self.store)

        self.docthread = None
        self.monitor = None

    def tearDown(self):
        """Stop the doctor thread (if one was started) and tear down the store."""
        try:
            if self.docthread is not None:
                self.doctor.cancel()
                self.docthread.join()
                self.docthread = None
        finally:
            # always release the store, even when stopping the doctor fails
            self.teardown_store()

    def setup_store(self):
        """Return the store under test; subclasses may override."""
        return ProcessDispatcherStore()

    def teardown_store(self):
        """Release the store under test; nothing to do for the default store."""
        return

    def _run_in_thread(self):
        """Start the doctor in a background thread and give it a moment to run."""
        self.docthread = tevent.spawn(self.doctor.inaugurate)
        time.sleep(0.05)

    def test_uninitialized_system_boot_with_state(self):
        """Doctor arrives at an uninitialized store with leftover state during system boot."""
        self.store.set_system_boot(True)
        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.RUNNING,
                               configuration=nosystemrestart_process_config(),
                               assigned=resource_id,
                               restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
                               assigned=resource_id)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
                               assigned=resource_id)
        self.store.add_process(p2)
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
                               assigned=resource_id)
        self.store.add_process(p3)

        # this one shouldn't restart
        p4 = ProcessRecord.new(None, "proc4", {}, ProcessState.RUNNING,
                               configuration=nosystemrestart_process_config(),
                               assigned=resource_id,
                               restart_mode=RestartMode.ABNORMAL)
        self.store.add_process(p4)

        # non-running processes should also potentially be restarted on boot
        p5 = ProcessRecord.new(None, "proc5", {}, ProcessState.WAITING)
        self.store.add_process(p5)
        self.store.enqueue_process(*p5.key)
        p6 = ProcessRecord.new(None, "proc6", {}, ProcessState.REQUESTED)
        self.store.add_process(p6)

        # not this one, due to RestartMode
        p7 = ProcessRecord.new(None, "proc7", {}, ProcessState.REQUESTED,
                               configuration=nosystemrestart_process_config(),
                               restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p7)
        self.store.enqueue_process(*p7.key)

        resource = self.store.get_resource(resource_id)
        resource.assigned = [p0.key, p1.key, p2.key, p3.key, p4.key]
        self.store.update_resource(resource)

        restartable_procs = ["proc1", "proc2", "proc5", "proc6"]
        dead_procs = ["proc0", "proc4", "proc7"]

        self._run_in_thread()

        self.assertTrue(self.store.wait_initialized(timeout=10))

        # the doctor should have wiped queue, nodes and resources
        self.assertEqual(len(self.store.get_queued_processes()), 0)
        self.assertEqual(len(self.store.get_node_ids()), 0)
        self.assertEqual(len(self.store.get_resource_ids()), 0)

        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.UNSCHEDULED_PENDING)
        for proc in dead_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.TERMINATED)
        self.assertEqual(self.store.get_process(None, "proc3").state,
                         ProcessState.TERMINATED)

        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.SYSTEM_BOOTING)

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()),
                         len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)

    def test_uninitialized_system_boot_without_state(self):
        """Doctor arrives at an empty, uninitialized store during system boot."""
        self.store.set_system_boot(True)
        self._run_in_thread()
        self.assertTrue(self.store.wait_initialized(timeout=10))
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.SYSTEM_BOOTING)
        self.store.set_system_boot(False)
        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

    def test_uninitialized_not_system_boot_with_procs(self):
        # tests the case where doctor arrives to an uninitialized system
        # that is not doing a system boot. HOWEVER, there are procs in the
        # UNSCHEDULED_PENDING state. This would likely only happen if the
        # PD died during system boot and recovered after the system boot flag
        # was turned off. Very small window, but possible.
        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p2)

        restartable_procs = ["proc0", "proc1", "proc2"]
        self._run_in_thread()

        self.assertTrue(self.store.wait_initialized(timeout=10))
        self.assertEqual(self.store.get_pd_state(), ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()),
                         len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)

    def test_uninitialized_not_system_boot_without_procs(self):
        # tests the case where doctor arrives to an uninitialized system
        # that is not doing a system boot and has no UNSCHEDULED_PENDING procs.
        # this is likely a recovery from all-PD-workers failing and resuming in
        # a running system, or Zookeeper issue
        self._run_in_thread()
        self.assertTrue(self.store.wait_initialized(timeout=10))
        self.assertEqual(self.store.get_pd_state(), ProcessDispatcherState.OK)

    def test_initialized_system_boot_with_procs(self):
        # tests the case where just the doctor dies in the middle of system boot
        # but after a doctor has already declared the system initialized. In this
        # case we have processes in the UNSCHEDULED_PENDING state that should be
        # rescheduled once system boot ends.
        self.store.set_system_boot(True)
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p2)

        restartable_procs = ["proc0", "proc1", "proc2"]
        self._run_in_thread()

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()),
                         len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)

    def test_initialized_system_boot_without_procs(self):
        # tests the case where just the doctor dies in the middle of system boot
        # but after a doctor has already declared the system initialized. In this
        # case we have no processes to schedule on system boot completion.
        self.store.set_system_boot(True)
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

        self._run_in_thread()

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

    def test_initialized_not_system_boot(self):
        # recover into an already initialized and booted system. this is likely
        # a recovery from a doctor failure while the rest of the system was still
        # alive.
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.OK)

        self._run_in_thread()

        # we have nothing really to check here, yet. but at least we can make sure
        # the process is cancellable.

    def test_monitor_thread(self):
        """The doctor's monitor thread runs while the doctor does and stops on cancel."""
        self._run_in_thread()

        self.assertTrue(self.store.wait_initialized(timeout=10))
        self.assertEqual(self.store.get_pd_state(), ProcessDispatcherState.OK)

        self.assertIsNotNone(self.doctor.monitor)
        monitor_thread = self.doctor.monitor_thread
        self.assertIsNotNone(monitor_thread)
        self.assertTrue(monitor_thread.is_alive())

        # now cancel doctor. monitor should stop too
        self.doctor.cancel()
        wait(lambda: not monitor_thread.is_alive())

    def _setup_resource_monitor(self):
        """Create an ExecutionResourceMonitor, keep it on self.monitor and return it."""
        self.monitor = ExecutionResourceMonitor(self.core, self.store)
        return self.monitor

    def _send_heartbeat(self, resource_id, node_id, timestamp):
        """Deliver a heartbeat for resource_id on node_id stamped with timestamp."""
        self.core.ee_heartbeat(resource_id, make_beat(node_id, timestamp=timestamp))

    def assert_monitor_cycle(self, expected_delay, resource_states=None):
        """Run one monitor cycle and check its returned delay and resource states.

        expected_delay is compared with the value returned by monitor_cycle().
        resource_states, when given, maps resource id -> expected state.
        """
        self.assertEqual(expected_delay, self.monitor.monitor_cycle())

        if resource_states:
            # items() rather than iteritems(): works on Python 2 and 3
            for resource_id, expected_state in resource_states.items():
                found_state = self.store.get_resource(resource_id).state
                if found_state != expected_state:
                    self.fail("Resource %s state = %s. Expected %s" %
                              (resource_id, found_state, expected_state))

    def test_resource_monitor(self):
        """Walk a mocked clock forward and check OK/WARNING/MISSING/DISABLED transitions."""
        t0 = datetime(2012, 3, 13, 9, 30, 0, tzinfo=UTC)
        mock_now = Mock()
        mock_now.return_value = t0

        def increment_now(seconds):
            # advance the mocked clock and return the new "now"
            t = mock_now.return_value + timedelta(seconds=seconds)
            mock_now.return_value = t
            log.debug("THE TIME IS NOW: %s", t)
            return t

        monitor = self._setup_resource_monitor()
        monitor._now_func = mock_now

        # before there are any resources, monitor should work but return a None delay
        self.assertIsNone(monitor.monitor_cycle())

        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.RUNNING)

        # 3 resources. all report in at t0
        r1, r2, r3 = "eeagent_1", "eeagent_2", "eeagent_3"
        self._send_heartbeat(r1, "node1", t0)
        self._send_heartbeat(r2, "node1", t0)
        self._send_heartbeat(r3, "node1", t0)

        states = {r1: ExecutionResourceState.OK,
                  r2: ExecutionResourceState.OK,
                  r3: ExecutionResourceState.OK}

        self.assert_monitor_cycle(10, states)

        t1 = increment_now(5)  # :05
        # heartbeat comes in for r1 5 seconds later
        self._send_heartbeat(r1, "node1", t1)

        self.assert_monitor_cycle(5, states)

        increment_now(5)  # :10

        # no heartbeats for r2 and r3. they should be marked WARNING
        states[r2] = ExecutionResourceState.WARNING
        states[r3] = ExecutionResourceState.WARNING
        self.assert_monitor_cycle(5, states)

        increment_now(4)  # :14

        # r2 gets a heartbeat through, but its timestamp puts it still in the warning threshold
        self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=1))

        self.assert_monitor_cycle(1, states)

        increment_now(6)  # :20

        # r1 should go warning, r3 should go missing
        states[r1] = ExecutionResourceState.WARNING
        states[r3] = ExecutionResourceState.MISSING
        self.assert_monitor_cycle(4, states)

        t2 = increment_now(3)  # :23
        self._send_heartbeat(r1, "node1", t2)
        states[r1] = ExecutionResourceState.OK
        self.assert_monitor_cycle(1, states)

        t3 = increment_now(2)  # :25
        self._send_heartbeat(r3, "node1", t3)
        states[r2] = ExecutionResourceState.MISSING
        states[r3] = ExecutionResourceState.OK
        self.assert_monitor_cycle(8, states)

        increment_now(5)  # :30
        # heartbeat r2 enough to go back to WARNING, but still late
        self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=15))
        states[r2] = ExecutionResourceState.WARNING
        self.assert_monitor_cycle(3, states)

        t4 = increment_now(5)  # :35
        # disable r2 and heartbeat r1 and r3 (heartbeats arrive late, but that's ok)
        self._send_heartbeat(r1, "node1", t4)
        self._send_heartbeat(r3, "node1", t4)
        self.core.resource_change_state(self.store.get_resource(r2),
                                        ExecutionResourceState.DISABLED)

        states[r2] = ExecutionResourceState.DISABLED
        self.assert_monitor_cycle(10, states)
class ProcessDispatcherService(object):
    """PD service interface

    Wires together the store, engine registry, core, matchmaker and doctor,
    and exposes the core's operations as dashi message handlers.
    """

    def __init__(self, amqp_uri=None, topic="process_dispatcher", registry=None,
                 store=None, epum_client=None, notifier=None, definition_id=None,
                 domain_config=None, sysname=None):
        """Load configuration, connect to dashi and build all components.

        Every collaborator passed as an argument (registry, store,
        epum_client, notifier) takes precedence over the one that would
        otherwise be built from configuration.
        """
        configs = ["service", "processdispatcher"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)
        self.topic = self.CFG.processdispatcher.get('service_name', topic)

        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG,
                                             amqp_uri=amqp_uri, sysname=sysname)

        engine_conf = self.CFG.processdispatcher.get('engines', {})
        default_engine = self.CFG.processdispatcher.get('default_engine')
        process_engines = self.CFG.processdispatcher.get('process_engines')
        if default_engine is None and len(engine_conf) == 1:
            # a single configured engine is implicitly the default.
            # next(iter(...)) also works where keys() returns a view.
            default_engine = next(iter(engine_conf))

        self.store = store or get_processdispatcher_store(self.CFG)
        self.store.initialize()
        self.registry = registry or EngineRegistry.from_config(
            engine_conf, default=default_engine, process_engines=process_engines)
        self.eeagent_client = EEAgentClient(self.dashi)

        domain_definition_id = None
        base_domain_config = None
        # allow disabling communication with EPUM for epuharness case
        if epum_client:
            self.epum_client = epum_client
            domain_definition_id = definition_id
            base_domain_config = domain_config
        elif not self.CFG.processdispatcher.get('static_resources'):
            domain_definition_id = (definition_id or
                                    self.CFG.processdispatcher.get('definition_id'))
            base_domain_config = (domain_config or
                                  self.CFG.processdispatcher.get('domain_config'))
            epum_service_name = self.CFG.processdispatcher.get(
                'epum_service_name', 'epu_management_service')
            self.epum_client = EPUManagementClient(self.dashi, epum_service_name)
        else:
            self.epum_client = None

        if notifier:
            self.notifier = notifier
        else:
            self.notifier = SubscriberNotifier(self.dashi)

        self.core = ProcessDispatcherCore(self.store, self.registry,
                                          self.eeagent_client, self.notifier)

        launch_type = self.CFG.processdispatcher.get('launch_type', 'supd')
        restart_throttling_config = self.CFG.processdispatcher.get(
            'restart_throttling_config', {})
        dispatch_retry_seconds = self.CFG.processdispatcher.get(
            'dispatch_retry_seconds')

        self.matchmaker = PDMatchmaker(
            self.core, self.store, self.eeagent_client, self.registry,
            self.epum_client, self.notifier, self.topic, domain_definition_id,
            base_domain_config, launch_type, restart_throttling_config,
            dispatch_retry_seconds)

        self.doctor = PDDoctor(self.core, self.store, config=self.CFG)
        # set once all handlers are registered, cleared again by stop()
        self.ready_event = threading.Event()

    def start(self):
        """Start elections, register the dashi handlers and consume messages.

        Blocks in dashi.consume() until consumption ends or a
        KeyboardInterrupt is received.
        """
        # start the doctor before we do anything else
        log.debug("Starting doctor election")
        self.doctor.start_election()

        log.debug("Waiting for Doctor to initialize the Process Dispatcher")
        # wait for the store to be initialized before proceeding. The doctor
        # (maybe not OUR doctor, but whoever gets elected), will check the
        # state of the system and then mark it as initialized.
        self.store.wait_initialized()

        epu.dashiproc.link_dashi_exceptions(self.dashi)

        # registration order is kept identical to the original explicit list
        plain_handlers = (
            self.set_system_boot,
            self.create_definition,
            self.describe_definition,
            self.update_definition,
            self.remove_definition,
            self.list_definitions,
            self.create_process,
            self.schedule_process,
            self.describe_process,
            self.describe_processes,
            self.restart_process,
            self.terminate_process,
            self.node_state,
        )
        for handler in plain_handlers:
            self.dashi.handle(handler)
        # heartbeat additionally receives the sender's name
        self.dashi.handle(self.heartbeat, sender_kwarg='sender')
        self.dashi.handle(self.dump)

        self.matchmaker.start_election()

        self.ready_event.set()

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Bye!")
        else:
            log.info("Exiting normally. Bye!")

    def stop(self):
        """Stop consuming, disconnect from dashi and shut down the store.

        Every step is attempted even when an earlier one raises, so a failing
        dashi call cannot leave the store (or the connection) open. The
        exception still propagates to the caller.
        """
        self.ready_event.clear()
        try:
            try:
                self.dashi.cancel()
            finally:
                self.dashi.disconnect()
        finally:
            self.store.shutdown()

    def _make_process_dict(self, proc):
        """Return the upid, state, round and assigned fields of proc as a dict."""
        return dict(upid=proc.upid, state=proc.state, round=proc.round,
                    assigned=proc.assigned)

    def set_system_boot(self, system_boot):
        """Forward the system boot flag to the core."""
        self.core.set_system_boot(system_boot)

    def create_definition(self, definition_id, definition_type, executable,
                          name=None, description=None):
        """Create a process definition through the core."""
        self.core.create_definition(definition_id, definition_type, executable,
                                    name=name, description=description)

    def describe_definition(self, definition_id):
        """Return the core's description of a process definition."""
        return self.core.describe_definition(definition_id)

    def update_definition(self, definition_id, definition_type, executable,
                          name=None, description=None):
        """Update a process definition through the core."""
        self.core.update_definition(definition_id, definition_type, executable,
                                    name=name, description=description)

    def remove_definition(self, definition_id):
        """Remove a process definition through the core."""
        self.core.remove_definition(definition_id)

    def list_definitions(self):
        """Return the core's list of process definitions."""
        return self.core.list_definitions()

    def create_process(self, upid, definition_id, name=None):
        """Create a process (owner None) and return its summary dict."""
        result = self.core.create_process(None, upid, definition_id, name=name)
        return self._make_process_dict(result)

    def schedule_process(self, upid, definition_id=None, configuration=None,
                         subscribers=None, constraints=None,
                         queueing_mode=None, restart_mode=None,
                         execution_engine_id=None, node_exclusive=None,
                         name=None):
        """Schedule a process (owner None) and return its summary dict."""
        result = self.core.schedule_process(
            None, upid=upid, definition_id=definition_id,
            configuration=configuration, subscribers=subscribers,
            constraints=constraints, queueing_mode=queueing_mode,
            restart_mode=restart_mode, node_exclusive=node_exclusive,
            execution_engine_id=execution_engine_id, name=name)
        return self._make_process_dict(result)

    def describe_process(self, upid):
        """Return the core's description of one process (owner None)."""
        return self.core.describe_process(None, upid)

    def describe_processes(self):
        """Return the core's description of all processes."""
        return self.core.describe_processes()

    def restart_process(self, upid):
        """Restart a process (owner None) and return its summary dict."""
        result = self.core.restart_process(None, upid)
        return self._make_process_dict(result)

    def terminate_process(self, upid):
        """Terminate a process (owner None) and return its summary dict."""
        result = self.core.terminate_process(None, upid)
        return self._make_process_dict(result)

    def node_state(self, node_id, domain_id, state, properties=None):
        """Forward a node state change to the core."""
        self.core.node_state(node_id, domain_id, state, properties=properties)

    def heartbeat(self, sender, message):
        """Log an incoming heartbeat and forward it to the core."""
        log.debug("got heartbeat from %s: %s", sender, message)
        self.core.ee_heartbeat(sender, message)

    def dump(self):
        """Return the core's dump()."""
        return self.core.dump()
class ProcessDispatcherCoreTests(unittest.TestCase):
    """Unit tests for ProcessDispatcherCore against a store and mocked clients."""

    engine_conf = {
        "engine1": {"slots": 4},
        "engine2": {"slots": 4},
        "engine3": {"slots": 2},
        "engine4": {"slots": 2},
    }

    def setUp(self):
        """Build a core on a fresh store with a Mock resource client."""
        self.store = self.get_store()
        self.registry = EngineRegistry.from_config(self.engine_conf)
        self.resource_client = Mock()
        self.notifier = MockNotifier()
        self.core = ProcessDispatcherCore(self.store, self.registry,
                                          self.resource_client, self.notifier)

    def get_store(self):
        """Return the store under test; subclasses may override."""
        return ProcessDispatcherStore()

    def test_add_remove_node(self):
        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.RUNNING)

        node = self.store.get_node("node1")
        self.assertIsNotNone(node)
        self.assertEqual(node.node_id, "node1")
        self.assertEqual(node.domain_id, domain_id_from_engine("engine1"))

        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.TERMINATING)
        node = self.store.get_node("node1")
        self.assertIsNone(node)

        # this shouldn't cause any problems even though node is gone
        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.TERMINATED)

    def test_add_remove_node_with_resource(self):
        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        resource = self.store.get_resource(resource_id)
        self.assertIsNotNone(resource)
        self.assertEqual(resource.state, ExecutionResourceState.OK)

        # now send a terminated state for the node. resource should be removed.
        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.TERMINATED)

        self.assertIsNone(self.store.get_resource(resource_id))
        self.assertIsNone(self.store.get_node("node1"))

    def test_add_remove_node_with_resource_and_processes(self):
        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        # set up a few processes on the resource
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
                               assigned=resource_id)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
                               assigned=resource_id)
        self.store.add_process(p2)
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
                               assigned=resource_id)
        self.store.add_process(p3)

        resource = self.store.get_resource(resource_id)
        resource.assigned = [p1.key, p2.key, p3.key]
        self.store.update_resource(resource)

        # now send a terminated state for the node. resource should be removed.
        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.TERMINATED)

        self.assertIsNone(self.store.get_resource(resource_id))
        self.assertIsNone(self.store.get_node("node1"))

        queued_processes = set(self.store.get_queued_processes())

        # these two should have been rescheduled
        for procname in ("proc1", "proc2"):
            proc = self.store.get_process(None, procname)
            self.assertEqual(proc.state, ProcessState.DIED_REQUESTED)
            self.assertEqual(proc.round, 1)
            self.assertIn(proc.key, queued_processes)
            self.notifier.assert_process_state(procname,
                                               ProcessState.DIED_REQUESTED)

        # this one should be terminated
        proc3 = self.store.get_process(None, "proc3")
        self.assertEqual(proc3.state, ProcessState.TERMINATED)
        self.assertEqual(proc3.round, 0)
        self.assertNotIn(proc3.key, queued_processes)
        self.notifier.assert_process_state("proc3", ProcessState.TERMINATED)

    def test_terminate_not_found(self):
        # process which doesn't exist
        with self.assertRaises(NotFoundError):
            self.core.terminate_process(None, "notarealprocess")

    def test_terminate_terminal_process(self):
        # processes which are already in a terminal state shouldn't change
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.UNSCHEDULED)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.UNSCHEDULED_PENDING)
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATED)
        p4 = ProcessRecord.new(None, "proc4", {}, ProcessState.EXITED)
        p5 = ProcessRecord.new(None, "proc5", {}, ProcessState.FAILED)
        p6 = ProcessRecord.new(None, "proc6", {}, ProcessState.REJECTED)
        terminal_procs = (p1, p2, p3, p4, p5, p6)

        for p in terminal_procs:
            self.store.add_process(p)

        for p in terminal_procs:
            gotproc = self.core.terminate_process(None, p.upid)

            self.assertEqual(gotproc.upid, p.upid)
            self.assertEqual(gotproc.state, p.state)

            stored = self.store.get_process(None, p.upid)
            self.assertEqual(stored.state, p.state)

        # call_count on the Mock itself only counts calls to the mock object,
        # never its methods; check the method that would do the termination.
        self.assertEqual(self.resource_client.terminate_process.call_count, 0)

    def test_terminate_unassigned_process(self):
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.WAITING)
        self.store.add_process(p1)
        self.store.enqueue_process(*p1.key)

        gotproc = self.core.terminate_process(None, "proc1")

        self.assertEqual(gotproc.upid, "proc1")
        self.assertEqual(gotproc.state, ProcessState.TERMINATED)

        p1 = self.store.get_process(None, "proc1")
        self.assertEqual(p1.state, ProcessState.TERMINATED)
        self.notifier.assert_process_state("proc1", ProcessState.TERMINATED)

        # should be gone from queue too
        self.assertFalse(self.store.get_queued_processes())
        # see test_terminate_terminal_process: assert on the method, not the Mock
        self.assertEqual(self.resource_client.terminate_process.call_count, 0)

    def test_terminate_raciness(self):
        # ensure process is TERMINATING before resource client is called
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING)
        p1.assigned = "hats"
        self.store.add_process(p1)

        def assert_process_terminating(resource_id, upid, round):
            # runs in place of the resource client's terminate_process
            self.assertEqual(resource_id, "hats")
            self.assertEqual(upid, "proc1")
            process = self.store.get_process(None, upid)
            self.assertEqual(process.state, ProcessState.TERMINATING)

        self.resource_client.terminate_process.side_effect = assert_process_terminating

        self.core.terminate_process(None, "proc1")

        self.resource_client.terminate_process.assert_called_once_with(
            "hats", "proc1", 0)
        self.notifier.assert_process_state("proc1", ProcessState.TERMINATING)

    def test_terminate_assigned(self):
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.ASSIGNED)
        p1.assigned = "hats"
        self.store.add_process(p1)

        self.core.terminate_process(None, "proc1")

        self.resource_client.terminate_process.assert_called_once_with(
            "hats", "proc1", 0)
        self.notifier.assert_process_state("proc1", ProcessState.TERMINATING)

    def test_terminate_retry(self):
        # try to kill a process that is already terminating
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.TERMINATING)
        p1.assigned = "hats"
        self.store.add_process(p1)

        self.core.terminate_process(None, "proc1")

        self.resource_client.terminate_process.assert_called_once_with(
            "hats", "proc1", 0)
        self.notifier.assert_no_process_state()

    def test_process_subscribers(self):
        proc = "proc1"
        definition = "def1"
        subscribers = [("destination", "operation")]
        self.core.create_definition(definition, None, None)
        self.core.create_process(None, proc, definition)
        self.core.schedule_process(None, proc, subscribers=subscribers)

        record = self.store.get_process(None, proc)

        self.assertEqual(len(record.subscribers), len(subscribers))
        for a, b in zip(record.subscribers, subscribers):
            self.assertEqual(a[0], b[0])
            self.assertEqual(a[1], b[1])

    def test_schedule_notfound(self):
        # scheduling an unknown process
        proc = "proc1"
        with self.assertRaises(NotFoundError):
            self.core.schedule_process(None, proc)

    def test_schedule_new_process(self):
        proc = "proc1"
        definition = "def1"
        self.core.create_definition(definition, None, None)

        process = self.core.schedule_process(None, proc, definition)
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

    def test_create_idempotency(self):
        proc = "proc1"
        definition = "def1"
        another_definition = "def2"
        self.core.create_definition(definition, None, None)
        self.core.create_definition(another_definition, None, None)

        process = self.core.create_process(None, proc, definition)
        self.assertEqual(process.state, ProcessState.UNSCHEDULED)
        self.assertEqual(process.upid, proc)

        # calling again is fine
        process = self.core.create_process(None, proc, definition)
        self.assertEqual(process.state, ProcessState.UNSCHEDULED)
        self.assertEqual(process.upid, proc)

        # with a different definition is not fine
        with self.assertRaises(BadRequestError):
            self.core.create_process(None, proc, another_definition)

        # nor with a different name
        with self.assertRaises(BadRequestError):
            self.core.create_process(None, proc, definition, name="hats")

    def test_schedule_idempotency(self):
        proc = "proc1"
        definition = "def1"

        self.core.create_definition(definition, None, None)

        process = self.core.create_process(None, proc, definition)
        self.assertEqual(process.state, ProcessState.UNSCHEDULED)
        self.assertEqual(process.upid, proc)

        process = self.core.schedule_process(None, proc)
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

        # calling again is fine
        process = self.core.schedule_process(None, proc)
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

        # with a different parameter is not fine
        with self.assertRaises(BadRequestError):
            self.core.schedule_process(None, proc,
                                       restart_mode=RestartMode.ALWAYS)

        with self.assertRaises(BadRequestError):
            self.core.schedule_process(None, proc,
                                       queueing_mode=QueueingMode.START_ONLY)

    def test_schedule_idempotency_procname(self):
        proc = "proc1"
        definition = "def1"

        self.core.create_definition(definition, None, None)

        # special case: changing process name is ok
        process = self.core.create_process(None, proc, definition, name="name1")
        self.assertEqual(process.state, ProcessState.UNSCHEDULED)
        self.assertEqual(process.upid, proc)

        process = self.core.schedule_process(None, proc, name="name2")
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

        # special case: different process name is ok
        process = self.core.schedule_process(None, proc, name="name3")
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

    def test_process_should_restart(self):
        definition = "def1"
        self.core.create_definition(definition, None, None)

        abnormal_states = (ProcessState.TERMINATED, ProcessState.TERMINATING,
                           ProcessState.FAILED)
        all_states = (ProcessState.TERMINATED, ProcessState.TERMINATING,
                      ProcessState.FAILED, ProcessState.EXITED)

        should_restart = self.core.process_should_restart

        # default behavior is to restart processes that exit abnormally
        process = self.core.schedule_process(None, uuid.uuid4().hex, definition)
        for state in abnormal_states:
            self.assertTrue(should_restart(process, state))
            # system restart mode doesn't matter
            self.assertTrue(should_restart(process, state,
                                           is_system_restart=True))
        self.assertFalse(should_restart(process, ProcessState.EXITED))
        self.assertFalse(should_restart(process, ProcessState.EXITED,
                                        is_system_restart=True))

        # same with explicit RestartMode.ABNORMAL specified
        process = self.core.schedule_process(None, uuid.uuid4().hex, definition,
                                             restart_mode=RestartMode.ABNORMAL)
        for state in abnormal_states:
            self.assertTrue(should_restart(process, state))
            self.assertTrue(should_restart(process, state,
                                           is_system_restart=True))
        self.assertFalse(should_restart(process, ProcessState.EXITED))
        self.assertFalse(should_restart(process, ProcessState.EXITED,
                                        is_system_restart=True))

        # RestartMode.NEVER
        process = self.core.schedule_process(None, uuid.uuid4().hex, definition,
                                             restart_mode=RestartMode.NEVER)
        for state in all_states:
            self.assertFalse(should_restart(process, state))
            self.assertFalse(should_restart(process, state,
                                            is_system_restart=True))

        # RestartMode.ALWAYS
        process = self.core.schedule_process(None, uuid.uuid4().hex, definition,
                                             restart_mode=RestartMode.ALWAYS)
        for state in all_states:
            self.assertTrue(should_restart(process, state))
            self.assertTrue(should_restart(process, state,
                                           is_system_restart=True))

        # RestartMode.ALWAYS with process.omit_from_system_restart
        process = self.core.schedule_process(
            None, uuid.uuid4().hex, definition,
            restart_mode=RestartMode.ALWAYS,
            configuration=nosystemrestart_process_config(),
        )
        for state in all_states:
            self.assertTrue(should_restart(process, state))
            self.assertFalse(should_restart(process, state,
                                            is_system_restart=True))

        # RestartMode.ABNORMAL with process.omit_from_system_restart
        process = self.core.schedule_process(
            None, uuid.uuid4().hex, definition,
            restart_mode=RestartMode.ABNORMAL,
            configuration=nosystemrestart_process_config(),
        )
        for state in abnormal_states:
            self.assertTrue(should_restart(process, state))
            self.assertFalse(should_restart(process, state,
                                            is_system_restart=True))
        self.assertFalse(should_restart(process, ProcessState.EXITED))
        self.assertFalse(should_restart(process, ProcessState.EXITED,
                                        is_system_restart=True))

        # ensure that a process with a busted config doesn't raise an error
        process = self.core.schedule_process(
            None, uuid.uuid4().hex, definition,
            restart_mode=RestartMode.ALWAYS,
            configuration={"process": ["what is a list doing here??"]},
        )
        for state in all_states:
            self.assertTrue(should_restart(process, state))

    def test_heartbeat_node_update_race(self):
        # test processing two beats simultaneously, for eeagents in the same node.
        # check that they don't collide updating the node record
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"),
                             InstanceState.RUNNING)

        beat = make_beat(node_id)

        # this beat gets injected while the other is in the midst of processing
        sneaky_beat = make_beat(node_id)

        # when the PD attempts to update the node, sneak in an update
        # first so the request conflicts
        original_update_node = self.store.update_node

        def patched_update_node(node):
            # unpatch ourself first so we don't recurse forever
            self.store.update_node = original_update_node

            self.core.ee_heartbeat("eeagent2", sneaky_beat)
            original_update_node(node)

        self.store.update_node = patched_update_node

        self.core.ee_heartbeat("eeagent1", beat)

        node = self.store.get_node(node_id)
        self.assertEqual({"eeagent1", "eeagent2"}, set(node.resources))

    def test_heartbeat_node_removed(self):
        # test processing a heartbeat where node is removed partway through
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"),
                             InstanceState.RUNNING)

        beat = make_beat(node_id)

        original_update_node = self.store.update_node

        def patched_update_node(node):
            # unpatch ourself first so we don't recurse forever
            self.store.update_node = original_update_node
            self.store.remove_node(node.node_id)
            original_update_node(node)

        self.store.update_node = patched_update_node

        # this shouldn't blow up, and no resource should be added
        self.core.ee_heartbeat("eeagent1", beat)
        self.assertIsNone(self.store.get_resource("eeagent1"))

    def test_heartbeat_timestamps(self):
        # test that a resource's last heartbeat time only moves forward, even
        # when heartbeats arrive out of order
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"),
                             InstanceState.RUNNING)

        d1 = parse_datetime("2013-04-02T19:37:57.617734+00:00")
        d2 = parse_datetime("2013-04-02T19:38:57.617734+00:00")
        d3 = parse_datetime("2013-04-02T19:39:57.617734+00:00")

        self.core.ee_heartbeat("eeagent1",
                               make_beat(node_id, timestamp=d1.isoformat()))

        resource = self.store.get_resource("eeagent1")
        self.assertEqual(resource.last_heartbeat_datetime, d1)

        self.core.ee_heartbeat("eeagent1",
                               make_beat(node_id, timestamp=d3.isoformat()))
        resource = self.store.get_resource("eeagent1")
        self.assertEqual(resource.last_heartbeat_datetime, d3)

        # out of order hbeat. time shouldn't be updated
        self.core.ee_heartbeat("eeagent1",
                               make_beat(node_id, timestamp=d2.isoformat()))
        resource = self.store.get_resource("eeagent1")
        self.assertEqual(resource.last_heartbeat_datetime, d3)

    def test_get_process_constraints(self):
        """test_get_process_constraints

        ensure that order of precedence of engine ids is correct. Should be:

        1. process target - when a process is scheduled, an execution_engine_id
        can be specified in the request's ProcessTarget object. If specified,
        this EE is used.
        2. process/engine mappings - the CEI Launch YML file contains a
        process_engines mapping of process packages to EE names. If the process'
        module matches an entry in this configuration, the associated EE is
        chosen. This format is described below.
        3. default execution engine - the CEI Launch YML file also must specify
        a default_execution_engine value. This is used as a last resort.
        """
        # NOTE(review): only cases 1 and 3 are asserted below; case 2 (the
        # "my" -> "engine4" mapping on its own) has no assertion -- confirm
        # whether that is intentional.
        self.registry.set_process_engine_mapping("my", "engine4")
        self.registry.default = "engine1"

        process_definition = {
            "executable": {"module": "my.test", "class": "MyClass"}
        }
        process_constraints = {"engine": "mostimportantengine"}

        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.PENDING)
        constraints = self.core.get_process_constraints(p1)
        self.assertEqual(constraints["engine"], self.registry.default)

        p3 = ProcessRecord.new(None, "proc3", process_definition,
                               ProcessState.PENDING,
                               constraints=process_constraints)
        constraints = self.core.get_process_constraints(p3)
        self.assertEqual(constraints["engine"], "mostimportantengine")