Example #1
    def setUp(self):
        self.store = self.setup_store()
        self.registry = EngineRegistry.from_config(self.engine_conf)
        self.resource_client = MockResourceClient()
        self.notifier = MockNotifier()
        self.core = ProcessDispatcherCore(self.store, self.registry,
            self.resource_client, self.notifier)
        self.doctor = PDDoctor(self.core, self.store)

        self.docthread = None

        self.monitor = None
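
Example #1 wires the store, registry, mock resource client, and notifier into a ProcessDispatcherCore and hands that to PDDoctor, but does not show how the doctor is driven. A minimal sketch of how this fixture is typically exercised, following the _run_in_thread/tearDown pattern that appears in Example #5:

    # Sketch only (pattern borrowed from Example #5): run the doctor's leader
    # loop in a background thread and make sure tearDown can cancel and join it.
    def _run_in_thread(self):
        self.docthread = tevent.spawn(self.doctor.inaugurate)
        time.sleep(0.05)

    def tearDown(self):
        if self.docthread:
            self.doctor.cancel()
            self.docthread.join()
            self.docthread = None
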
Example #2
class PDDoctorMockTests(unittest.TestCase):

    engine_conf = {'engine1': {'slots': 4, 'heartbeat_period': 5,
                   'heartbeat_warning': 10, 'heartbeat_missing': 20}}

    def setUp(self):
        self.store = Mock()
        self.store.get_pd_state = Mock(return_value=ProcessDispatcherState.OK)
        self.registry = EngineRegistry.from_config(self.engine_conf)
        self.resource_client = MockResourceClient()
        self.notifier = MockNotifier()
        self.core = ProcessDispatcherCore(self.store, self.registry,
            self.resource_client, self.notifier)
        self.doctor = PDDoctor(self.core, self.store)

        self.docthread = None

        self.monitor = None

    def tearDown(self):
        if self.docthread:
            self.doctor.cancel()
            self.docthread.join()
            self.docthread = None

    def test_initialize(self):
        self.doctor.is_leader = True
        self.doctor.config[self.doctor.CONFIG_MONITOR_HEARTBEATS] = False

        def stop_being_leader_in_one_sec():
            time.sleep(1)
            with self.doctor.condition:
                self.doctor.is_leader = False
                self.doctor.condition.notify_all()

        tevent.spawn(stop_being_leader_in_one_sec)
        self.doctor._initialize()
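
test_initialize relies on PDDoctor blocking inside _initialize() until leadership is released via its condition variable; the spawned helper clears is_leader and calls notify_all() to unblock it. A generic sketch of that coordination pattern in plain threading terms (an illustration, not the actual PDDoctor code):

import threading

class LeaderLoop(object):
    """Illustrative stand-in for the wait loop the test unblocks."""

    def __init__(self):
        self.condition = threading.Condition()
        self.is_leader = True

    def run(self):
        # Block until another thread clears is_leader and calls notify_all(),
        # which is exactly what stop_being_leader_in_one_sec() does above.
        with self.condition:
            while self.is_leader:
                self.condition.wait(1)
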
Example #3
    def __init__(self, amqp_uri=None, topic="process_dispatcher", registry=None,
                 store=None, epum_client=None, notifier=None, definition_id=None,
                 domain_config=None, sysname=None):

        configs = ["service", "processdispatcher"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)
        self.topic = self.CFG.processdispatcher.get('service_name', topic)

        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG,
                                             amqp_uri=amqp_uri, sysname=sysname)

        engine_conf = self.CFG.processdispatcher.get('engines', {})
        default_engine = self.CFG.processdispatcher.get('default_engine')
        process_engines = self.CFG.processdispatcher.get('process_engines')
        if default_engine is None and len(engine_conf.keys()) == 1:
            default_engine = engine_conf.keys()[0]
        self.store = store or get_processdispatcher_store(self.CFG)
        self.store.initialize()
        self.registry = registry or EngineRegistry.from_config(engine_conf,
            default=default_engine, process_engines=process_engines)
        self.eeagent_client = EEAgentClient(self.dashi)

        domain_definition_id = None
        base_domain_config = None
        # allow disabling communication with EPUM for epuharness case
        if epum_client:
            self.epum_client = epum_client
            domain_definition_id = definition_id
            base_domain_config = domain_config
        elif not self.CFG.processdispatcher.get('static_resources'):
            domain_definition_id = definition_id or self.CFG.processdispatcher.get('definition_id')
            base_domain_config = domain_config or self.CFG.processdispatcher.get('domain_config')
            epum_service_name = self.CFG.processdispatcher.get('epum_service_name',
                    'epu_management_service')
            self.epum_client = EPUManagementClient(self.dashi, epum_service_name)

        else:
            self.epum_client = None

        if notifier:
            self.notifier = notifier
        else:
            self.notifier = SubscriberNotifier(self.dashi)

        self.core = ProcessDispatcherCore(self.store,
                                          self.registry,
                                          self.eeagent_client,
                                          self.notifier)

        launch_type = self.CFG.processdispatcher.get('launch_type', 'supd')
        restart_throttling_config = self.CFG.processdispatcher.get('restart_throttling_config', {})
        dispatch_retry_seconds = self.CFG.processdispatcher.get('dispatch_retry_seconds')

        self.matchmaker = PDMatchmaker(self.core, self.store, self.eeagent_client,
            self.registry, self.epum_client, self.notifier, self.topic,
            domain_definition_id, base_domain_config, launch_type,
            restart_throttling_config, dispatch_retry_seconds)

        self.doctor = PDDoctor(self.core, self.store, config=self.CFG)
        self.ready_event = threading.Event()
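
The constructor pulls everything from the processdispatcher section of the configuration loaded by bootstrap.configure(). An illustrative plain-dict view of the keys it reads (the key names come from the code above; the values are made up for the sketch, and the real CFG object supports attribute access):

# Illustrative configuration covering the keys this constructor reads.
example_cfg = {
    'processdispatcher': {
        'service_name': 'process_dispatcher',   # overrides the topic argument
        'engines': {'engine1': {'slots': 4}},
        'default_engine': 'engine1',             # inferred when only one engine exists
        'process_engines': None,
        'static_resources': False,               # when falsy (and no epum_client), an EPUManagementClient is created
        'definition_id': None,
        'domain_config': None,
        'epum_service_name': 'epu_management_service',
        'launch_type': 'supd',
        'restart_throttling_config': {},
        'dispatch_retry_seconds': None,
    }
}
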
Example #4
class ProcessDispatcherService(object):
    """PD service interface
    """

    def __init__(self, amqp_uri=None, topic="process_dispatcher", registry=None,
                 store=None, epum_client=None, notifier=None, definition_id=None,
                 domain_config=None, sysname=None):

        configs = ["service", "processdispatcher"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)
        self.topic = self.CFG.processdispatcher.get('service_name', topic)

        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG,
                                             amqp_uri=amqp_uri, sysname=sysname)

        engine_conf = self.CFG.processdispatcher.get('engines', {})
        default_engine = self.CFG.processdispatcher.get('default_engine')
        process_engines = self.CFG.processdispatcher.get('process_engines')
        if default_engine is None and len(engine_conf.keys()) == 1:
            default_engine = engine_conf.keys()[0]
        self.store = store or get_processdispatcher_store(self.CFG)
        self.store.initialize()
        self.registry = registry or EngineRegistry.from_config(engine_conf,
            default=default_engine, process_engines=process_engines)
        self.eeagent_client = EEAgentClient(self.dashi)

        domain_definition_id = None
        base_domain_config = None
        # allow disabling communication with EPUM for epuharness case
        if epum_client:
            self.epum_client = epum_client
            domain_definition_id = definition_id
            base_domain_config = domain_config
        elif not self.CFG.processdispatcher.get('static_resources'):
            domain_definition_id = definition_id or self.CFG.processdispatcher.get('definition_id')
            base_domain_config = domain_config or self.CFG.processdispatcher.get('domain_config')
            epum_service_name = self.CFG.processdispatcher.get('epum_service_name',
                    'epu_management_service')
            self.epum_client = EPUManagementClient(self.dashi, epum_service_name)

        else:
            self.epum_client = None

        if notifier:
            self.notifier = notifier
        else:
            self.notifier = SubscriberNotifier(self.dashi)

        self.core = ProcessDispatcherCore(self.store,
                                          self.registry,
                                          self.eeagent_client,
                                          self.notifier)

        launch_type = self.CFG.processdispatcher.get('launch_type', 'supd')
        restart_throttling_config = self.CFG.processdispatcher.get('restart_throttling_config', {})
        dispatch_retry_seconds = self.CFG.processdispatcher.get('dispatch_retry_seconds')

        self.matchmaker = PDMatchmaker(self.core, self.store, self.eeagent_client,
            self.registry, self.epum_client, self.notifier, self.topic,
            domain_definition_id, base_domain_config, launch_type,
            restart_throttling_config, dispatch_retry_seconds)

        self.doctor = PDDoctor(self.core, self.store, config=self.CFG)
        self.ready_event = threading.Event()

    def start(self):

        # start the doctor before we do anything else
        log.debug("Starting doctor election")
        self.doctor.start_election()

        log.debug("Waiting for Doctor to initialize the Process Dispatcher")
        # wait for the store to be initialized before proceeding. The doctor
        # (maybe not OUR doctor, but whoever gets elected) will check the
        # state of the system and then mark it as initialized.
        self.store.wait_initialized()

        epu.dashiproc.link_dashi_exceptions(self.dashi)

        self.dashi.handle(self.set_system_boot)
        self.dashi.handle(self.create_definition)
        self.dashi.handle(self.describe_definition)
        self.dashi.handle(self.update_definition)
        self.dashi.handle(self.remove_definition)
        self.dashi.handle(self.list_definitions)
        self.dashi.handle(self.create_process)
        self.dashi.handle(self.schedule_process)
        self.dashi.handle(self.describe_process)
        self.dashi.handle(self.describe_processes)
        self.dashi.handle(self.restart_process)
        self.dashi.handle(self.terminate_process)
        self.dashi.handle(self.node_state)
        self.dashi.handle(self.heartbeat, sender_kwarg='sender')
        self.dashi.handle(self.dump)

        self.matchmaker.start_election()

        self.ready_event.set()

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Bye!")
        else:
            log.info("Exiting normally. Bye!")

    def stop(self):
        self.ready_event.clear()
        self.dashi.cancel()
        self.dashi.disconnect()
        self.store.shutdown()

    def _make_process_dict(self, proc):
        return dict(upid=proc.upid, state=proc.state, round=proc.round,
                    assigned=proc.assigned)

    def set_system_boot(self, system_boot):
        self.core.set_system_boot(system_boot)

    def create_definition(self, definition_id, definition_type, executable,
                          name=None, description=None):
        self.core.create_definition(definition_id, definition_type, executable,
            name=name, description=description)

    def describe_definition(self, definition_id):
        return self.core.describe_definition(definition_id)

    def update_definition(self, definition_id, definition_type, executable,
                          name=None, description=None):
        self.core.update_definition(definition_id, definition_type, executable,
            name=name, description=description)

    def remove_definition(self, definition_id):
        self.core.remove_definition(definition_id)

    def list_definitions(self):
        return self.core.list_definitions()

    def create_process(self, upid, definition_id, name=None):
        result = self.core.create_process(None, upid, definition_id, name=name)
        return self._make_process_dict(result)

    def schedule_process(self, upid, definition_id=None, configuration=None,
                         subscribers=None, constraints=None,
                         queueing_mode=None, restart_mode=None,
                         execution_engine_id=None, node_exclusive=None,
                         name=None):

        result = self.core.schedule_process(None, upid=upid,
            definition_id=definition_id, configuration=configuration,
            subscribers=subscribers, constraints=constraints,
            queueing_mode=queueing_mode, restart_mode=restart_mode,
            node_exclusive=node_exclusive,
            execution_engine_id=execution_engine_id, name=name)
        return self._make_process_dict(result)

    def describe_process(self, upid):
        return self.core.describe_process(None, upid)

    def describe_processes(self):
        return self.core.describe_processes()

    def restart_process(self, upid):
        result = self.core.restart_process(None, upid)
        return self._make_process_dict(result)

    def terminate_process(self, upid):
        result = self.core.terminate_process(None, upid)
        return self._make_process_dict(result)

    def node_state(self, node_id, domain_id, state, properties=None):
        self.core.node_state(node_id, domain_id, state, properties=properties)

    def heartbeat(self, sender, message):
        log.debug("got heartbeat from %s: %s", sender, message)
        self.core.ee_heartbeat(sender, message)

    def dump(self):
        return self.core.dump()
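
start() exposes each dashi.handle() registration as an RPC operation named after the method, consumed on self.topic; the heartbeat handler additionally receives the message sender via sender_kwarg='sender'. A hedged sketch of a caller, assuming dashi's call() helper and an already-connected DashiConnection:

# Sketch of a remote caller; 'dashi' is assumed to be a connected
# DashiConnection and 'topic' the service_name the PD consumes on.
def describe_process_via_dashi(dashi, topic, upid):
    # Invokes ProcessDispatcherService.describe_process(upid) over RPC and
    # returns whatever that handler returns.
    return dashi.call(topic, "describe_process", upid=upid)
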
Example #5
class PDDoctorTests(unittest.TestCase, StoreTestMixin):

    engine_conf = {'engine1': {'slots': 4, 'heartbeat_period': 5,
                   'heartbeat_warning': 10, 'heartbeat_missing': 20}}

    def setUp(self):
        self.store = self.setup_store()
        self.registry = EngineRegistry.from_config(self.engine_conf)
        self.resource_client = MockResourceClient()
        self.notifier = MockNotifier()
        self.core = ProcessDispatcherCore(self.store, self.registry,
            self.resource_client, self.notifier)
        self.doctor = PDDoctor(self.core, self.store)

        self.docthread = None

        self.monitor = None

    def tearDown(self):
        if self.docthread:
            self.doctor.cancel()
            self.docthread.join()
            self.docthread = None

        self.teardown_store()

    def setup_store(self):
        return ProcessDispatcherStore()

    def teardown_store(self):
        return

    def _run_in_thread(self):
        self.docthread = tevent.spawn(self.doctor.inaugurate)
        time.sleep(0.05)

    def test_uninitialized_system_boot_with_state(self):
        self.store.set_system_boot(True)
        self.core.node_state("node1", domain_id_from_engine("engine1"),
            InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.RUNNING,
                configuration=nosystemrestart_process_config(),
                assigned=resource_id,
                restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
                assigned=resource_id)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
            assigned=resource_id)
        self.store.add_process(p2)
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
            assigned=resource_id)
        self.store.add_process(p3)

        # this one shouldn't restart
        p4 = ProcessRecord.new(None, "proc4", {}, ProcessState.RUNNING,
                configuration=nosystemrestart_process_config(),
                assigned=resource_id,
                restart_mode=RestartMode.ABNORMAL)
        self.store.add_process(p4)

        # non-running processes should also potentially be restarted on boot
        p5 = ProcessRecord.new(None, "proc5", {}, ProcessState.WAITING)
        self.store.add_process(p5)
        self.store.enqueue_process(*p5.key)
        p6 = ProcessRecord.new(None, "proc6", {}, ProcessState.REQUESTED)
        self.store.add_process(p6)

        # not this one, due to RestartMode
        p7 = ProcessRecord.new(None, "proc7", {}, ProcessState.REQUESTED,
            configuration=nosystemrestart_process_config(),
            restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p7)
        self.store.enqueue_process(*p7.key)

        resource = self.store.get_resource(resource_id)
        resource.assigned = [p0.key, p1.key, p2.key, p3.key, p4.key]
        self.store.update_resource(resource)

        restartable_procs = ["proc1", "proc2", "proc5", "proc6"]
        dead_procs = ["proc0", "proc4", "proc7"]

        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)

        self.assertEqual(len(self.store.get_queued_processes()), 0)
        self.assertEqual(len(self.store.get_node_ids()), 0)
        self.assertEqual(len(self.store.get_resource_ids()), 0)

        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.UNSCHEDULED_PENDING)
        for proc in dead_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.TERMINATED)
        self.assertEqual(self.store.get_process(None, "proc3").state,
                         ProcessState.TERMINATED)

        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.SYSTEM_BOOTING)

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()), len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)

    def test_uninitialized_system_boot_without_state(self):
        self.store.set_system_boot(True)
        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.SYSTEM_BOOTING)
        self.store.set_system_boot(False)
        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

    def test_uninitialized_not_system_boot_with_procs(self):
        # tests the case where the doctor arrives at an uninitialized system
        # that is not doing a system boot. HOWEVER, there are procs in the
        # UNSCHEDULED_PENDING state. This would likely only happen if the
        # PD died during system boot and recovered after the system boot flag
        # was turned off. Very small window, but possible.

        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p2)

        restartable_procs = ["proc0", "proc1", "proc2"]
        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()), len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)

    def test_uninitialized_not_system_boot_without_procs(self):
        # tests the case where the doctor arrives at an uninitialized system
        # that is not doing a system boot and has no UNSCHEDULED_PENDING procs.
        # this is likely a recovery from all PD workers failing and resuming in
        # a running system, or a ZooKeeper issue
        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.OK)

    def test_initialized_system_boot_with_procs(self):
        # tests the case where just the doctor dies in the middle of system boot
        # but after a doctor has already declared the system initialized. In this
        # case we have processes in the UNSCHEDULED_PENDING state that should be
        # rescheduled once system boot ends.

        self.store.set_system_boot(True)
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p2)

        restartable_procs = ["proc0", "proc1", "proc2"]
        self._run_in_thread()

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()), len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)

    def test_initialized_system_boot_without_procs(self):
        # tests the case where just the doctor dies in the middle of system boot
        # but after a doctor has already declared the system initialized. In this
        # case we have no processes to schedule on system boot completion.

        self.store.set_system_boot(True)
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

        self._run_in_thread()

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

    def test_initialized_not_system_boot(self):
        # recover into an already initialized and booted system. this is likely
        # a recovery from a doctor failure while the rest of the system was still
        # alive.
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.OK)

        self._run_in_thread()

        # we have nothing really to check here yet, but at least we can make sure
        # the process is cancellable.

    def test_monitor_thread(self):
        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.OK)

        self.assertIsNotNone(self.doctor.monitor)
        monitor_thread = self.doctor.monitor_thread
        self.assertIsNotNone(monitor_thread)
        self.assertTrue(monitor_thread.is_alive())

        # now cancel doctor. monitor should stop too
        self.doctor.cancel()
        wait(lambda: not monitor_thread.is_alive())

    def _setup_resource_monitor(self):
        self.monitor = ExecutionResourceMonitor(self.core, self.store)
        return self.monitor

    def _send_heartbeat(self, resource_id, node_id, timestamp):
        self.core.ee_heartbeat(resource_id, make_beat(node_id, timestamp=timestamp))

    def assert_monitor_cycle(self, expected_delay, resource_states=None):
        self.assertEqual(expected_delay, self.monitor.monitor_cycle())

        if resource_states:
            for resource_id, expected_state in resource_states.iteritems():
                found_state = self.store.get_resource(resource_id).state
                if found_state != expected_state:
                    self.fail("Resource %s state = %s. Expected %s" % (resource_id,
                        found_state, expected_state))

    def test_resource_monitor(self):
        t0 = datetime(2012, 3, 13, 9, 30, 0, tzinfo=UTC)
        mock_now = Mock()
        mock_now.return_value = t0

        def increment_now(seconds):
            t = mock_now.return_value + timedelta(seconds=seconds)
            mock_now.return_value = t
            log.debug("THE TIME IS NOW: %s", t)
            return t

        monitor = self._setup_resource_monitor()
        monitor._now_func = mock_now

        # before there are any resources, monitor should work but return a None delay
        self.assertIsNone(monitor.monitor_cycle())

        self.core.node_state("node1", domain_id_from_engine("engine1"),
            InstanceState.RUNNING)

        # 3 resources. all report in at t0
        r1, r2, r3 = "eeagent_1", "eeagent_2", "eeagent_3"
        self._send_heartbeat(r1, "node1", t0)
        self._send_heartbeat(r2, "node1", t0)
        self._send_heartbeat(r3, "node1", t0)

        states = {r1: ExecutionResourceState.OK, r2: ExecutionResourceState.OK,
                  r3: ExecutionResourceState.OK}

        self.assert_monitor_cycle(10, states)

        t1 = increment_now(5)  # :05
        # heartbeat comes in for r1 5 seconds later
        self._send_heartbeat(r1, "node1", t1)

        self.assert_monitor_cycle(5, states)

        increment_now(5)  # :10

        # no heartbeats for r2 and r3. they should be marked WARNING
        states[r2] = ExecutionResourceState.WARNING
        states[r3] = ExecutionResourceState.WARNING
        self.assert_monitor_cycle(5, states)

        increment_now(4)  # :14

        # r2 gets a heartbeat through, but its timestamp still leaves it in the WARNING range
        self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=1))

        self.assert_monitor_cycle(1, states)

        increment_now(6)  # :20

        # r1 should go warning, r3 should go missing
        states[r1] = ExecutionResourceState.WARNING
        states[r3] = ExecutionResourceState.MISSING
        self.assert_monitor_cycle(4, states)

        t2 = increment_now(3)  # :23
        self._send_heartbeat(r1, "node1", t2)
        states[r1] = ExecutionResourceState.OK
        self.assert_monitor_cycle(1, states)

        t3 = increment_now(2)  # :25
        self._send_heartbeat(r3, "node1", t3)
        states[r2] = ExecutionResourceState.MISSING
        states[r3] = ExecutionResourceState.OK
        self.assert_monitor_cycle(8, states)

        increment_now(5)  # :30
        # heartbeat r2 enough to go back to WARNING, but still late
        self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=15))
        states[r2] = ExecutionResourceState.WARNING
        self.assert_monitor_cycle(3, states)

        t4 = increment_now(5)  # :35
        # disable r2 and heartbeat r1 and r3 (heartbeats arrive late, but that's ok)
        self._send_heartbeat(r1, "node1", t4)
        self._send_heartbeat(r3, "node1", t4)
        self.core.resource_change_state(self.store.get_resource(r2),
            ExecutionResourceState.DISABLED)

        states[r2] = ExecutionResourceState.DISABLED
        self.assert_monitor_cycle(10, states)
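
The resource-monitor test drives the heartbeat_warning=10 and heartbeat_missing=20 thresholds from engine_conf with a mocked clock. A simplified model of how a resource's state and the cycle's return delay relate to heartbeat age, useful for checking the expected values above (a sketch, not the actual ExecutionResourceMonitor logic):

def classify(age_seconds, warning=10, missing=20):
    """Return (state, seconds until the next automatic transition)."""
    if age_seconds >= missing:
        return "MISSING", None               # stays missing until a heartbeat arrives
    if age_seconds >= warning:
        return "WARNING", missing - age_seconds
    return "OK", warning - age_seconds

# The cycle delay is the minimum pending transition across resources: right
# after t0 every resource is 0 seconds old, so min(10, 10, 10) == 10, which
# matches the first assert_monitor_cycle(10, states) above.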