class ProcessDispatcherSimpleAPIClient(object):

    # State to use when state returned from PD is None
    unknown_state = "400-PENDING"

    state_map = {
        ProcessStateEnum.SPAWN: '500-RUNNING',
        ProcessStateEnum.TERMINATE: '700-TERMINATED',
        ProcessStateEnum.ERROR: '850-FAILED'
    }

    def __init__(self, name, **kwargs):
        self.real_client = ProcessDispatcherServiceClient(to_name=name, **kwargs)
        self.event_pub = EventPublisher()

    def dispatch_process(self, upid, spec, subscribers, constraints=None,
                         immediate=False):

        name = spec.get('name')
        self.event_pub.publish_event(event_type="ProcessLifecycleEvent",
            origin=name, origin_type="DispatchedHAProcess",
            state=ProcessStateEnum.SPAWN)
        process_def = ProcessDefinition(name=name)
        process_def.executable = {'module': spec.get('module'),
                'class': spec.get('class')}

        process_def_id = self.real_client.create_process_definition(process_def)

        pid = self.real_client.create_process(process_def_id)

        process_schedule = ProcessSchedule()

        sched_pid = self.real_client.schedule_process(process_def_id,
                process_schedule, configuration={}, process_id=pid)

        proc = self.real_client.read_process(sched_pid)
        dict_proc = {'upid': proc.process_id,
                'state': self.state_map.get(proc.process_state, self.unknown_state),
                }
        return dict_proc

    def terminate_process(self, pid):
        return self.real_client.cancel_process(pid)

    def describe_processes(self):
        procs = self.real_client.list_processes()
        dict_procs = []
        for proc in procs:
            dict_proc = {'upid': proc.process_id,
                    'state': self.state_map.get(proc.process_state, self.unknown_state),
                    }
            dict_procs.append(dict_proc)
        return dict_procs
class ProcessDispatcherServiceIntTest(IonIntegrationTestCase):

    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.rr_cli  = ResourceRegistryServiceClient()
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {'module': 'ion.services.cei.test.test_process_dispatcher',
                                              'class': 'TestProcess'}
        self.process_definition_id = self.pd_cli.create_process_definition(self.process_definition)

        self.waiter = ProcessStateWaiter()

    def tearDown(self):
        self.waiter.stop()

    def test_create_schedule_cancel(self):
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid)
        self.assertEqual(pid, pid2)

        # verifies L4-CI-CEI-RQ141 and L4-CI-CEI-RQ142
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.RUNNING)

        # make sure process is readable directly from RR (mirrored)
        # verifies L4-CI-CEI-RQ63
        # verifies L4-CI-CEI-RQ64
        proc = self.rr_cli.read(pid)
        self.assertEqual(proc.process_id, pid)

        # now try communicating with the process to make sure it is really running
        test_client = TestClient()
        for i in range(5):
            self.assertEqual(i + 1, test_client.count(timeout=10))

        # verifies L4-CI-CEI-RQ147

        # kill the process and start it again
        self.pd_cli.cancel_process(pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)
        self.waiter.stop()

        oldpid = pid

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid)
        self.assertEqual(pid, pid2)
        self.assertNotEqual(oldpid, pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        for i in range(5):
            self.assertEqual(i + 1, test_client.count(timeout=10))

        # kill the process for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def test_schedule_with_config(self):

        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # verifies L4-CI-CEI-RQ66

        # feed in a string that the process will return -- verifies that
        # configuration actually makes it to the instantiated process
        test_response = uuid.uuid4().hex
        configuration = {"test_response" : test_response}

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration=configuration, process_id=pid)
        self.assertEqual(pid, pid2)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        test_client = TestClient()

        # verifies L4-CI-CEI-RQ139
        # assure that configuration block (which can contain inputs, outputs,
        # and arbitrary config) 1) makes it to the process and 2) is returned
        # in process queries

        self.assertEqual(test_client.query(), test_response)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, configuration)

        # kill the process for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def test_schedule_bad_config(self):

        process_schedule = ProcessSchedule()

        # a non-JSON-serializable IonObject
        o = ProcessTarget()

        with self.assertRaises(BadRequest) as ar:
            self.pd_cli.schedule_process(self.process_definition_id,
                process_schedule, configuration={"bad": o})
        self.assertTrue(ar.exception.message.startswith("bad configuration"))

    def test_create_invalid_definition(self):
        # create process definition missing module and class
        # verifies L4-CI-CEI-RQ137
        executable = dict(url="http://somewhere.com/something.py")
        definition = ProcessDefinition(name="test_process", executable=executable)
        with self.assertRaises(BadRequest) as ar:
            self.pd_cli.create_process_definition(definition)
class ProcessDispatcherServiceIntTest(IonIntegrationTestCase):

    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {'module': 'ion.services.cei.test.test_process_dispatcher',
                                              'class': 'TestProcess'}
        self.process_definition_id = self.pd_cli.create_process_definition(self.process_definition)
        self.event_queue = queue.Queue()

        self.event_sub = None

    def tearDown(self):
        if self.event_sub:
            self.event_sub.stop()
        self._stop_container()

    def _event_callback(self, event, *args, **kwargs):
        self.event_queue.put(event)

    def subscribe_events(self, origin):
        self.event_sub = EventSubscriber(event_type="ProcessLifecycleEvent",
            callback=self._event_callback, origin=origin, origin_type="DispatchedProcess")
        self.event_sub.start()

    def await_state_event(self, pid, state):
        event = self.event_queue.get(timeout=5)
        log.debug("Got event: %s", event)
        self.assertEqual(event.origin, pid)
        self.assertEqual(event.state, state)
        return event

    def test_create_schedule_cancel(self):
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.subscribe_events(pid)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid)
        self.assertEqual(pid, pid2)

        self.await_state_event(pid, ProcessStateEnum.SPAWN)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.SPAWN)

        # now try communicating with the process to make sure it is really running
        test_client = TestClient()
        for i in range(5):
            self.assertEqual(i + 1, test_client.count(timeout=10))

        # kill the process and start it again
        self.pd_cli.cancel_process(pid)

        self.await_state_event(pid, ProcessStateEnum.TERMINATE)

        oldpid = pid

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.subscribe_events(pid)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid)
        self.assertEqual(pid, pid2)
        self.assertNotEqual(oldpid, pid)

        self.await_state_event(pid, ProcessStateEnum.SPAWN)

        for i in range(5):
            self.assertEqual(i + 1, test_client.count(timeout=10))

        # kill the process for good
        self.pd_cli.cancel_process(pid)
        self.await_state_event(pid, ProcessStateEnum.TERMINATE)

    def test_schedule_bad_config(self):

        process_schedule = ProcessSchedule()

        # a non-JSON-serializable IonObject
        o = ProcessTarget()

        with self.assertRaises(BadRequest) as ar:
            self.pd_cli.schedule_process(self.process_definition_id,
                process_schedule, configuration={"bad": o})
        self.assertTrue(ar.exception.message.startswith("bad configuration"))
class HAProcessControl(object):

    def __init__(self, pd_name, resource_registry, service_id, callback=None, logprefix=""):
        self.pd_name = pd_name
        self.resource_registry = resource_registry
        self.service_id = service_id
        self.callback = callback
        if callback and not callable(callback):
            raise ValueError("callback is not callable")
        self.logprefix = logprefix

        self.client = ProcessDispatcherServiceClient(to_name=pd_name)
        self.event_sub = EventSubscriber(event_type="ProcessLifecycleEvent",
            callback=self._event_callback, origin_type="DispatchedProcess",
            auto_delete=True)

        self.processes = {}

    def start(self):
        service = self.resource_registry.read(self.service_id)
        process_assocs = self.resource_registry.find_associations(service, "hasProcess")

        for process_assoc in process_assocs:
            process_id = process_assoc.o
            if process_id:
                try:
                    process = self.client.read_process(process_id)
                except NotFound:
                    log.debug("%sService was associated with process %s, which is unknown to PD. ignoring.",
                        self.logprefix, process_id)
                    continue

                state = process.process_state
                state_str = ProcessStateEnum._str_map.get(state, str(state))

                self.processes[process.process_id] = _process_dict_from_object(process)
                log.info("%srecovered process %s state=%s", self.logprefix, process_id, state_str)
        self.event_sub.start()

    def stop(self):
        self.event_sub.stop()

    def get_managed_upids(self):
        return self.processes.keys()

    def _event_callback(self, event, *args, **kwargs):
        if not event:
            return

        try:
            self._inner_event_callback(event)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            log.exception("%sException in event handler. This is a bug!", self.logprefix)

    def _inner_event_callback(self, event):
        process_id = event.origin
        state = event.state
        state_str = ProcessStateEnum._str_map.get(state, str(state))
        if not (process_id and process_id in self.processes):
            # we receive events for all processes but ignore most
            return

        process = None
        for _ in range(3):
            try:
                process = self.client.read_process(process_id)
                break
            except Timeout:
                log.warn("Timeout trying to read process from Process Dispatcher!", exc_info=True)
                pass  # retry
            except NotFound:
                break

        if process:
            log.info("%sreceived process %s state=%s", self.logprefix, process_id, state_str)

            # replace the cached data about this process
            self.processes[process_id] = _process_dict_from_object(process)

        else:
            log.warn("%sReceived process %s event but failed to read from Process Dispatcher",
                self.logprefix, process_id)

            #XXX better approach here? we at least have the state from the event,
            # so sticking that on cached process. We could miss other important
            # data like hostname however.
            self.processes[process_id]['state'] = process_state_to_pd_core(state)

        if self.callback:
            try:
                self.callback()
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                e = sys.exc_info()[0]
                log.warn("%sError in HAAgent callback: %s", self.logprefix, e, exc_info=True)

    def _associate_process(self, process):
        try:
            self.resource_registry.create_association(self.service_id,
                "hasProcess", process.process_id)
        except Exception:
            log.exception("Couldn't associate service %s to process %s" % (self.service_id, process.process_id))

    def schedule_process(self, pd_name, process_definition_id, **kwargs):

        if pd_name != self.pd_name:
            raise Exception("schedule_process request received for unknown PD: %s" % pd_name)

        # figure out if there is an existing PID which can be reused
        found_upid = None
        for process in self.processes.values():
            upid = process.get('upid')
            state = process.get('state')
            if not (upid and state):
                continue

            if state in CoreProcessState.TERMINAL_STATES:
                found_upid = upid

        if found_upid:
            upid = found_upid
            proc = self.client.read_process(upid)

        else:
            # otherwise create a new process and associate
            upid = self.client.create_process(process_definition_id)

            # note: if the HAAgent fails between the create call above and the
            # associate call below, there may be orphaned Process objects. These
            # processes will not however be running, so are largely harmless.
            proc = self.client.read_process(upid)
            self._associate_process(proc)

        process_schedule = _get_process_schedule(**kwargs)
        configuration = kwargs.get('configuration')

        # cheat and roll the process state to REQUESTED before we actually
        # schedule it. this is in-memory only, so should be harmless. This
        # avoids a race between this scheduling process and the event
        # subscriber.
        proc.process_state = ProcessStateEnum.REQUESTED
        self.processes[upid] = _process_dict_from_object(proc)

        self.client.schedule_process(process_definition_id, process_schedule,
            configuration=configuration, process_id=upid)
        return upid

    def terminate_process(self, pid):
        return self.client.cancel_process(pid)

    def get_all_processes(self):
        processes = deepcopy(self.processes.values())
        return {self.pd_name: processes}

    def reload_processes(self):
        for process_id, process_dict in self.processes.items():
            try:
                process = self.client.read_process(process_id)
            except Exception:
                log.warn("%sFailed to read process %s from PD. Will retry later.",
                    self.logprefix, process_id, exc_info=True)
                continue
            new_process_dict = _process_dict_from_object(process)
            if new_process_dict['state'] != process_dict['state']:
                log.warn("%sUpdating process %s record manually. we may have missed an event?",
                    self.logprefix, process_id)
                self.processes[process_id] = new_process_dict
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment
    """

    def setUp(self):
        self.dashi = None
        self._start_container()
        from pyon.public import CFG

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self.container = self.container_client._get_container_instance()

        app = dict(name="process_dispatcher", processapp=("process_dispatcher",
                               "ion.services.cei.process_dispatcher_service",
                               "ProcessDispatcherService"))
        self.container.start_app(app, config=pd_config)

        self.rr_cli = self.container.resource_registry

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {'module': 'ion.services.cei.test.test_process_dispatcher',
                                              'class': 'TestProcess'}
        self.process_definition_id = self.pd_cli.create_process_definition(self.process_definition)

        self._eea_pids = []
        self._eea_pid_to_resource_id = {}
        self._eea_pid_to_persistence_dir = {}
        self._tmpdirs = []

        self.dashi = get_dashi(uuid.uuid4().hex,
            pd_config['processdispatcher']['dashi_uri'],
            pd_config['processdispatcher']['dashi_exchange'],
            sysname=CFG.get_safe("dashi.sysname")
            )

        #send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._initial_eea_pid = self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        node_id = node_id or uuid.uuid4().hex
        node_state = dict(node_id=node_id, state=InstanceState.RUNNING,
            domain_id=domain_id_from_engine(engine_id))
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=node_state)

    def _start_eeagent(self, node_id, resource_id=None, persistence_dir=None):
        if not persistence_dir:
            persistence_dir = tempfile.mkdtemp()
            self._tmpdirs.append(persistence_dir)
        resource_id = resource_id or uuid.uuid4().hex
        agent_config = _get_eeagent_config(node_id, persistence_dir,
            resource_id=resource_id)
        pid = self.container_client.spawn_process(name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent", config=agent_config)
        log.info('Agent pid=%s.', str(pid))
        self._eea_pids.append(pid)
        self._eea_pid_to_resource_id[pid] = resource_id
        self._eea_pid_to_persistence_dir[pid] = persistence_dir
        return pid

    def _kill_eeagent(self, pid):
        self.assertTrue(pid in self._eea_pids)
        self.container.terminate_process(pid)
        self._eea_pids.remove(pid)
        del self._eea_pid_to_resource_id[pid]
        del self._eea_pid_to_persistence_dir[pid]

    def tearDown(self):
        for pid in list(self._eea_pids):
            self._kill_eeagent(pid)
        for d in self._tmpdirs:
            shutil.rmtree(d)

        self.waiter.stop()
        if self.dashi:
            self.dashi.cancel()

    def test_requested_ee(self):

        # request non-default engine

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52

        process_target = ProcessTarget(execution_engine_id="not-a-real-ee")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        rejected_pid = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=rejected_pid)

        self.waiter.await_state_event(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should leave
        # queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid2 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)

        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive

        process_target = ProcessTarget(execution_engine_id="engine2",
            node_exclusive="hats")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid3 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)

        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):

        # the node_exclusive constraint is used to ensure multiple processes
        # of the same "kind" each get a VM exclusive of each other. Other
        # processes may run on these VMs, just not processes with the same
        # node_exclusive tag. Since we cannot directly query the contents
        # of each node in this test, we prove the capability by scheduling
        # processes one by one and checking their state.

        # verifies L4-CI-CEI-RQ121
        # verifies L4-CI-CEI-RQ57

        # first off, setUp() created a single node and eeagent.
        # We schedule two processes with the same "abc" node_exclusive
        # tag. Since there is only one node, the first process should run
        # and the second should be queued.

        process_target = ProcessTarget(execution_engine_id="engine1")
        process_target.node_exclusive = "abc"
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid1 = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid1)

        self.waiter.await_state_event(pid1, ProcessStateEnum.RUNNING)

        pid2 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.WAITING)

        # now demonstrate that the node itself is not full by launching
        # a third process without a node_exclusive tag -- it should start
        # immediately

        process_target.node_exclusive = None
        pid3 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is an exclusive "abc" node free.
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine1", node2_id)
        self._start_eeagent(node2_id)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid1)
        self.waiter.await_state_event(pid1, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        # create a process definition that has no URL; only module and class.
        process_definition_no_url = ProcessDefinition(name='test_process_nodownload')
        process_definition_no_url.executable = {'module': 'ion.my.test.process',
                'class': 'TestProcess'}
        process_definition_id_no_url = self.pd_cli.create_process_definition(process_definition_no_url)

        # create another that has a URL of the python file (this very file)
        # verifies L4-CI-CEI-RQ114
        url = "file://%s" % os.path.join(os.path.dirname(__file__), 'test_process_dispatcher.py')
        process_definition = ProcessDefinition(name='test_process_download')
        process_definition.executable = {'module': 'ion.my.test.process',
                'class': 'TestProcess', 'url': url}
        process_definition_id = self.pd_cli.create_process_definition(process_definition)

        process_target = ProcessTarget()
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        self.waiter.start()

        # Test a module with no download fails
        pid_no_url = self.pd_cli.create_process(process_definition_id_no_url)

        self.pd_cli.schedule_process(process_definition_id_no_url,
            process_schedule, process_id=pid_no_url)

        self.waiter.await_state_event(pid_no_url, ProcessStateEnum.FAILED)

        # Test a module with a URL runs
        pid = self.pd_cli.create_process(process_definition_id)

        self.pd_cli.schedule_process(process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

    def _add_test_process(self, restart_mode=None):
        process_schedule = ProcessSchedule()
        if restart_mode is not None:
            process_schedule.restart_mode = restart_mode
        pid = self.pd_cli.create_process(self.process_definition_id)

        pid_listen_name = "PDtestproc_%s" % uuid.uuid4().hex
        config = {'process': {'listen_name': pid_listen_name}}

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid, configuration=config)

        client = TestClient(to_name=pid_listen_name)
        return pid, client

    def test_restart(self):
        self.waiter.start()

        restartable_pids = []
        nonrestartable_pids = []
        clients = {}
        # start 10 processes with RestartMode.ALWAYS
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.ALWAYS)
            restartable_pids.append(pid)
            clients[pid] = client

        # and 10 processes with RestartMode.ABNORMAL
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.ABNORMAL)
            restartable_pids.append(pid)
            clients[pid] = client

        # and 10 with RestartMode.NEVER
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.NEVER)
            nonrestartable_pids.append(pid)
            clients[pid] = client

        all_pids = restartable_pids + nonrestartable_pids

        self.waiter.await_many_state_events(all_pids, ProcessStateEnum.RUNNING)

        for pid in all_pids:
            client = clients[pid]
            self.assertFalse(client.is_restart())
            self.assertEqual(client.count(), 1)

        # now kill the whole eeagent and restart it. processes should
        # show up as FAILED in the next heartbeat.
        resource_id = self._eea_pid_to_resource_id[self._initial_eea_pid]
        persistence_dir = self._eea_pid_to_persistence_dir[self._initial_eea_pid]
        log.debug("Restarting eeagent %s", self._initial_eea_pid)
        self._kill_eeagent(self._initial_eea_pid)

        # manually kill the processes to simulate a real container failure
        for pid in all_pids:
            self.container.terminate_process(pid)

        self._start_eeagent(self.node1_id, resource_id=resource_id,
            persistence_dir=persistence_dir)

        # wait for restartables to restart
        self.waiter.await_many_state_events(restartable_pids, ProcessStateEnum.RUNNING)

        # query the processes again. it should have restart mode config
        for pid in restartable_pids:
            client = clients[pid]
            self.assertTrue(client.is_restart())
            self.assertEqual(client.count(), 1)

        # meanwhile some procs should not have restarted
        for pid in nonrestartable_pids:
            proc = self.pd_cli.read_process(pid)
            self.assertEqual(proc.process_state, ProcessStateEnum.FAILED)

        # guard against extraneous events we were receiving as part of a bug:
        # processes restarting again after they were already restarted
        self.waiter.await_nothing(timeout=5)

    def test_idempotency(self):
        # ensure every operation can be safely retried
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        proc_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # note: if we import UNSCHEDULED state into ProcessStateEnum,
        # this assertion will need to change.
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_state, ProcessStateEnum.REQUESTED)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid, name=proc_name)
        self.assertEqual(pid, pid2)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # repeating schedule is harmless
        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid, name=proc_name)
        self.assertEqual(pid, pid2)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.RUNNING)

        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        # repeating cancel is harmless
        self.pd_cli.cancel_process(pid)
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.TERMINATED)
Exemplo n.º 6
0
class Launcher(object):
    """
    Helper for launching platform and instrument agent processes.
    """

    def __init__(self, timeout_spawn):
        """
        @param timeout_spawn    Default timeout in secs for the RUNNING event.
        """
        self._timeout_spawn = timeout_spawn
        self._pd_client = ProcessDispatcherServiceClient()
        self._agent_launcher = AgentLauncher(self._pd_client)

    def destroy(self):
        if self._pd_client:
            self._pd_client.close()
            self._pd_client = None
            self._agent_launcher = None

    def launch_platform(self, agt_id, agent_config, timeout_spawn=None):
        """
        Launches a platform agent.

        @param agt_id           Some ID mainly used for logging
        @param agent_config     Agent configuration
        @param timeout_spawn    Timeout in secs for the RUNNING event (by
                                default, the value given in constructor).
                                If None or zero, no wait is performed.

        @return process ID
        """
        timeout_spawn = timeout_spawn or self._timeout_spawn
        log.debug("launch_platform: agt_id=%r, timeout_spawn=%s", agt_id, timeout_spawn)

        name = 'PlatformAgent_%s' % agt_id
        pdef = ProcessDefinition(name=name)
        pdef.executable = {
            'module': 'ion.agents.platform.platform_agent',
            'class':  'PlatformAgent'
        }

        pdef_id = self._pd_client.create_process_definition(process_definition=pdef)

        pid = self._agent_launcher.launch(agent_config, pdef_id)

        if timeout_spawn:
            log.debug("launch_platform: agt_id=%r: waiting for RUNNING", agt_id)
            self._agent_launcher.await_launch(timeout_spawn)
            log.debug("launch_platform: agt_id=%r: RUNNING", agt_id)

        return pid

    def launch_instrument(self, agt_id, agent_config, timeout_spawn=None):
        """
        Launches an instrument agent.

        @param agt_id           Some ID mainly used for logging
        @param agent_config     Agent configuration
        @param timeout_spawn    Timeout in secs for the RUNNING event (by
                                default, the value given in constructor).
                                If None or zero, no wait is performed.

        @return process ID
        """
        timeout_spawn = timeout_spawn or self._timeout_spawn
        log.debug("launch_instrument: agt_id=%r, timeout_spawn=%s", agt_id, timeout_spawn)

        name = 'InstrumentAgent_%s' % agt_id
        pdef = ProcessDefinition(name=name)
        pdef.executable = {
            'module': 'ion.agents.instrument.instrument_agent',
            'class':  'InstrumentAgent'
        }

        pdef_id = self._pd_client.create_process_definition(process_definition=pdef)

        pid = self._agent_launcher.launch(agent_config, pdef_id)

        if timeout_spawn:
            log.debug("launch_instrument: agt_id=%r: waiting for RUNNING", agt_id)
            self._agent_launcher.await_launch(timeout_spawn)
            log.debug("launch_instrument: agt_id=%r: RUNNING", agt_id)

        return pid

    def cancel_process(self, pid, timeout_cancel=None):
        """
        Helper to terminate a process
        """
        pinfo = self._pd_client.read_process(pid)
        if pinfo.process_state != ProcessStateEnum.RUNNING:
            log.debug("cancel_process: pid=%r is not RUNNING", pid)
            return

        log.debug("cancel_process: canceling pid=%r", pid)
        self._pd_client.cancel_process(pid)

        if timeout_cancel:
            log.debug("waiting %s seconds for preocess to cancel", timeout_cancel)
            psg = ProcessStateGate(self._pd_client.read_process, pid,
                                   ProcessStateEnum.TERMINATED)
            if not psg.await(timeout_cancel):
                log.debug("Process %r failed to get to TERMINATED in %s seconds",
                          pid, timeout_cancel)
class HAProcessControl(object):

    def __init__(self, pd_name, resource_registry, service_id, callback=None, logprefix=""):
        self.pd_name = pd_name
        self.resource_registry = resource_registry
        self.service_id = service_id
        self.callback = callback
        if callback and not callable(callback):
            raise ValueError("callback is not callable")
        self.logprefix = logprefix

        self.client = ProcessDispatcherServiceClient(to_name=pd_name)
        self.event_sub = EventSubscriber(event_type="ProcessLifecycleEvent",
            callback=self._event_callback, origin_type="DispatchedProcess",
            auto_delete=True)

        self.processes = {}

    def start(self):
        service = self.resource_registry.read(self.service_id)
        process_assocs = self.resource_registry.find_associations(service, "hasProcess")

        for process_assoc in process_assocs:
            process_id = process_assoc.o
            if process_id:
                try:
                    process = self.client.read_process(process_id)
                except NotFound:
                    log.debug("%sService was associated with process %s, which is unknown to PD. ignoring.",
                        self.logprefix, process_id)
                    continue

                state = process.process_state
                state_str = ProcessStateEnum._str_map.get(state, str(state))

                self.processes[process.process_id] = _process_dict_from_object(process)
                log.info("%srecovered process %s state=%s", self.logprefix, process_id, state_str)
        self.event_sub.start()

    def stop(self):
        self.event_sub.stop()

    def get_managed_upids(self):
        return self.processes.keys()

    def _event_callback(self, event, *args, **kwargs):
        if not event:
            return

        try:
            self._inner_event_callback(event)
        except Exception:
            log.exception("%sException in event handler. This is a bug!", self.logprefix)

    def _inner_event_callback(self, event):
        process_id = event.origin
        state = event.state
        state_str = ProcessStateEnum._str_map.get(state, str(state))
        if not (process_id and process_id in self.processes):
            # we receive events for all processes but ignore most
            return

        process = None
        for _ in range(3):
            try:
                process = self.client.read_process(process_id)
                break
            except Timeout:
                log.warn("Timeout trying to read process from Process Dispatcher!", exc_info=True)
                pass  # retry
            except NotFound:
                break

        if process:
            log.info("%sreceived process %s state=%s", self.logprefix, process_id, state_str)

            # replace the cached data about this process
            self.processes[process_id] = _process_dict_from_object(process)

        else:
            log.warn("%sReceived process %s event but failed to read from Process Dispatcher",
                self.logprefix, process_id)

            #XXX better approach here? we at least have the state from the event,
            # so sticking that on cached process. We could miss other important
            # data like hostname however.
            self.processes[process_id]['state'] = process_state_to_pd_core(state)

        if self.callback:
            try:
                self.callback()
            except Exception, e:
                log.warn("%sError in HAAgent callback: %s", self.logprefix, e, exc_info=True)
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment
    """
    def setUp(self):
        self.dashi = None
        self._start_container()
        from pyon.public import CFG

        self.container_client = ContainerAgentClient(node=self.container.node,
                                                     name=self.container.name)
        self.container = self.container_client._get_container_instance()

        app = dict(name="process_dispatcher",
                   processapp=("process_dispatcher",
                               "ion.services.cei.process_dispatcher_service",
                               "ProcessDispatcherService"))
        self.container.start_app(app, config=pd_config)

        self.rr_cli = self.container.resource_registry

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess'
        }
        self.process_definition_id = self.pd_cli.create_process_definition(
            self.process_definition)

        self._eea_pids = []
        self._eea_pid_to_resource_id = {}
        self._eea_pid_to_persistence_dir = {}
        self._tmpdirs = []

        self.dashi = get_dashi(
            uuid.uuid4().hex,
            pd_config['processdispatcher']['dashi_uri'],
            pd_config['processdispatcher']['dashi_exchange'],
            sysname=CFG.get_safe("dashi.sysname"))

        #send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._initial_eea_pid = self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        node_id = node_id or uuid.uuid4().hex
        node_state = dict(node_id=node_id,
                          state=InstanceState.RUNNING,
                          domain_id=domain_id_from_engine(engine_id))
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=node_state)

    def _start_eeagent(self, node_id, resource_id=None, persistence_dir=None):
        if not persistence_dir:
            persistence_dir = tempfile.mkdtemp()
            self._tmpdirs.append(persistence_dir)
        resource_id = resource_id or uuid.uuid4().hex
        agent_config = _get_eeagent_config(node_id,
                                           persistence_dir,
                                           resource_id=resource_id)
        pid = self.container_client.spawn_process(
            name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=agent_config)
        log.info('Agent pid=%s.', str(pid))
        self._eea_pids.append(pid)
        self._eea_pid_to_resource_id[pid] = resource_id
        self._eea_pid_to_persistence_dir[pid] = persistence_dir
        return pid

    def _kill_eeagent(self, pid):
        self.assertTrue(pid in self._eea_pids)
        self.container.terminate_process(pid)
        self._eea_pids.remove(pid)
        del self._eea_pid_to_resource_id[pid]
        del self._eea_pid_to_persistence_dir[pid]

    def tearDown(self):
        for pid in list(self._eea_pids):
            self._kill_eeagent(pid)
        for d in self._tmpdirs:
            shutil.rmtree(d)

        self.waiter.stop()
        if self.dashi:
            self.dashi.cancel()

    def test_requested_ee(self):

        # request non-default engine

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
                                     process_schedule,
                                     process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52

        process_target = ProcessTarget(execution_engine_id="not-a-real-ee")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        rejected_pid = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
                                     process_schedule,
                                     process_id=rejected_pid)

        self.waiter.await_state_event(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should leave
        # queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid2 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
                                     process_schedule,
                                     process_id=pid2)

        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive

        process_target = ProcessTarget(execution_engine_id="engine2",
                                       node_exclusive="hats")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid3 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
                                     process_schedule,
                                     process_id=pid3)

        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):

        # the node_exclusive constraint is used to ensure multiple processes
        # of the same "kind" each get a VM exclusive of each other. Other
        # processes may run on these VMs, just not processes with the same
        # node_exclusive tag. Since we cannot directly query the contents
        # of each node in this test, we prove the capability by scheduling
        # processes one by one and checking their state.

        # verifies L4-CI-CEI-RQ121
        # verifies L4-CI-CEI-RQ57

        # first off, setUp() created a single node and eeagent.
        # We schedule two processes with the same "abc" node_exclusive
        # tag. Since there is only one node, the first process should run
        # and the second should be queued.

        process_target = ProcessTarget(execution_engine_id="engine1")
        process_target.node_exclusive = "abc"
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid1 = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
                                     process_schedule,
                                     process_id=pid1)

        self.waiter.await_state_event(pid1, ProcessStateEnum.RUNNING)

        pid2 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
                                     process_schedule,
                                     process_id=pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.WAITING)

        # now demonstrate that the node itself is not full by launching
        # a third process without a node_exclusive tag -- it should start
        # immediately

        process_target.node_exclusive = None
        pid3 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
                                     process_schedule,
                                     process_id=pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is an exclusive "abc" node free.
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine1", node2_id)
        self._start_eeagent(node2_id)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid1)
        self.waiter.await_state_event(pid1, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        # create a process definition that has no URL; only module and class.
        process_definition_no_url = ProcessDefinition(
            name='test_process_nodownload')
        process_definition_no_url.executable = {
            'module': 'ion.my.test.process',
            'class': 'TestProcess'
        }
        process_definition_id_no_url = self.pd_cli.create_process_definition(
            process_definition_no_url)

        # create another that has a URL of the python file (this very file)
        # verifies L4-CI-CEI-RQ114
        url = "file://%s" % os.path.join(os.path.dirname(__file__),
                                         'test_process_dispatcher.py')
        process_definition = ProcessDefinition(name='test_process_download')
        process_definition.executable = {
            'module': 'ion.my.test.process',
            'class': 'TestProcess',
            'url': url
        }
        process_definition_id = self.pd_cli.create_process_definition(
            process_definition)

        process_target = ProcessTarget()
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        self.waiter.start()

        # Test a module with no download fails
        pid_no_url = self.pd_cli.create_process(process_definition_id_no_url)

        self.pd_cli.schedule_process(process_definition_id_no_url,
                                     process_schedule,
                                     process_id=pid_no_url)

        self.waiter.await_state_event(pid_no_url, ProcessStateEnum.FAILED)

        # Test a module with a URL runs
        pid = self.pd_cli.create_process(process_definition_id)

        self.pd_cli.schedule_process(process_definition_id,
                                     process_schedule,
                                     process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

    def _add_test_process(self, restart_mode=None):
        process_schedule = ProcessSchedule()
        if restart_mode is not None:
            process_schedule.restart_mode = restart_mode
        pid = self.pd_cli.create_process(self.process_definition_id)

        pid_listen_name = "PDtestproc_%s" % uuid.uuid4().hex
        config = {'process': {'listen_name': pid_listen_name}}

        self.pd_cli.schedule_process(self.process_definition_id,
                                     process_schedule,
                                     process_id=pid,
                                     configuration=config)

        client = TestClient(to_name=pid_listen_name)
        return pid, client

    def test_restart(self):
        self.waiter.start()

        restartable_pids = []
        nonrestartable_pids = []
        clients = {}
        # start 10 processes with RestartMode.ALWAYS
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.ALWAYS)
            restartable_pids.append(pid)
            clients[pid] = client

        # and 10 processes with RestartMode.ABNORMAL
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.ABNORMAL)
            restartable_pids.append(pid)
            clients[pid] = client

        # and 10 with RestartMode.NEVER
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.NEVER)
            nonrestartable_pids.append(pid)
            clients[pid] = client

        all_pids = restartable_pids + nonrestartable_pids

        self.waiter.await_many_state_events(all_pids, ProcessStateEnum.RUNNING)

        for pid in all_pids:
            client = clients[pid]
            self.assertFalse(client.is_restart())
            self.assertEqual(client.count(), 1)

        # now kill the whole eeagent and restart it. processes should
        # show up as FAILED in the next heartbeat.
        resource_id = self._eea_pid_to_resource_id[self._initial_eea_pid]
        persistence_dir = self._eea_pid_to_persistence_dir[
            self._initial_eea_pid]
        log.debug("Restarting eeagent %s", self._initial_eea_pid)
        self._kill_eeagent(self._initial_eea_pid)

        # manually kill the processes to simulate a real container failure
        for pid in all_pids:
            self.container.terminate_process(pid)

        self._start_eeagent(self.node1_id,
                            resource_id=resource_id,
                            persistence_dir=persistence_dir)

        # wait for restartables to restart
        self.waiter.await_many_state_events(restartable_pids,
                                            ProcessStateEnum.RUNNING)

        # query the processes again. it should have restart mode config
        for pid in restartable_pids:
            client = clients[pid]
            self.assertTrue(client.is_restart())
            self.assertEqual(client.count(), 1)

        # meanwhile some procs should not have restarted
        for pid in nonrestartable_pids:
            proc = self.pd_cli.read_process(pid)
            self.assertEqual(proc.process_state, ProcessStateEnum.FAILED)

        # guard against extraneous events we were receiving as part of a bug:
        # processes restarting again after they were already restarted
        self.waiter.await_nothing(timeout=5)

    def test_idempotency(self):
        # ensure every operation can be safely retried
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        proc_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # note: if we import UNSCHEDULED state into ProcessStateEnum,
        # this assertion will need to change.
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_state, ProcessStateEnum.REQUESTED)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
                                            process_schedule,
                                            configuration={},
                                            process_id=pid,
                                            name=proc_name)
        self.assertEqual(pid, pid2)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # repeating schedule is harmless
        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
                                            process_schedule,
                                            configuration={},
                                            process_id=pid,
                                            name=proc_name)
        self.assertEqual(pid, pid2)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.RUNNING)

        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        # repeating cancel is harmless
        self.pd_cli.cancel_process(pid)
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.TERMINATED)
class ProcessDispatcherServiceIntTest(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.rr_cli = ResourceRegistryServiceClient()
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess'
        }
        self.process_definition_id = self.pd_cli.create_process_definition(
            self.process_definition)

        self.waiter = ProcessStateWaiter()

    def tearDown(self):
        self.waiter.stop()

    def test_create_schedule_cancel(self):
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        proc_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
                                            process_schedule,
                                            configuration={},
                                            process_id=pid,
                                            name=proc_name)
        self.assertEqual(pid, pid2)

        # verifies L4-CI-CEI-RQ141 and L4-CI-CEI-RQ142
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.RUNNING)

        # make sure process is readable directly from RR (mirrored)
        # verifies L4-CI-CEI-RQ63
        # verifies L4-CI-CEI-RQ64
        proc = self.rr_cli.read(pid)
        self.assertEqual(proc.process_id, pid)

        # now try communicating with the process to make sure it is really running
        test_client = TestClient()
        for i in range(5):
            self.assertEqual(i + 1, test_client.count(timeout=10))

        # verifies L4-CI-CEI-RQ147

        # check the process name was set in container
        got_proc_name = test_client.get_process_name(pid=pid2)
        self.assertEqual(proc_name, got_proc_name)

        # kill the process and start it again
        self.pd_cli.cancel_process(pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
                                            process_schedule,
                                            configuration={},
                                            process_id=pid)
        self.assertEqual(pid, pid2)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        for i in range(5):
            self.assertEqual(i + 1, test_client.count(timeout=10))

        # kill the process for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def test_schedule_with_config(self):

        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # verifies L4-CI-CEI-RQ66

        # feed in a string that the process will return -- verifies that
        # configuration actually makes it to the instantiated process
        test_response = uuid.uuid4().hex
        configuration = {"test_response": test_response}

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
                                            process_schedule,
                                            configuration=configuration,
                                            process_id=pid)
        self.assertEqual(pid, pid2)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        test_client = TestClient()

        # verifies L4-CI-CEI-RQ139
        # assure that configuration block (which can contain inputs, outputs,
        # and arbitrary config) 1) makes it to the process and 2) is returned
        # in process queries

        self.assertEqual(test_client.query(), test_response)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, configuration)

        # kill the process for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def test_schedule_bad_config(self):

        process_schedule = ProcessSchedule()

        # a non-JSON-serializable IonObject
        o = ProcessTarget()

        with self.assertRaises(BadRequest) as ar:
            self.pd_cli.schedule_process(self.process_definition_id,
                                         process_schedule,
                                         configuration={"bad": o})
        self.assertTrue(ar.exception.message.startswith("bad configuration"))

    def test_cancel_notfound(self):
        with self.assertRaises(NotFound):
            self.pd_cli.cancel_process("not-a-real-process-id")

    def test_create_invalid_definition(self):
        # create process definition missing module and class
        # verifies L4-CI-CEI-RQ137
        executable = dict(url="http://somewhere.com/something.py")
        definition = ProcessDefinition(name="test_process",
                                       executable=executable)
        with self.assertRaises(BadRequest):
            self.pd_cli.create_process_definition(definition)
class HAProcessControl(object):
    def __init__(self,
                 pd_name,
                 resource_registry,
                 service_id,
                 callback=None,
                 logprefix=""):
        self.pd_name = pd_name
        self.resource_registry = resource_registry
        self.service_id = service_id
        self.callback = callback
        if callback and not callable(callback):
            raise ValueError("callback is not callable")
        self.logprefix = logprefix

        self.client = ProcessDispatcherServiceClient(to_name=pd_name)
        self.event_sub = EventSubscriber(event_type="ProcessLifecycleEvent",
                                         callback=self._event_callback,
                                         origin_type="DispatchedProcess",
                                         auto_delete=True)

        self.processes = {}

    def start(self):
        service = self.resource_registry.read(self.service_id)
        process_assocs = self.resource_registry.find_associations(
            service, "hasProcess")

        for process_assoc in process_assocs:
            process_id = process_assoc.o
            if process_id:
                try:
                    process = self.client.read_process(process_id)
                except NotFound:
                    log.debug(
                        "%sService was associated with process %s, which is unknown to PD. ignoring.",
                        self.logprefix, process_id)
                    continue

                state = process.process_state
                state_str = ProcessStateEnum._str_map.get(state, str(state))

                self.processes[process.process_id] = _process_dict_from_object(
                    process)
                log.info("%srecovered process %s state=%s", self.logprefix,
                         process_id, state_str)
        self.event_sub.start()

    def stop(self):
        self.event_sub.stop()

    def get_managed_upids(self):
        return self.processes.keys()

    def _event_callback(self, event, *args, **kwargs):
        if not event:
            return

        try:
            self._inner_event_callback(event)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            log.exception("%sException in event handler. This is a bug!",
                          self.logprefix)

    def _inner_event_callback(self, event):
        process_id = event.origin
        state = event.state
        state_str = ProcessStateEnum._str_map.get(state, str(state))
        if not (process_id and process_id in self.processes):
            # we receive events for all processes but ignore most
            return

        process = None
        for _ in range(3):
            try:
                process = self.client.read_process(process_id)
                break
            except Timeout:
                log.warn(
                    "Timeout trying to read process from Process Dispatcher!",
                    exc_info=True)
                pass  # retry
            except NotFound:
                break

        if process:
            log.info("%sreceived process %s state=%s", self.logprefix,
                     process_id, state_str)

            # replace the cached data about this process
            self.processes[process_id] = _process_dict_from_object(process)

        else:
            log.warn(
                "%sReceived process %s event but failed to read from Process Dispatcher",
                self.logprefix, process_id)

            #XXX better approach here? we at least have the state from the event,
            # so sticking that on cached process. We could miss other important
            # data like hostname however.
            self.processes[process_id]['state'] = process_state_to_pd_core(
                state)

        if self.callback:
            try:
                self.callback()
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                e = sys.exc_info()[0]
                log.warn("%sError in HAAgent callback: %s",
                         self.logprefix,
                         e,
                         exc_info=True)

    def _associate_process(self, process):
        try:
            self.resource_registry.create_association(self.service_id,
                                                      "hasProcess",
                                                      process.process_id)
        except Exception:
            log.exception("Couldn't associate service %s to process %s" %
                          (self.service_id, process.process_id))

    def schedule_process(self, pd_name, process_definition_id, **kwargs):

        if pd_name != self.pd_name:
            raise Exception(
                "schedule_process request received for unknown PD: %s" %
                pd_name)

        # figure out if there is an existing PID which can be reused
        found_upid = None
        for process in self.processes.values():
            upid = process.get('upid')
            state = process.get('state')
            if not (upid and state):
                continue

            if state in CoreProcessState.TERMINAL_STATES:
                found_upid = upid

        if found_upid:
            upid = found_upid
            proc = self.client.read_process(upid)

        else:
            # otherwise create a new process and associate
            upid = self.client.create_process(process_definition_id)

            # note: if the HAAgent fails between the create call above and the
            # associate call below, there may be orphaned Process objects. These
            # processes will not however be running, so are largely harmless.
            proc = self.client.read_process(upid)
            self._associate_process(proc)

        process_schedule = _get_process_schedule(**kwargs)
        configuration = kwargs.get('configuration')

        # cheat and roll the process state to REQUESTED before we actually
        # schedule it. this is in-memory only, so should be harmless. This
        # avoids a race between this scheduling process and the event
        # subscriber.
        proc.process_state = ProcessStateEnum.REQUESTED
        self.processes[upid] = _process_dict_from_object(proc)

        self.client.schedule_process(process_definition_id,
                                     process_schedule,
                                     configuration=configuration,
                                     process_id=upid)
        return upid

    def terminate_process(self, pid):
        return self.client.cancel_process(pid)

    def get_all_processes(self):
        processes = deepcopy(self.processes.values())
        return {self.pd_name: processes}

    def reload_processes(self):
        for process_id, process_dict in self.processes.items():
            try:
                process = self.client.read_process(process_id)
            except Exception:
                log.warn(
                    "%sFailed to read process %s from PD. Will retry later.",
                    self.logprefix,
                    process_id,
                    exc_info=True)
                continue
            new_process_dict = _process_dict_from_object(process)
            if new_process_dict['state'] != process_dict['state']:
                log.warn(
                    "%sUpdating process %s record manually. we may have missed an event?",
                    self.logprefix, process_id)
                self.processes[process_id] = new_process_dict
class HAProcessControl(object):

    def __init__(self, pd_name, resource_registry, service_id, callback=None, logprefix=""):
        self.pd_name = pd_name
        self.resource_registry = resource_registry
        self.service_id = service_id
        self.callback = callback
        if callback and not callable(callback):
            raise ValueError("callback is not callable")
        self.logprefix = logprefix

        self.client = ProcessDispatcherServiceClient(to_name=pd_name)
        self.event_sub = EventSubscriber(event_type="ProcessLifecycleEvent",
            callback=self._event_callback, origin_type="DispatchedProcess",
            auto_delete=True)

        self.processes = {}

    def start(self):
        service = self.resource_registry.read(self.service_id)
        process_assocs = self.resource_registry.find_associations(service, "hasProcess")

        for process_assoc in process_assocs:
            process_id = process_assoc.o
            if process_id:
                try:
                    process = self.client.read_process(process_id)
                except NotFound:
                    log.debug("%sService was associated with process %s, which is unknown to PD. ignoring.",
                        self.logprefix, process_id)
                    continue

                state = process.process_state
                state_str = ProcessStateEnum._str_map.get(state, str(state))

                self.processes[process.process_id] = _process_dict_from_object(process)
                log.info("%srecovered process %s state=%s", self.logprefix, process_id, state_str)
        self.event_sub.start()

    def stop(self):
        self.event_sub.stop()

    def get_managed_upids(self):
        return self.processes.keys()

    def _event_callback(self, event, *args, **kwargs):
        if not event:
            return

        try:
            self._inner_event_callback(event)
        except Exception:
            log.exception("%sException in event handler. This is a bug!", self.logprefix)

    def _inner_event_callback(self, event):
        process_id = event.origin
        state = event.state
        state_str = ProcessStateEnum._str_map.get(state, str(state))
        if not (process_id and process_id in self.processes):
            # we receive events for all processes but ignore most
            return

        process = None
        for _ in range(3):
            try:
                process = self.client.read_process(process_id)
                break
            except Timeout:
                log.warn("Timeout trying to read process from Process Dispatcher!", exc_info=True)
                pass  # retry
            except NotFound:
                break

        if process:
            log.info("%sreceived process %s state=%s", self.logprefix, process_id, state_str)

            # replace the cached data about this process
            self.processes[process_id] = _process_dict_from_object(process)

        else:
            log.warn("%sReceived process %s event but failed to read from Process Dispatcher",
                self.logprefix, process_id)

            #XXX better approach here? we at least have the state from the event,
            # so sticking that on cached process. We could miss other important
            # data like hostname however.
            self.processes[process_id]['state'] = process_state_to_pd_core(state)

        if self.callback:
            try:
                self.callback()
            except Exception, e:
                log.warn("%sError in HAAgent callback: %s", self.logprefix, e, exc_info=True)
Exemplo n.º 12
0
class Launcher(object):
    """
    Helper for launching platform and instrument agent processes.
    """
    def __init__(self, timeout_spawn):
        """
        @param timeout_spawn    Default timeout in secs for the RUNNING event.
        """
        self._timeout_spawn = timeout_spawn
        self._pd_client = ProcessDispatcherServiceClient()
        self._agent_launcher = AgentLauncher(self._pd_client)

    def destroy(self):
        if self._pd_client:
            self._pd_client.close()
            self._pd_client = None
            self._agent_launcher = None

    def launch_platform(self, agt_id, agent_config, timeout_spawn=None):
        """
        Launches a platform agent.

        @param agt_id           Some ID mainly used for logging
        @param agent_config     Agent configuration
        @param timeout_spawn    Timeout in secs for the RUNNING event (by
                                default, the value given in constructor).
                                If None or zero, no wait is performed.

        @return process ID
        """
        timeout_spawn = timeout_spawn or self._timeout_spawn
        log.debug("launch_platform: agt_id=%r, timeout_spawn=%s", agt_id,
                  timeout_spawn)

        name = 'PlatformAgent_%s' % agt_id
        pdef = ProcessDefinition(name=name)
        pdef.executable = {
            'module': 'ion.agents.platform.platform_agent',
            'class': 'PlatformAgent'
        }

        pdef_id = self._pd_client.create_process_definition(
            process_definition=pdef)

        pid = self._agent_launcher.launch(agent_config, pdef_id)

        if timeout_spawn:
            log.debug("launch_platform: agt_id=%r: waiting for RUNNING",
                      agt_id)
            self._agent_launcher.await_launch(timeout_spawn)
            log.debug("launch_platform: agt_id=%r: RUNNING", agt_id)

        return pid

    def launch_instrument(self, agt_id, agent_config, timeout_spawn=None):
        """
        Launches an instrument agent.

        @param agt_id           Some ID mainly used for logging
        @param agent_config     Agent configuration
        @param timeout_spawn    Timeout in secs for the RUNNING event (by
                                default, the value given in constructor).
                                If None or zero, no wait is performed.

        @return process ID
        """
        timeout_spawn = timeout_spawn or self._timeout_spawn
        log.debug("launch_instrument: agt_id=%r, timeout_spawn=%s", agt_id,
                  timeout_spawn)

        name = 'InstrumentAgent_%s' % agt_id
        pdef = ProcessDefinition(name=name)
        pdef.executable = {
            'module': 'ion.agents.instrument.instrument_agent',
            'class': 'InstrumentAgent'
        }

        pdef_id = self._pd_client.create_process_definition(
            process_definition=pdef)

        pid = self._agent_launcher.launch(agent_config, pdef_id)

        if timeout_spawn:
            log.debug("launch_instrument: agt_id=%r: waiting for RUNNING",
                      agt_id)
            self._agent_launcher.await_launch(timeout_spawn)
            log.debug("launch_instrument: agt_id=%r: RUNNING", agt_id)

        return pid

    def cancel_process(self, pid, timeout_cancel=None):
        """
        Helper to terminate a process
        """
        pinfo = self._pd_client.read_process(pid)
        if pinfo.process_state != ProcessStateEnum.RUNNING:
            log.debug("cancel_process: pid=%r is not RUNNING", pid)
            return

        log.debug("cancel_process: canceling pid=%r", pid)
        self._pd_client.cancel_process(pid)

        if timeout_cancel:
            log.debug("waiting %s seconds for preocess to cancel",
                      timeout_cancel)
            psg = ProcessStateGate(self._pd_client.read_process, pid,
                                   ProcessStateEnum.TERMINATED)
            if not psg. await (timeout_cancel):
                log.debug(
                    "Process %r failed to get to TERMINATED in %s seconds",
                    pid, timeout_cancel)