class ProcessDispatcherSimpleAPIClient(object):

    # State to use when state returned from PD is None
    unknown_state = "400-PENDING"

    state_map = {
        ProcessStateEnum.SPAWN: '500-RUNNING',
        ProcessStateEnum.TERMINATE: '700-TERMINATED',
        ProcessStateEnum.ERROR: '850-FAILED'
    }

    def __init__(self, name, **kwargs):
        self.real_client = ProcessDispatcherServiceClient(to_name=name, **kwargs)
        self.event_pub = EventPublisher()

    def dispatch_process(self, upid, spec, subscribers, constraints=None,
                         immediate=False):

        name = spec.get('name')
        self.event_pub.publish_event(event_type="ProcessLifecycleEvent",
            origin=name, origin_type="DispatchedHAProcess",
            state=ProcessStateEnum.SPAWN)
        process_def = ProcessDefinition(name=name)
        process_def.executable = {'module': spec.get('module'),
                'class': spec.get('class')}

        process_def_id = self.real_client.create_process_definition(process_def)

        pid = self.real_client.create_process(process_def_id)

        process_schedule = ProcessSchedule()

        sched_pid = self.real_client.schedule_process(process_def_id,
                process_schedule, configuration={}, process_id=pid)

        proc = self.real_client.read_process(sched_pid)
        dict_proc = {'upid': proc.process_id,
                'state': self.state_map.get(proc.process_state, self.unknown_state),
                }
        return dict_proc

    def terminate_process(self, pid):
        return self.real_client.cancel_process(pid)

    def describe_processes(self):
        procs = self.real_client.list_processes()
        dict_procs = []
        for proc in procs:
            dict_proc = {'upid': proc.process_id,
                    'state': self.state_map.get(proc.process_state, self.unknown_state),
                    }
            dict_procs.append(dict_proc)
        return dict_procs
示例#2
0
class BaseHighAvailabilityAgentTest(IonIntegrationTestCase):

    @needs_epu
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        #self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test_haagent_%s' % self.process_definition_id
        self.process_definition = ProcessDefinition(name=self.process_definition_name, executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
        })
        self.pd_cli.create_process_definition(self.process_definition, self.process_definition_id)

        service_definition = SERVICE_DEFINITION_TMPL % self.process_definition_name
        sd = IonObject(RT.ServiceDefinition, {"name": self.process_definition_name,
            "definition": service_definition})
        self.service_def_id, _ = self.container.resource_registry.create(sd)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "hatests"
        self._haa_config = self._get_haagent_config()

        self._base_services, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)

        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self._spawn_haagent()

        self._setup_haa_client()

    def _get_haagent_config(self):
        return {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'npreserving',
                    'parameters': {
                        'preserve_n': 0
                    }
                },
                'process_definition_id': self.process_definition_id,
                'dashi_messaging': True,
                'dashi_exchange': self._haa_dashi_exchange,
                'dashi_name': self._haa_dashi_name
            },
            'agent': {'resource_id': self.resource_id},
        }

    def _setup_haa_client(self):
        # Start a resource agent client to talk with the instrument agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

    def _spawn_haagent(self, policy_parameters=None):

        config = deepcopy(self._haa_config)
        if policy_parameters is not None:
            config['highavailability']['policy']['parameters'] = policy_parameters
        self._haa_pid = self.container_client.spawn_process(name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent", config=config)

    def _kill_haagent(self):
        self.container.terminate_process(self._haa_pid)

    def tearDown(self):


        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.assertEqual(len(self.get_running_procs()), 0)
        self.await_ha_state('STEADY')

        self.waiter.stop()
        try:
            self._kill_haagent()
        except BadRequest:
            log.warning("Couldn't terminate HA Agent in teardown (May have been terminated by a test)")
        self.container.resource_registry.delete(self.service_def_id, del_associations=True)
        self._stop_container()

    def get_running_procs(self):
        """returns a normalized set of running procs (removes the ones that
        were there at setup time)
        """

        base = self._base_procs
        base_pids = [proc.process_id for proc in base]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        print "filtering base procs %s from %s" % (base_pids, current_pids)
        normal = [cproc for cproc in current if cproc.process_id not in base_pids and cproc.process_state == ProcessStateEnum.RUNNING]
        return normal

    def get_new_services(self):

        base = self._base_services
        base_names = [i.name for i in base]
        services_registered, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)
        normal = [cserv for cserv in services_registered if cserv.name not in base_names]
        return normal

    def await_ha_state(self, want_state, timeout=20):

        for i in range(0, timeout):
            try:
                status = self.haa_client.status().result
                if status == want_state:
                    return
                else:
                    procs = self.get_running_procs()
                    num_procs = len(procs)
                    log.debug("assert wants state %s, got state %s, with %s procs" % (want_state,status, num_procs))
            except Exception:
                log.exception("Problem getting HA status, trying again...")
                gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" % (timeout, want_state))

    def await_pyon_ha_state(self, want_state, timeout=20):
        for i in range(0, timeout):
            try:
                result = self.haa_client.dump().result
                service_id = result.get('service_id')
                service = self.container.resource_registry.read(service_id)

                if service.state == want_state:
                    return
                else:
                    log.debug("want state %s, got state %s") % (want_state, service.state)

            except Exception:
                log.exception("Problem getting HA status, trying again...")
                gevent.sleep(1)

        raise Exception("Took more than %s to get to pyon ha state %s" % (timeout, want_state))
示例#3
0
class HighAvailabilityAgentSensorPolicyTest(IonIntegrationTestCase):

    def _start_webserver(self, port=None):
        """ Start a webserver for testing code download
        Note: tries really hard to get a port, and if it can't use
        the suggested port, randomly picks another, and returns it
        """
        def log_message(self, format, *args):
            #swallow log massages
            pass

        class TestRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
            server_version = 'test_server'
            extensions_map = ''

            def do_GET(self):
                self.send_response(200)
                self.send_header("Content-type", "text/plain")
                self.send_header("Content-Length", len(self.server.response))
                self.end_headers()
                self.wfile.write(self.server.response)

        class Server(HTTPServer):

            response = ''

            def serve_forever(self):
                self._serving = 1
                while self._serving:
                    self.handle_request()

            def stop(self):
                self._serving = 0

        if port is None:
            port = 8008
        Handler = TestRequestHandler
        Handler.log_message = log_message

        for i in range(0, 100):
            try:
                self._webserver = Server(("localhost", port), Handler)
            except socket.error:
                print "port %s is in use, picking another" % port
                port = randint(8000, 10000)
                continue
            else:
                break

        self._web_glet = gevent.spawn(self._webserver.serve_forever)
        return port

    def _stop_webserver(self):
        if self._webserver is not None:
            self._webserver.stop()
            gevent.sleep(2)
            self._web_glet.kill()

    def await_ha_state(self, want_state, timeout=20):

        for i in range(0, timeout):
            try:
                status = self.haa_client.status().result
                if status == want_state:
                    return
                else:
                    procs = self.get_running_procs()
                    num_procs = len(procs)
                    log.debug("assert wants state %s, got state %s, with %s procs" % (want_state,status, num_procs))
            except Exception:
                log.exception("Problem getting HA status, trying again...")
                gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" % (timeout, want_state))

    @needs_epu
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition = ProcessDefinition(name='test', executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
        })

        self.pd_cli.create_process_definition(self.process_definition,
                self.process_definition_id)

        http_port = 8919
        http_port = self._start_webserver(port=http_port)

        self.resource_id = "haagent_4567"
        self._haa_name = "high_availability_agent"
        self._haa_config = {
            'server': {
                'trafficsentinel': {
                    'host': 'localhost',
                    'port': http_port,
                    'protocol': 'http',
                    'username': '******',
                    'password': '******'
                }
            },
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'sensor',
                    'parameters': {
                        'metric': 'app_attributes:ml',
                        'sample_period': 600,
                        'sample_function': 'Average',
                        'cooldown_period': 5,
                        'scale_up_threshold': 2.0,
                        'scale_up_n_processes': 1,
                        'scale_down_threshold': 1.0,
                        'scale_down_n_processes': 1,
                        'maximum_processes': 5,
                        'minimum_processes': 1,
                    }
                },
                'process_definition_id': self.process_definition_id,
                "process_dispatchers": [
                    'process_dispatcher'
                ]
            },
            'agent': {'resource_id': self.resource_id},
        }

        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent", config=self._haa_config)

        # Start a resource agent client to talk with the instrument agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

    def tearDown(self):
        new_policy = { 'metric': 'app_attributes:ml',
                        'sample_period': 600,
                        'sample_function': 'Average',
                        'cooldown_period': 0,
                        'scale_up_threshold': 2.0,
                        'scale_up_n_processes': 1,
                        'scale_down_threshold': 1.0,
                        'scale_down_n_processes': 1,
                        'maximum_processes': 0,
                        'minimum_processes': 0,
                    }
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 0)

        self.waiter.stop()
        self.container.terminate_process(self._haa_pid)
        self._stop_webserver()
        self._stop_container()

    def get_running_procs(self):
        """returns a normalized set of running procs (removes the ones that
        were there at setup time)
        """

        base = self._base_procs
        base_pids = [proc.process_id for proc in base]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        print "filtering base procs %s from %s" % (base_pids, current_pids)
        normal = [cproc for cproc in current if cproc.process_id not in base_pids and cproc.process_state == ProcessStateEnum.RUNNING]
        return normal

    def _get_managed_upids(self):
        result = self.haa_client.dump().result
        upids = result['managed_upids']
        return upids

    def _set_response(self, response):
        self._webserver.response = response

    def test_sensor_policy(self):
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        assert status in ('PENDING', 'READY', 'STEADY')

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 1)

        self.await_ha_state('STEADY')

        # Set ml for each proc such that we scale up
        upids = self._get_managed_upids()
        response = ""
        for upid in upids:
            response += "pid=%s&ml=5\n" % upid
        self._set_response(response)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 2)

        # Set ml so we stay steady
        upids = self._get_managed_upids()
        response = ""
        for upid in upids:
            response += "pid=%s&ml=1.5\n" % upid
        self._set_response(response)

        self.assertEqual(len(self.get_running_procs()), 2)

        self.await_ha_state('STEADY')

        # Set ml so we scale down
        upids = self._get_managed_upids()
        response = ""
        for upid in upids:
            response += "pid=%s&ml=0.5\n" % upid
        self._set_response(response)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)

        self.assertEqual(len(self.get_running_procs()), 1)

        self.await_ha_state('STEADY')
示例#4
0
class ProcessLeak(Plugin):
    """
    Plugin to detect runaway processes in tests on the server side. It is intended to work against a live process
    dispatcher service during the entire nose run.  Therefore, it works in conjunction with --with-pycc plugin
    or a light CEI or full launch.  Use greenleak plugin instead for leak detection in non-pycc mode.
    """
    name = "processleak"

    def __init__(self):
        Plugin.__init__(self)

    def options(self, parser, env):
        super(ProcessLeak, self).options(parser, env=env)

        parser.add_option("--list-procs", action="store_true", dest="list_procs", \
                help="List all new procs created during the test")
    def configure(self, options, conf):
        super(ProcessLeak, self).configure(options, conf)

        self._list_procs = options.list_procs

    def begin(self):
        from interface.services.cei.iprocess_dispatcher_service import ProcessDispatcherServiceClient
        from pyon.net.messaging import make_node
        from pyon.core import bootstrap
        from pyon.public import CFG

        self.rpc_timeout = 2
        self.base_pids = []
        self._procs_by_test = {}
        if not bootstrap.pyon_initialized:
            bootstrap.bootstrap_pyon()
        self.node, self.ioloop = make_node()
        self.node.setup_interceptors(CFG.interceptor)
        self.pd_cli =  ProcessDispatcherServiceClient(node=self.node)
        # Set base_pids once
        from pyon.core.exception import Timeout
        try:
           self.base_pids = [ proc.process_id for proc in self.pd_cli.list_processes(timeout=20) ]
        except Timeout:
           pass

    def beforeTest(self, test):
        from pyon.core.exception import Timeout
        #Only populate self.base_pids if it' still empty
        if not self.base_pids:
            try:
                self.base_pids = [ proc.process_id for proc in self.pd_cli.list_processes(timeout=self.rpc_timeout) ]
            except Timeout:
                pass

    def afterTest(self, test):
        from pyon.core.exception import Timeout
        from interface.objects import ProcessStateEnum
        try:
            current = self.pd_cli.list_processes(timeout=self.rpc_timeout)
            from interface.objects import ProcessStateEnum
            procs_leaked = [ (proc.process_id, ProcessStateEnum._str_map.get(proc.process_state)) for proc in current \
                if proc.process_id not in self.base_pids and proc.process_state \
                not in [ProcessStateEnum.TERMINATED, ProcessStateEnum.EXITED] ]
            if self._list_procs:
                all_procs = [ (proc.process_id, ProcessStateEnum._str_map.get(proc.process_state)) for proc in current \
                if proc.process_id not in self.base_pids ]
            if len(procs_leaked) > 0:
                [ self.pd_cli.cancel_process(proc[0], timeout=self.rpc_timeout) for proc in procs_leaked ]
                if self._list_procs:
                    self._procs_by_test[test.id()] = all_procs
                else:
                    self._procs_by_test[test.id()] = procs_leaked
        except Timeout:
            pass

    def report(self, stream):
        table = []

        for tid, procs in self._procs_by_test.iteritems():

            # printable tid: don't need the full path
            ptid = ".".join(tid.split(".")[-2:])

            table.append([ptid, ""])
            for proc in procs:
                table.append(["", str(proc)])

            table.append(["", ""])

        # header
        table.insert(0, ['Test', 'Leaked Processes'])

        # get widths
        widths = [max([len(row[x]) for row in table]) for x in xrange(len(table[0]))]
        fmt_out = [" ".join([x.ljust(widths[i]) for i, x in enumerate(row)]) for row in table]

        # insert col separation row
        fmt_out.insert(1, " ".join([''.ljust(widths[i], '=') for i in xrange(len(widths))]))

        # write this all to sstream
        stream.write("Process leak report\n")

        stream.write("\n".join(fmt_out))
        stream.write("\n")

    def finalize(self, result):
        self.node.stop_node()
        self.ioloop.join()
示例#5
0
class ProcessLeak(Plugin):
    """
    Plugin to detect runaway processes in tests on the server side. It is intended to work against a live process
    dispatcher service during the entire nose run.  Therefore, it works in conjunction with --with-pycc plugin
    or a light CEI or full launch.  Use greenleak plugin instead for leak detection in non-pycc mode.
    """
    name = "processleak"

    def __init__(self):
        Plugin.__init__(self)

    def options(self, parser, env):
        super(ProcessLeak, self).options(parser, env=env)

        parser.add_option("--list-procs", action="store_true", dest="list_procs", \
                help="List all new procs created during the test")

    def configure(self, options, conf):
        super(ProcessLeak, self).configure(options, conf)

        self._list_procs = options.list_procs

    def begin(self):
        from interface.services.cei.iprocess_dispatcher_service import ProcessDispatcherServiceClient
        from pyon.net.messaging import make_node
        from pyon.core import bootstrap
        from pyon.public import CFG

        self.base_pids = []
        self.rpc_timeout = 2
        self._procs_by_test = {}
        if not bootstrap.pyon_initialized:
            bootstrap.bootstrap_pyon()
        self.node, self.ioloop = make_node()
        self.node.setup_interceptors(CFG.interceptor)
        self.pd_cli = ProcessDispatcherServiceClient(node=self.node)

    def beforeTest(self, test):
        from pyon.core.exception import Timeout
        try:
            self.base_pids = [
                proc.process_id for proc in self.pd_cli.list_processes(
                    timeout=self.rpc_timeout)
            ]
        except Timeout:
            pass

    def afterTest(self, test):
        from pyon.core.exception import Timeout
        from interface.objects import ProcessStateEnum
        try:
            current = self.pd_cli.list_processes(timeout=self.rpc_timeout)
            from interface.objects import ProcessStateEnum
            procs_leaked = [ (proc.process_id, ProcessStateEnum._str_map.get(proc.process_state)) for proc in current \
                if proc.process_id not in self.base_pids and proc.process_state \
                not in [ProcessStateEnum.TERMINATED, ProcessStateEnum.EXITED] ]
            if self._list_procs:
                all_procs = [ (proc.process_id, ProcessStateEnum._str_map.get(proc.process_state)) for proc in current \
                if proc.process_id not in self.base_pids ]
            if len(procs_leaked) > 0:
                [
                    self.pd_cli.cancel_process(proc[0],
                                               timeout=self.rpc_timeout)
                    for proc in procs_leaked
                ]
                if self._list_procs:
                    self._procs_by_test[test.id()] = all_procs
                else:
                    self._procs_by_test[test.id()] = procs_leaked
        except Timeout:
            pass

    def report(self, stream):
        table = []

        for tid, procs in self._procs_by_test.iteritems():

            # printable tid: don't need the full path
            ptid = ".".join(tid.split(".")[-2:])

            table.append([ptid, ""])
            for proc in procs:
                table.append(["", str(proc)])

            table.append(["", ""])

        # header
        table.insert(0, ['Test', 'Leaked Processes'])

        # get widths
        widths = [
            max([len(row[x]) for row in table]) for x in xrange(len(table[0]))
        ]
        fmt_out = [
            " ".join([x.ljust(widths[i]) for i, x in enumerate(row)])
            for row in table
        ]

        # insert col separation row
        fmt_out.insert(
            1,
            " ".join([''.ljust(widths[i], '=') for i in xrange(len(widths))]))

        # write this all to sstream
        stream.write("Process leak report\n")

        stream.write("\n".join(fmt_out))
        stream.write("\n")

    def finalize(self, result):
        self.node.stop_node()
        self.ioloop.join()
示例#6
0
class HighAvailabilityAgentTest(IonIntegrationTestCase):

    @needs_epu
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        #self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test'
        self.process_definition =  ProcessDefinition(name=self.process_definition_name, executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
        })
        self.pd_cli.create_process_definition(self.process_definition, self.process_definition_id)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "%s.hatests" % bootstrap.get_sys_name()
        self._haa_config = {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'npreserving',
                    'parameters': {
                        'preserve_n': 0
                    }
                },
                'process_definition_id': self.process_definition_id,
                'dashi_messaging' : True,
                'dashi_exchange' : self._haa_dashi_exchange,
                'dashi_name': self._haa_dashi_name
            },
            'agent': {'resource_id': self.resource_id},
        }

        self._base_services, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)

        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent", config=self._haa_config)

        # Start a resource agent client to talk with the instrument agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)


    def tearDown(self):
        self.waiter.stop()
        try:
            self.container.terminate_process(self._haa_pid)
        except BadRequest:
            log.warning("Couldn't terminate HA Agent in teardown (May have been terminated by a test)")
        self._stop_container()

    def get_running_procs(self):
        """returns a normalized set of running procs (removes the ones that
        were there at setup time)
        """

        base = self._base_procs
        base_pids = [proc.process_id for proc in base]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        print "filtering base procs %s from %s" % (base_pids, current_pids)
        normal = [cproc for cproc in current if cproc.process_id not in base_pids and cproc.process_state == ProcessStateEnum.RUNNING]
        return normal

    def get_new_services(self):

        base = self._base_services
        base_names = [i.name for i in base]
        services_registered, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)
        current_names = [i.name for i in services_registered]
        normal = [cserv for cserv in services_registered if cserv.name not in base_names]
        return normal

    def await_ha_state(self, want_state, timeout=10):

        for i in range(0, timeout):
            status = self.haa_client.status().result
            if status == want_state:
                return
            gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" % (timeout, want_state))


    def test_features(self):
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        assert status in ('PENDING', 'READY', 'STEADY')


        # verifies L4-CI-CEI-RQ44
        # Note: the HA agent is started in the setUp() method, with config
        # pointing to the test "service". The initial config is set to preserve
        # 0 service processes. With this reconfigure step below, we change that
        # to launch 1.

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        result = self.haa_client.dump().result
        self.assertEqual(result['policy'], new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 1)

        for i in range(0, 5):
            status = self.haa_client.status().result
            try:
                self.assertEqual(status, 'STEADY')
                break
            except:
                gevent.sleep(1)
        else:
            assert False, "HA Service took too long to get to state STEADY"

        # verifies L4-CI-CEI-RQ122 and L4-CI-CEI-RQ124

        new_policy = {'preserve_n': 2}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 2)

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)

        self.assertEqual(len(self.get_running_procs()), 1)

        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 0)

    def test_associations(self):

        # Ensure that once the HA Agent starts, there is a Service object in
        # the registry
        result = self.haa_client.dump().result
        service_id = result.get('service_id')
        self.assertIsNotNone(service_id)
        service = self.container.resource_registry.read(service_id)
        self.assertIsNotNone(service)

        # Ensure that once a process is started, there is an association between
        # it and the service
        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)
        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)
        self.assertEqual(len(self.get_running_procs()), 1)

        self.await_ha_state('STEADY')

        proc = self.get_running_procs()[0]

        processes_associated, _ = self.container.resource_registry.find_resources(
                restype="Process", name=proc.process_id)
        self.assertEqual(len(processes_associated), 1)

        has_processes = self.container.resource_registry.find_associations(
            service, "hasProcess")
        self.assertEqual(len(has_processes), 1)

        self.await_ha_state('STEADY')

        # Ensure that once we terminate that process, there are no associations
        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 0)

        processes_associated, _ = self.container.resource_registry.find_resources(
                restype="Process", name=proc.process_id)
        self.assertEqual(len(processes_associated), 0)

        has_processes = self.container.resource_registry.find_associations(
            service, "hasProcess")
        self.assertEqual(len(has_processes), 0)

        # Ensure that once we terminate that HA Agent, the Service object is
        # cleaned up
        self.container.terminate_process(self._haa_pid)

        with self.assertRaises(NotFound):
            service = self.container.resource_registry.read(service_id)

    def test_dashi(self):

        import dashi

        dashi_conn = dashi.DashiConnection("something", self._haa_dashi_uri,
            self._haa_dashi_exchange)

        status = dashi_conn.call(self._haa_dashi_name, "status")
        assert status in ('PENDING', 'READY', 'STEADY')

        new_policy = {'preserve_n': 0}
        dashi_conn.call(self._haa_dashi_name, "reconfigure_policy",
            new_policy=new_policy)
示例#7
0
class HighAvailabilityAgentSensorPolicyTest(IonIntegrationTestCase):

    def _start_webserver(self, port=None):
        """ Start a webserver for testing code download
        Note: tries really hard to get a port, and if it can't use
        the suggested port, randomly picks another, and returns it
        """
        def log_message(self, format, *args):
            #swallow log massages
            pass

        class TestRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
            server_version = 'test_server'
            extensions_map = ''
            def do_GET(self):
                self.send_response(200)
                self.send_header("Content-type", "text/plain")
                self.send_header("Content-Length", len(self.server.response))
                self.end_headers()
                self.wfile.write(self.server.response)


        class Server(HTTPServer):

            response = ''

            def serve_forever(self):
                self._serving = 1
                while self._serving:
                    self.handle_request()

            def stop(self):
                self._serving = 0

        if port is None:
            port = 8008
        Handler = TestRequestHandler
        Handler.log_message = log_message

        for i in range(0, 100):
            try:
                self._webserver = Server(("localhost", port), Handler)
            except socket.error:
                print "port %s is in use, picking another" % port
                port = randint(8000, 10000)
                continue
            else:
                break

        self._web_glet = gevent.spawn(self._webserver.serve_forever)
        return port

    def _stop_webserver(self):
        if self._webserver is not None:
            self._webserver.stop()
            gevent.sleep(2)
            self._web_glet.kill()

    @needs_epu
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition =  ProcessDefinition(name='test', executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
        })

        self.pd_cli.create_process_definition(self.process_definition,
                self.process_definition_id)


        http_port = 8919
        http_port = self._start_webserver(port=http_port)

        self.resource_id = "haagent_4567"
        self._haa_name = "high_availability_agent"
        self._haa_config = {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'sensor',
                    'parameters': {
                        'metric': 'app_attributes:ml',
                        'sample_period': 600,
                        'sample_function': 'Average',
                        'cooldown_period': 20,
                        'scale_up_threshold': 2.0,
                        'scale_up_n_processes': 1,
                        'scale_down_threshold': 1.0,
                        'scale_down_n_processes': 1,
                        'maximum_processes': 5,
                        'minimum_processes': 1,
                    }
                },
                'aggregator': {
                    'type': 'trafficsentinel',
                    'host': 'localhost',
                    'port': http_port,
                    'protocol': 'http',
                    'username': '******',
                    'password': '******'
                },
                'process_definition_id': self.process_definition_id,
                "process_dispatchers": [
                    'process_dispatcher'
                ]
            },
            'agent': {'resource_id': self.resource_id},
        }


        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent", config=self._haa_config)

        # Start a resource agent client to talk with the instrument agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)


    def tearDown(self):
        self.waiter.stop()
        self.container.terminate_process(self._haa_pid)
        self._stop_webserver()
        self._stop_container()

    def get_running_procs(self):
        """returns a normalized set of running procs (removes the ones that
        were there at setup time)
        """

        base = self._base_procs
        base_pids = [proc.process_id for proc in base]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        print "filtering base procs %s from %s" % (base_pids, current_pids)
        normal = [cproc for cproc in current if cproc.process_id not in base_pids and cproc.process_state == ProcessStateEnum.RUNNING]
        return normal

    def _get_managed_upids(self):
        result = self.haa_client.dump().result
        upids = result['managed_upids']
        return upids

    def _set_response(self, response):
        self._webserver.response = response

    def test_sensor_policy(self):
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        assert status in ('PENDING', 'READY', 'STEADY')

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 1)

        for i in range(0, 5):
            status = self.haa_client.status().result
            try:
                self.assertEqual(status, 'STEADY')
                break
            except:
                gevent.sleep(1)
        else:
            assert False, "HA Service took too long to get to state STEADY"

        # Set ml for each proc such that we scale up
        upids = self._get_managed_upids()
        response = ""
        for upid in upids:
            response += "%s,ml=5\n"
        self._set_response(response)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 2)

        # Set ml so we stay steady
        upids = self._get_managed_upids()
        response = ""
        for upid in upids:
            response += "%s,ml=1.5\n"
        self._set_response(response)

        self.assertEqual(len(self.get_running_procs()), 2)

        for i in range(0, 5):
            status = self.haa_client.status().result
            try:
                self.assertEqual(status, 'STEADY')
                break
            except:
                gevent.sleep(1)
        else:
            assert False, "HA Service took too long to get to state STEADY"

        # Set ml so we scale down
        upids = self._get_managed_upids()
        response = ""
        for upid in upids:
            response += "%s,ml=0.5\n"
        self._set_response(response)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)

        self.assertEqual(len(self.get_running_procs()), 1)

        for i in range(0, 5):
            status = self.haa_client.status().result
            try:
                self.assertEqual(status, 'STEADY')
                break
            except:
                gevent.sleep(1)
        else:
            assert False, "HA Service took too long to get to state STEADY"
示例#8
0
class HighAvailabilityAgentTest(IonIntegrationTestCase):

    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        #self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_config = {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'npreserving',
                    'parameters': {
                        'preserve_n': 0
                    }
                },
                'process_spec': {
                    'name': 'test',
                    'module': 'ion.agents.cei.test.test_haagent',
                    'class': 'TestProcess'
                },
                "process_dispatchers": [
                    'process_dispatcher'
                ]
            },
            'agent': {'resource_id': self.resource_id},
        }

        self._base_procs = self.pd_cli.list_processes()

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent", config=self._haa_config)

        # Start a resource agent client to talk with the instrument agent.
        self._haa_pyon_client = ResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

        self.event_queue = queue.Queue()
        self.event_sub = None

    def tearDown(self):
        self.container.terminate_process(self._haa_pid)
        self._stop_container()

    def _event_callback(self, event, *args, **kwargs):
        self.event_queue.put(event)

    def subscribe_events(self, origin):
        self.event_sub = EventSubscriber(event_type="ProcessLifecycleEvent",
            callback=self._event_callback, origin_type="DispatchedProcess")
        self.event_sub.start()

    def await_state_event(self, pid, state):
        event = self.event_queue.get(timeout=10)
        log.debug("Got event: %s", event)
        self.assertTrue(event.origin.startswith(pid))
        self.assertEqual(event.state, state)
        return event

    def get_procs(self):
        """returns a normalized set of procs (removes the ones that were there
        at setup time)
        """

        base = self._base_procs
        current = self.pd_cli.list_processes()
        normal = [proc for proc in current if proc not in base]
        return normal

    @needs_epu
    def test_features(self):
        status = self.haa_client.status().result
        self.assertEqual(status, 'PENDING')

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        result = self.haa_client.dump().result
        self.assertEqual(result['policy'], new_policy)

        self.subscribe_events(None)
        self.await_state_event("test", ProcessStateEnum.SPAWN)

        self.assertEqual(len(self.get_procs()), 1)

        for i in range(0, 5):
            status = self.haa_client.status().result
            try:
                self.assertEqual(status, 'STEADY')
                break
            except:
                gevent.sleep(1)
        else:
            assert False, "HA Service took too long to get to state STEADY"

        new_policy = {'preserve_n': 2}
        self.haa_client.reconfigure_policy(new_policy)

        self.await_state_event("test", ProcessStateEnum.SPAWN)

        self.assertEqual(len(self.get_procs()), 2)

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        self.await_state_event("test", ProcessStateEnum.TERMINATE)
        self.assertEqual(len(self.get_procs()), 1)

        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.await_state_event("test", ProcessStateEnum.TERMINATE)
        self.assertEqual(len(self.get_procs()), 0)
示例#9
0
class HighAvailabilityAgentTest(IonIntegrationTestCase):

    @needs_epu
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        #self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test'
        self.process_definition =  ProcessDefinition(name=self.process_definition_name, executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
        })
        self.pd_cli.create_process_definition(self.process_definition, self.process_definition_id)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "%s.hatests" % bootstrap.get_sys_name()
        self._haa_config = {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'npreserving',
                    'parameters': {
                        'preserve_n': 0
                    }
                },
                'process_definition_id': self.process_definition_id,
                'dashi_messaging' : True,
                'dashi_exchange' : self._haa_dashi_exchange,
                'dashi_name': self._haa_dashi_name
            },
            'agent': {'resource_id': self.resource_id},
        }

        self._base_services, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)

        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent", config=self._haa_config)

        # Start a resource agent client to talk with the instrument agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)


    def tearDown(self):
        self.waiter.stop()
        try:
            self.container.terminate_process(self._haa_pid)
        except BadRequest:
            log.warning("Couldn't terminate HA Agent in teardown (May have been terminated by a test)")
        self._stop_container()

    def get_running_procs(self):
        """returns a normalized set of running procs (removes the ones that
        were there at setup time)
        """

        base = self._base_procs
        base_pids = [proc.process_id for proc in base]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        print "filtering base procs %s from %s" % (base_pids, current_pids)
        normal = [cproc for cproc in current if cproc.process_id not in base_pids and cproc.process_state == ProcessStateEnum.RUNNING]
        return normal

    def get_new_services(self):

        base = self._base_services
        base_names = [i.name for i in base]
        services_registered, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)
        current_names = [i.name for i in services_registered]
        normal = [cserv for cserv in services_registered if cserv.name not in base_names]
        return normal

    def await_ha_state(self, want_state, timeout=10):

        for i in range(0, timeout):
            status = self.haa_client.status().result
            if status == want_state:
                return
            gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" % (timeout, want_state))


    def test_features(self):
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        assert status in ('PENDING', 'READY', 'STEADY')


        # verifies L4-CI-CEI-RQ44
        # Note: the HA agent is started in the setUp() method, with config
        # pointing to the test "service". The initial config is set to preserve
        # 0 service processes. With this reconfigure step below, we change that
        # to launch 1.

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        result = self.haa_client.dump().result
        self.assertEqual(result['policy'], new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 1)

        for i in range(0, 5):
            status = self.haa_client.status().result
            try:
                self.assertEqual(status, 'STEADY')
                break
            except:
                gevent.sleep(1)
        else:
            assert False, "HA Service took too long to get to state STEADY"

        # Ensure Service object has the correct state
        result = self.haa_client.dump().result
        service_id = result.get('service_id')
        service = self.container.resource_registry.read(service_id)
        self.assertEqual(service.state, ServiceStateEnum.STEADY)

        # verifies L4-CI-CEI-RQ122 and L4-CI-CEI-RQ124

        new_policy = {'preserve_n': 2}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 2)

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)

        self.assertEqual(len(self.get_running_procs()), 1)

        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 0)

    def test_associations(self):

        # Ensure that once the HA Agent starts, there is a Service object in
        # the registry
        result = self.haa_client.dump().result
        service_id = result.get('service_id')
        self.assertIsNotNone(service_id)
        service = self.container.resource_registry.read(service_id)
        self.assertIsNotNone(service)

        # Ensure that once a process is started, there is an association between
        # it and the service
        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)
        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)
        self.assertEqual(len(self.get_running_procs()), 1)

        self.await_ha_state('STEADY')

        proc = self.get_running_procs()[0]

        processes_associated, _ = self.container.resource_registry.find_resources(
                restype="Process", name=proc.process_id)
        self.assertEqual(len(processes_associated), 1)

        has_processes = self.container.resource_registry.find_associations(
            service, "hasProcess")
        self.assertEqual(len(has_processes), 1)

        self.await_ha_state('STEADY')

        # Ensure that once we terminate that process, there are no associations
        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 0)

        processes_associated, _ = self.container.resource_registry.find_resources(
                restype="Process", name=proc.process_id)
        self.assertEqual(len(processes_associated), 0)

        has_processes = self.container.resource_registry.find_associations(
            service, "hasProcess")
        self.assertEqual(len(has_processes), 0)

        # Ensure that once we terminate that HA Agent, the Service object is
        # cleaned up
        self.container.terminate_process(self._haa_pid)

        with self.assertRaises(NotFound):
            service = self.container.resource_registry.read(service_id)

    def test_dashi(self):

        import dashi

        dashi_conn = dashi.DashiConnection("something", self._haa_dashi_uri,
            self._haa_dashi_exchange)

        status = dashi_conn.call(self._haa_dashi_name, "status")
        assert status in ('PENDING', 'READY', 'STEADY')

        new_policy = {'preserve_n': 0}
        dashi_conn.call(self._haa_dashi_name, "reconfigure_policy",
            new_policy=new_policy)
示例#10
0
class HighAvailabilityAgentSensorPolicyTest(IonIntegrationTestCase):
    def _start_webserver(self, port=None):
        """ Start a webserver for testing code download
        Note: tries really hard to get a port, and if it can't use
        the suggested port, randomly picks another, and returns it
        """

        def log_message(self, format, *args):
            # swallow log massages
            pass

        class TestRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
            server_version = "test_server"
            extensions_map = ""

            def do_GET(self):
                self.send_response(200)
                self.send_header("Content-type", "text/plain")
                self.send_header("Content-Length", len(self.server.response))
                self.end_headers()
                self.wfile.write(self.server.response)

        class Server(HTTPServer):

            response = ""

            def serve_forever(self):
                self._serving = 1
                while self._serving:
                    self.handle_request()

            def stop(self):
                self._serving = 0

        if port is None:
            port = 8008
        Handler = TestRequestHandler
        Handler.log_message = log_message

        for i in range(0, 100):
            try:
                self._webserver = Server(("localhost", port), Handler)
            except socket.error:
                print "port %s is in use, picking another" % port
                port = randint(8000, 10000)
                continue
            else:
                break

        self._web_glet = gevent.spawn(self._webserver.serve_forever)
        return port

    def _stop_webserver(self):
        if self._webserver is not None:
            self._webserver.stop()
            gevent.sleep(2)
            self._web_glet.kill()

    @needs_epu
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url("res/deploy/r2cei.yml")
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition = ProcessDefinition(
            name="test", executable={"module": "ion.agents.cei.test.test_haagent", "class": "TestProcess"}
        )

        self.pd_cli.create_process_definition(self.process_definition, self.process_definition_id)

        http_port = 8919
        http_port = self._start_webserver(port=http_port)

        self.resource_id = "haagent_4567"
        self._haa_name = "high_availability_agent"
        self._haa_config = {
            "highavailability": {
                "policy": {
                    "interval": 1,
                    "name": "sensor",
                    "parameters": {
                        "metric": "app_attributes:ml",
                        "sample_period": 600,
                        "sample_function": "Average",
                        "cooldown_period": 20,
                        "scale_up_threshold": 2.0,
                        "scale_up_n_processes": 1,
                        "scale_down_threshold": 1.0,
                        "scale_down_n_processes": 1,
                        "maximum_processes": 5,
                        "minimum_processes": 1,
                    },
                },
                "aggregator": {
                    "type": "trafficsentinel",
                    "host": "localhost",
                    "port": http_port,
                    "protocol": "http",
                    "username": "******",
                    "password": "******",
                },
                "process_definition_id": self.process_definition_id,
                "process_dispatchers": ["process_dispatcher"],
            },
            "agent": {"resource_id": self.resource_id},
        }

        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node, name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=self._haa_config,
        )

        # Start a resource agent client to talk with the instrument agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info("Got haa client %s.", str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

    def tearDown(self):
        self.waiter.stop()
        self.container.terminate_process(self._haa_pid)
        self._stop_webserver()
        self._stop_container()

    def get_running_procs(self):
        """returns a normalized set of running procs (removes the ones that
        were there at setup time)
        """

        base = self._base_procs
        base_pids = [proc.process_id for proc in base]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        print "filtering base procs %s from %s" % (base_pids, current_pids)
        normal = [
            cproc
            for cproc in current
            if cproc.process_id not in base_pids and cproc.process_state == ProcessStateEnum.RUNNING
        ]
        return normal

    def _get_managed_upids(self):
        result = self.haa_client.dump().result
        upids = result["managed_upids"]
        return upids

    def _set_response(self, response):
        self._webserver.response = response

    def test_sensor_policy(self):
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        assert status in ("PENDING", "READY", "STEADY")

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 1)

        for i in range(0, 5):
            status = self.haa_client.status().result
            try:
                self.assertEqual(status, "STEADY")
                break
            except:
                gevent.sleep(1)
        else:
            assert False, "HA Service took too long to get to state STEADY"

        # Set ml for each proc such that we scale up
        upids = self._get_managed_upids()
        response = ""
        for upid in upids:
            response += "%s,ml=5\n"
        self._set_response(response)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 2)

        # Set ml so we stay steady
        upids = self._get_managed_upids()
        response = ""
        for upid in upids:
            response += "%s,ml=1.5\n"
        self._set_response(response)

        self.assertEqual(len(self.get_running_procs()), 2)

        for i in range(0, 5):
            status = self.haa_client.status().result
            try:
                self.assertEqual(status, "STEADY")
                break
            except:
                gevent.sleep(1)
        else:
            assert False, "HA Service took too long to get to state STEADY"

        # Set ml so we scale down
        upids = self._get_managed_upids()
        response = ""
        for upid in upids:
            response += "%s,ml=0.5\n"
        self._set_response(response)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)

        self.assertEqual(len(self.get_running_procs()), 1)

        for i in range(0, 5):
            status = self.haa_client.status().result
            try:
                self.assertEqual(status, "STEADY")
                break
            except:
                gevent.sleep(1)
        else:
            assert False, "HA Service took too long to get to state STEADY"
示例#11
0
class BaseHighAvailabilityAgentTest(IonIntegrationTestCase):
    @needs_epu
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        #self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)
        self.pd_cli = ProcessDispatcherServiceClient(
            to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test_haagent_%s' % self.process_definition_id
        self.process_definition = ProcessDefinition(
            name=self.process_definition_name,
            executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
            })
        self.pd_cli.create_process_definition(self.process_definition,
                                              self.process_definition_id)

        service_definition = SERVICE_DEFINITION_TMPL % self.process_definition_name
        sd = IonObject(RT.ServiceDefinition, {
            "name": self.process_definition_name,
            "definition": service_definition
        })
        self.service_def_id, _ = self.container.resource_registry.create(sd)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "hatests"
        self._haa_config = self._get_haagent_config()

        self._base_services, _ = self.container.resource_registry.find_resources(
            restype="Service", name=self.process_definition_name)

        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
                                                     name=self.container.name)
        self._spawn_haagent()

        self._setup_haa_client()

    def _get_haagent_config(self):
        return {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'npreserving',
                    'parameters': {
                        'preserve_n': 0
                    }
                },
                'process_definition_id': self.process_definition_id,
                'dashi_messaging': True,
                'dashi_exchange': self._haa_dashi_exchange,
                'dashi_name': self._haa_dashi_name
            },
            'agent': {
                'resource_id': self.resource_id
            },
        }

    def _setup_haa_client(self):
        # Start a resource agent client to talk with the instrument agent.
        self._haa_pyon_client = SimpleResourceAgentClient(
            self.resource_id, process=FakeProcess())

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

    def _spawn_haagent(self, policy_parameters=None):

        config = deepcopy(self._haa_config)
        if policy_parameters is not None:
            config['highavailability']['policy'][
                'parameters'] = policy_parameters
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=config)

    def _kill_haagent(self):
        self.container.terminate_process(self._haa_pid)

    def tearDown(self):

        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.assertEqual(len(self.get_running_procs()), 0)
        self.await_ha_state('STEADY')

        self.waiter.stop()
        try:
            self._kill_haagent()
        except BadRequest:
            log.warning(
                "Couldn't terminate HA Agent in teardown (May have been terminated by a test)"
            )
        self.container.resource_registry.delete(self.service_def_id,
                                                del_associations=True)
        self._stop_container()

    def get_running_procs(self):
        """returns a normalized set of running procs (removes the ones that
        were there at setup time)
        """

        base = self._base_procs
        base_pids = [proc.process_id for proc in base]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        print "filtering base procs %s from %s" % (base_pids, current_pids)
        normal = [
            cproc for cproc in current if cproc.process_id not in base_pids
            and cproc.process_state == ProcessStateEnum.RUNNING
        ]
        return normal

    def get_new_services(self):

        base = self._base_services
        base_names = [i.name for i in base]
        services_registered, _ = self.container.resource_registry.find_resources(
            restype="Service", name=self.process_definition_name)
        normal = [
            cserv for cserv in services_registered
            if cserv.name not in base_names
        ]
        return normal

    def await_ha_state(self, want_state, timeout=20):

        for i in range(0, timeout):
            try:
                status = self.haa_client.status().result
                if status == want_state:
                    return
                else:
                    procs = self.get_running_procs()
                    num_procs = len(procs)
                    log.debug(
                        "assert wants state %s, got state %s, with %s procs" %
                        (want_state, status, num_procs))
            except Exception:
                log.exception("Problem getting HA status, trying again...")
                gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" %
                        (timeout, want_state))

    def await_pyon_ha_state(self, want_state, timeout=20):
        for i in range(0, timeout):
            try:
                result = self.haa_client.dump().result
                service_id = result.get('service_id')
                service = self.container.resource_registry.read(service_id)

                if service.state == want_state:
                    return
                else:
                    log.debug("want state %s, got state %s") % (want_state,
                                                                service.state)

            except Exception:
                log.exception("Problem getting HA status, trying again...")
                gevent.sleep(1)

        raise Exception("Took more than %s to get to pyon ha state %s" %
                        (timeout, want_state))