Exemplo n.º 1
0
    def start(self):
        """Create the command subscriber (not yet consuming) and the result publisher."""
        cfg_queue = get_safe(self._pd_core.pd_cfg, "command_queue")
        queue_name = cfg_queue if cfg_queue else "pd_command"
        self.sub_cont = Subscriber(binding=queue_name,
                                   from_name=queue_name,
                                   callback=self._receive_command)
        # Listen without activating; presumably consumption is switched on
        # elsewhere (e.g. on leadership acquisition) -- TODO confirm
        self.sub_cont_gl = spawn(self.sub_cont.listen, activate=False)
        self.sub_cont.get_ready_event().wait()

        self.pub_result = Publisher()
Exemplo n.º 2
0
class ProcessDispatcherAggregator(object):
    """ PD aggregator for heartbeat input from containers etc. """

    def __init__(self, pd_core):
        # References to the PD core and its shared container/registry objects
        self._pd_core = pd_core
        self.container = self._pd_core.container
        self.registry = self._pd_core.registry

    def start(self):
        """Start heartbeat and lifecycle-event subscribers; blocks until both are ready."""
        # Create our own queue for container heartbeats and broadcasts
        topic = get_safe(self._pd_core.pd_cfg, "aggregator.container_topic") or "bx_containers"
        queue_name = "pd_aggregator_%s_%s" % (topic, create_valid_identifier(self.container.id, dot_sub="_"))
        # auto_delete so this per-container queue goes away when we disconnect
        self.sub_cont = Subscriber(binding=topic, from_name=queue_name, auto_delete=True,
                                   callback=self._receive_container_info)
        self.sub_cont_gl = spawn(self.sub_cont.listen)
        self.sub_cont.get_ready_event().wait()

        # One event subscriber covering both container and process lifecycle events
        self.evt_sub = EventSubscriber(event_type=OT.ContainerLifecycleEvent, callback=self._receive_event)
        self.evt_sub.add_event_subscription(event_type=OT.ProcessLifecycleEvent)
        self.evt_sub_gl = spawn(self.evt_sub.listen)
        self.evt_sub.get_ready_event().wait()

        log.info("PD Aggregator - event and heartbeat subscribers started")

    def stop(self):
        """Close both subscribers and reap their listen greenlets."""
        # Stop subscribers
        self.sub_cont.close()
        self.evt_sub.close()

        # Wait for subscribers to finish
        self.sub_cont_gl.join(timeout=2)
        self.sub_cont_gl.kill()  # force-stop in case join timed out
        self.sub_cont_gl = None

        self.evt_sub_gl.join(timeout=2)
        self.evt_sub_gl.kill()
        self.evt_sub_gl = None

    # -------------------------------------------------------------------------

    def _receive_container_info(self, msg, headers, *args):
        """Record a container heartbeat as a running container in the registry."""
        log.debug("Got container info %s %s %s", msg, headers, args)
        if not isinstance(msg, ContainerHeartbeat):
            # Ignore anything that is not a heartbeat message
            log.warn("Unknown container info format")
            return

        self.registry.register_container(msg.container_id, msg.ts, EE_STATE_RUNNING, msg.attributes)

    def _receive_event(self, event, *args, **kwargs):
        """Update registry state from container/process lifecycle events."""
        log.debug("Got event %s %s %s", event, args, kwargs)
        if isinstance(event, ContainerLifecycleEvent):
            if event.sub_type == "START":
                self.registry.register_container(event.origin, event.ts_created, EE_STATE_RUNNING, {})
            elif event.sub_type == "TERMINATE":
                self.registry.register_container(event.origin, event.ts_created, EE_STATE_TERMINATED, {})

        elif isinstance(event, ProcessLifecycleEvent):
            proc_info = dict(state=event.state, proc_type=event.process_type, proc_name=event.process_name,
                             resource_id=event.process_resource_id, service_name=event.service_name)
            self.registry.register_process(event.container_id, event.origin, proc_info)
Exemplo n.º 3
0
    def start(self):
        """Bring up the heartbeat queue subscriber and the lifecycle event subscriber."""
        # Create our own queue for container heartbeats and broadcasts
        hb_topic = get_safe(self._pd_core.pd_cfg, "aggregator.container_topic") or "bx_containers"
        cont_ident = create_valid_identifier(self.container.id, dot_sub="_")
        hb_queue = "pd_aggregator_%s_%s" % (hb_topic, cont_ident)
        self.sub_cont = Subscriber(binding=hb_topic, from_name=hb_queue, auto_delete=True,
                                   callback=self._receive_container_info)
        self.sub_cont_gl = spawn(self.sub_cont.listen)
        self.sub_cont.get_ready_event().wait()

        # Single event subscriber for both container and process lifecycle events
        self.evt_sub = EventSubscriber(event_type=OT.ContainerLifecycleEvent, callback=self._receive_event)
        self.evt_sub.add_event_subscription(event_type=OT.ProcessLifecycleEvent)
        self.evt_sub_gl = spawn(self.evt_sub.listen)
        self.evt_sub.get_ready_event().wait()

        log.info("PD Aggregator - event and heartbeat subscribers started")
Exemplo n.º 4
0
    def start(self):
        """Start the container heartbeat and lifecycle-event subscribers (blocking until ready)."""
        # Create our own queue for container heartbeats and broadcasts
        topic = get_safe(self._pd_core.pd_cfg, "aggregator.container_topic")
        if not topic:
            topic = "bx_containers"
        ident = create_valid_identifier(self.container.id, dot_sub="_")
        queue_name = "pd_aggregator_%s_%s" % (topic, ident)
        self.sub_cont = Subscriber(binding=topic,
                                   from_name=queue_name,
                                   auto_delete=True,
                                   callback=self._receive_container_info)
        self.sub_cont_gl = spawn(self.sub_cont.listen)
        self.sub_cont.get_ready_event().wait()

        # Lifecycle events: container events plus an additional process event subscription
        self.evt_sub = EventSubscriber(event_type=OT.ContainerLifecycleEvent,
                                       callback=self._receive_event)
        self.evt_sub.add_event_subscription(event_type=OT.ProcessLifecycleEvent)
        self.evt_sub_gl = spawn(self.evt_sub.listen)
        self.evt_sub.get_ready_event().wait()

        log.info("PD Aggregator - event and heartbeat subscribers started")
Exemplo n.º 5
0
class ProcessDispatcherDecisionEngine(object):
    """ Base class for PD decision engines.

    Consumes PD commands from a shared command queue (only while this PD
    instance is leader) and dispatches processes onto engine containers
    according to configured dispatch rules.
    """

    def __init__(self, pd_core):
        self._pd_core = pd_core
        self.container = self._pd_core.container
        self.registry = self._pd_core.registry
        self.executor = self._pd_core.executor

        # Command subscriber; created in start() but only activated once this
        # instance acquires leadership (see _leader_callback)
        self.sub_cont = None
        self.sub_cont_gl = None
        self.sub_active = False

        self._pd_core.leader_manager.add_leader_callback(self._leader_callback)

        self._load_rules()

    def start(self):
        """Create the (inactive) command subscriber and the result publisher."""
        queue_name = get_safe(self._pd_core.pd_cfg, "command_queue") or "pd_command"
        self.sub_cont = Subscriber(binding=queue_name, from_name=queue_name, callback=self._receive_command)
        # Listen without activating - commands are only consumed while leader
        self.sub_cont_gl = spawn(self.sub_cont.listen, activate=False)
        self.sub_cont.get_ready_event().wait()

        self.pub_result = Publisher()

    def stop(self):
        """Close the command subscriber and reap its listen greenlet."""
        if self.sub_cont:
            self.sub_cont.close()
            self.sub_cont_gl.join(timeout=2)
            self.sub_cont_gl.kill()  # force-stop in case join timed out
            self.sub_cont_gl = None
            self.sub_cont = None

    def _leader_callback(self, leader_info):
        """React to leadership changes by (de)activating command consumption."""
        if leader_info["action"] == "acquire_leader":
            def start_sub():
                if not self.registry.preconditions_true.is_set():
                    log.info("PD is leader - awaiting PD preconditions")
                    # Await preconditions; continue anyway after timeout
                    await_timeout = get_safe(self._pd_core.pd_cfg, "engine.await_preconditions.await_timeout")
                    precond_true = self.registry.preconditions_true.wait(timeout=await_timeout)
                    if not precond_true:
                        log.warn("PD preconditions not satisfied after timeout - continuing")

                if self._pd_core.is_leader() and self.sub_cont is not None and not self.sub_active:
                    # Are we still leader? Not activated?
                    num_msg, num_cons = self.sub_cont.get_stats()
                    log.info("PD is leader - starting to consume (%s pending commands, %s consumers)", num_msg, num_cons)
                    self.sub_cont.activate()
                    self.sub_active = True
            # Fire-and-forget greenlet; handle is intentionally not retained
            spawn(start_sub)
        elif leader_info["action"] == "release_leader":
            if self.sub_cont is not None and self.sub_active:
                self.sub_cont.deactivate()
                self.sub_active = False

    def _receive_command(self, command, headers, *args):
        """Dispatch an incoming command dict to its _cmd_* handler and reply with the result."""
        log.info("PD execute command %s", command["command"])

        cmd_funcname = "_cmd_{}".format(command["command"])
        cmd_func = getattr(self, cmd_funcname, None)
        cmd_res, cmd_complete = None, False
        if not cmd_func:
            log.warn("Command function not found")
            self._send_command_reply(command, dict(message="ERROR"), status=400)
            return
        try:
            cmd_res = cmd_func(command)
            cmd_complete = True
        except Exception as ex:
            log.exception("Error executing command")
            self._send_command_reply(command, dict(message=str(ex)), status=400)

        if cmd_complete:
            self._send_command_reply(command, cmd_res)

    def _send_command_reply(self, command, result=None, status=200):
        """Publish an AsyncResultMsg to the command's reply_to queue, if one was given."""
        reply_to = command.get("reply_to", None)
        if not reply_to:
            # Caller did not ask for a reply
            return
        res_msg = AsyncResultMsg(request_id=command["command_id"], result=result, status=status)
        self.pub_result.publish(to_name=reply_to, msg=res_msg)

    # -------------------------------------------------------------------------

    def _cmd_start_rel(self, command):
        """Start all process apps of a rel definition, honoring replica counts and limits."""
        log.debug("Start rel")
        rel = command["rel_def"]

        # 0 means "no limit" on replicas per process
        max_proc_replicas = int(CFG.get_safe("container.process.max_replicas", 0))

        for rel_app_cfg in rel["apps"]:
            app_name = rel_app_cfg["name"]
            log.debug("app definition in rel: %s", str(rel_app_cfg))

            # Decide where process should go
            target_engine = self._determine_target_engine(rel_app_cfg)
            log.debug("Dispatch app %s to engine %s", app_name, target_engine)

            if 'processapp' in rel_app_cfg:
                name, module, cls = rel_app_cfg["processapp"]

                rel_cfg = None
                if 'config' in rel_app_cfg:
                    # Copy so per-replica spawns cannot mutate the rel definition
                    rel_cfg = deepcopy(rel_app_cfg["config"])

                if 'replicas' in rel_app_cfg:
                    proc_replicas = int(rel_app_cfg["replicas"])
                    if max_proc_replicas > 0:
                        if proc_replicas > max_proc_replicas:
                            log.info("Limiting number of proc replicas to %s from %s", max_proc_replicas, proc_replicas)
                        proc_replicas = min(proc_replicas, max_proc_replicas)
                    if proc_replicas < 1 or proc_replicas > 100:
                        log.warn("Invalid number of process replicas: %s", proc_replicas)
                        proc_replicas = 1
                    # NOTE: was xrange (Python 2 only); range works on both 2 and 3
                    for i in range(proc_replicas):
                        cont_info = self._determine_target_container(rel_app_cfg, target_engine)
                        container_name = self._get_cc_agent_name(cont_info)
                        # First replica keeps the plain name; others get a ".<i>" suffix
                        proc_name = "%s.%s" % (name, i) if i else name
                        action_res = self._add_spawn_process_action(cc_agent=container_name, proc_name=proc_name,
                                                                    module=module, cls=cls, config=rel_cfg)
                        proc_id = action_res.wait()
                        self._slot_process(cont_info, proc_id, dict(proc_name=proc_name, state=ProcessStateEnum.RUNNING))
                else:
                    cont_info = self._determine_target_container(rel_app_cfg, target_engine)
                    container_name = self._get_cc_agent_name(cont_info)
                    action_res = self._add_spawn_process_action(cc_agent=container_name, proc_name=name,
                                                                module=module, cls=cls, config=rel_cfg)
                    proc_id = action_res.wait()
                    self._slot_process(cont_info, proc_id, dict(proc_name=name, state=ProcessStateEnum.RUNNING))

            else:
                log.warn("App file not supported")

    def _add_spawn_process_action(self, cc_agent, proc_name, module, cls, config):
        """Queue a spawn_process action on the executor; returns an AsyncResult for the proc id."""
        action_res = AsyncResult()
        action_kwargs = dict(cc_agent=cc_agent, proc_name=proc_name, module=module, cls=cls, config=config)
        action = ("spawn_process", action_res, action_kwargs)
        self.executor.add_action(action)
        return action_res

    # -------------------------------------------------------------------------

    def _load_rules(self):
        """Load dispatch rules and the default engine name from PD config."""
        self.rules_cfg = get_safe(self._pd_core.pd_cfg, "engine.dispatch_rules") or []
        self.default_engine = get_safe(self._pd_core.pd_cfg, "engine.default_engine") or "default"

    def _determine_target_engine(self, app_cfg):
        """Return the engine name for an app based on rules; last matching rule wins."""
        # Determine nominal engine
        target_engine = self.default_engine
        app_name = app_cfg["name"]
        for rule in self.rules_cfg:
            if "appname_pattern" in rule:
                if re.match(rule["appname_pattern"], app_name):
                    target_engine = rule["engine"]

        # Fall back to default engine if the target has no running containers
        ee_containers = self.registry.get_engine_containers()
        if not ee_containers.get(target_engine, None):
            target_engine = self.default_engine

        return target_engine

    def _determine_target_container(self, app_cfg, target_engine):
        """Pick a container on the target engine per the configured spread strategy.

        Raises BadRequest if no container with an open slot can be found.
        """
        app_name = app_cfg["name"]
        if target_engine is None:
            target_engine = self._determine_target_engine(app_cfg)

        # Determine available containers
        ee_containers = self.registry.get_engine_containers()
        ee_conts = ee_containers.get(target_engine, None)
        if not ee_conts:
            raise BadRequest("No running containers for app {}".format(app_name))
        ee_conts_sorted = sorted(ee_conts, key=lambda c: c["ts_created"])

        # Load balance across containers
        dispatch_spread = get_safe(self._pd_core.pd_cfg, "engine.dispatch_spread") or "round_robin"
        if dispatch_spread == "round_robin":
            # Pick the container with the fewest allocated processes that still
            # has capacity (containers without a capacity.max are skipped)
            cont_info, current_min_alloc = None, 9999999
            for cont in ee_conts_sorted:
                if len(cont["allocation"]) < current_min_alloc:
                    max_capacity = int(get_safe(cont["ee_info"], "capacity.max", 0))
                    if max_capacity and len(cont["allocation"]) < max_capacity:
                        cont_info = cont
                        current_min_alloc = len(cont["allocation"])
            if not cont_info:
                raise BadRequest("Could not find open slot")
        elif dispatch_spread == "fill_up":
            # First (oldest) container that still has capacity
            cont_info = None
            for cont in ee_conts_sorted:
                if len(cont["allocation"]) < int(get_safe(cont["ee_info"], "capacity.max", 0)):
                    cont_info = cont
                    break
            if not cont_info:
                raise BadRequest("Could not find open slot")
        elif dispatch_spread == "random":
            # NOTE: random spread does not check capacity
            cont_num = random.randint(0, len(ee_conts_sorted) - 1)
            cont_info = ee_conts_sorted[cont_num]
        else:
            raise BadRequest("Unknown dispatch_spread {}".format(dispatch_spread))

        return cont_info

    def _get_cc_agent_name(self, cont_info):
        """Return the CC agent name for a container info record."""
        container_name = cont_info["cc_obj"].cc_agent
        return container_name

    def _slot_process(self, cont_info, proc_id, proc_info):
        """Register a newly spawned process against its container (no update of existing)."""
        self.registry.register_process(cont_info["container_id"], proc_id, proc_info, update=False)
Exemplo n.º 6
0
class ProcessDispatcherAggregator(object):
    """ PD aggregator for heartbeat input from containers etc. """

    def __init__(self, pd_core):
        self._pd_core = pd_core
        self.container = self._pd_core.container
        self.registry = self._pd_core.registry

    def start(self):
        """Start the heartbeat and lifecycle-event subscribers; blocks until both are ready."""
        # Per-container heartbeat/broadcast queue of our own
        hb_topic = get_safe(self._pd_core.pd_cfg, "aggregator.container_topic") or "bx_containers"
        hb_queue = "pd_aggregator_%s_%s" % (hb_topic,
                                            create_valid_identifier(self.container.id, dot_sub="_"))
        self.sub_cont = Subscriber(binding=hb_topic, from_name=hb_queue, auto_delete=True,
                                   callback=self._receive_container_info)
        self.sub_cont_gl = spawn(self.sub_cont.listen)
        self.sub_cont.get_ready_event().wait()

        # One subscriber covering container plus process lifecycle events
        self.evt_sub = EventSubscriber(event_type=OT.ContainerLifecycleEvent,
                                       callback=self._receive_event)
        self.evt_sub.add_event_subscription(event_type=OT.ProcessLifecycleEvent)
        self.evt_sub_gl = spawn(self.evt_sub.listen)
        self.evt_sub.get_ready_event().wait()

        log.info("PD Aggregator - event and heartbeat subscribers started")

    def stop(self):
        """Close both subscribers, then join and kill their listen greenlets."""
        self.sub_cont.close()
        self.evt_sub.close()

        # Reap listen greenlets in the order they were started
        for gl_attr in ("sub_cont_gl", "evt_sub_gl"):
            gl = getattr(self, gl_attr)
            gl.join(timeout=2)
            gl.kill()
            setattr(self, gl_attr, None)

    # -------------------------------------------------------------------------

    def _receive_container_info(self, msg, headers, *args):
        """Record a heartbeat message as a running container in the registry."""
        log.debug("Got container info %s %s %s", msg, headers, args)
        if isinstance(msg, ContainerHeartbeat):
            self.registry.register_container(msg.container_id, msg.ts,
                                             EE_STATE_RUNNING, msg.attributes)
        else:
            log.warn("Unknown container info format")

    def _receive_event(self, event, *args, **kwargs):
        """Update registry state from container and process lifecycle events."""
        log.debug("Got event %s %s %s", event, args, kwargs)
        if isinstance(event, ContainerLifecycleEvent):
            # Map event sub_type to the resulting container state; ignore others
            state_by_subtype = {"START": EE_STATE_RUNNING,
                                "TERMINATE": EE_STATE_TERMINATED}
            new_state = state_by_subtype.get(event.sub_type)
            if new_state is not None:
                self.registry.register_container(event.origin, event.ts_created,
                                                 new_state, {})

        elif isinstance(event, ProcessLifecycleEvent):
            proc_details = dict(state=event.state,
                                proc_type=event.process_type,
                                proc_name=event.process_name,
                                resource_id=event.process_resource_id,
                                service_name=event.service_name)
            self.registry.register_process(event.container_id, event.origin,
                                           proc_details)