Example #1
    def __init__(self, *args, **kwargs):
        BaseContainerAgent.__init__(self, *args, **kwargs)

        # Coordinates the container start
        self._status = INIT

        self._is_started = False
        # set container id and cc_agent name (as they are set in base class call)
        self.id = get_default_container_id()
        self.name = "cc_agent_%s" % self.id
        self.start_time = get_ion_ts()

        bootstrap.container_instance = self
        Container.instance = self
        self.container = self  # Make self appear as process to service clients
        self.CCAP = CCAP
        self.CFG = CFG

        log.debug("Container (sysname=%s) initializing ..." %
                  bootstrap.get_sys_name())

        # Keep track of the overrides from the command-line, so they can trump app/rel file data
        self.spawn_args = kwargs

        # Greenlet context-local storage
        self.context = LocalContextMixin()

        # Load general capabilities file and augment with specific profile
        self._load_capabilities()

        # Start the capabilities
        start_order = self.cap_profile['start_order']
        for cap in start_order:
            if cap not in self._cap_definitions:
                raise ContainerError(
                    "CC capability %s not defined in profile" % cap)
            if cap in self._capabilities or cap in self._cap_instances:
                raise ContainerError("CC capability %s already initialized" %
                                     cap)
            try:
                cap_def = self._cap_definitions[cap]
                log.debug("__init__(): Initializing '%s'" % cap)
                cap_obj = named_any(cap_def['class'])(container=self)
                self._cap_instances[cap] = cap_obj
                if 'depends_on' in cap_def and cap_def['depends_on']:
                    dep_list = cap_def['depends_on'].split(',')
                    for dep in dep_list:
                        dep = dep.strip()
                        if dep not in self._cap_initialized:
                            raise ContainerError(
                                "CC capability %s dependent on non-existing capability %s"
                                % (cap, dep))
                if 'field' in cap_def and cap_def['field']:
                    setattr(self, cap_def['field'], cap_obj)
                self._cap_initialized.append(cap)
            except Exception as ex:
                log.error("Container Capability %s init error: %s" % (cap, ex))
                raise

        log.debug("Container initialized, OK.")
Example #2
    def ensure_ready(self, proc, errmsg=None, timeout=10):
        """
        Waits until either the process dies or reports it is ready, whichever comes first.

        If the process dies or times out while waiting for it to be ready, a ContainerError is raised.
        Be sure the process implements get_ready_event properly; otherwise this method
        returns immediately, because the base class implementation simply passes.

        @param  proc        The process to wait on.
        @param  errmsg      A custom error message to put in the ContainerError's message. May be blank.
        @param  timeout     Time (in seconds) to wait for the process to become ready; default 10 seconds.
        @throws ContainerError  If the process dies or if we get a timeout before the process signals ready.
        """

        if isinstance(proc, PythonProcess):
            log.warn("ensure_ready does not yet work on PythonProcesses")
            return True

        if not errmsg:
            errmsg = "ensure_ready failed"

        ev = Event()

        def cb(*args, **kwargs):
            ev.set()

        # link either a greenlet failure due to exception OR a success via ready event
        proc.proc.link_exception(cb)
        proc.get_ready_event().rawlink(cb)

        retval = ev.wait(timeout=timeout)

        # unlink the callbacks: keeping the ready-event link is harmless, but we want to install our own exception link later
        proc.get_ready_event().unlink(cb)

        # if the process is stopped while we are waiting, proc.proc is set to None
        if proc.proc is not None:
            proc.proc.unlink(cb)

        # raise an exception if:
        # - we timed out
        # - we caught an exception
        if not retval:
            raise ContainerError("%s (timed out)" % errmsg)
        elif proc.proc is not None and proc.proc.dead and not proc.proc.successful():
            raise ContainerError("%s (failed): %s" % (errmsg, proc.proc.exception))
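
The wait-for-whichever-fires-first pattern used above can be reproduced standalone. A minimal sketch assuming gevent, where worker and ready are illustrative stand-ins for proc.proc and proc.get_ready_event():

    import gevent
    from gevent.event import Event

    ready = Event()                          # stands in for proc.get_ready_event()
    fired = Event()                          # set on either outcome

    def cb(*args, **kwargs):
        fired.set()

    worker = gevent.spawn(gevent.sleep, 1)   # stands in for proc.proc
    worker.link_exception(cb)                # failure path
    ready.rawlink(cb)                        # success path

    gevent.spawn_later(0.1, ready.set)       # simulate the process signalling ready
    if not fired.wait(timeout=10):
        raise RuntimeError("ensure_ready failed (timed out)")

    # unlink both callbacks, mirroring the cleanup in ensure_ready
    ready.unlink(cb)
    worker.unlink(cb)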
Example #3
    def _stop_capability(self, capability):
        if capability == "CONTAINER_AGENT":
            pass

        elif capability == "APP_MANAGER":
            self.app_manager.stop()

        elif capability == "PROC_MANAGER":
            self.proc_manager.stop()

        elif capability == "EXCHANGE_MANAGER":
            self.ex_manager.stop()

        elif capability == "EVENT_REPOSITORY":
            # close event repository (possible CouchDB connection)
            self.event_repository.close()

        elif capability == "STATE_REPOSITORY":
            # close state repository (possible CouchDB connection)
            self.state_repository.close()

        elif capability == "RESOURCE_REGISTRY":
            # close resource registry (possible CouchDB connection)
            self.resource_registry.close()

        elif capability == "DIRECTORY":
            # Unregister from directory
            self.directory.unregister_safe("/Containers/%s" % self.id,
                                           "Processes")
            self.directory.unregister_safe("/Containers", self.id)

            # Close directory (possible CouchDB connection)
            self.directory.close()

        elif capability == "DATASTORE_MANAGER":
            # close any open connections to datastores
            self.datastore_manager.stop()

        elif capability == "EXCHANGE_CONNECTION":
            self.node.client.close()
            self.ioloop.kill()
            self.node.client.ioloop.start()  # loop until connection closes
            # destroy AMQP connection

        elif capability == "GOVERNANCE_CONTROLLER":
            self.governance_controller.stop()

        elif capability == "PID_FILE":
            self._cleanup_pid()

        elif capability == "SFLOW_MANAGER":
            self.sflow_manager.stop()

        else:
            raise ContainerError("Cannot stop capability: %s" % capability)
Example #4
File: cc.py  Project: pkediyal/pyon
    def _stop_capability(self, capability):
        if capability == "CONTAINER_AGENT":
            pass

        elif capability == "APP_MANAGER":
            self.app_manager.stop()

        elif capability == "PROC_MANAGER":
            self.proc_manager.stop()

        elif capability == "EXCHANGE_MANAGER":
            self.ex_manager.stop()

        elif capability == "LOCAL_ROUTER":
            if self.local_router is not None:
                self.local_router.stop()

        elif capability == "EVENT_REPOSITORY":
            # close event repository (possible CouchDB connection)
            self.event_repository.close()
            self.event_pub.close()

        elif capability == "STATE_REPOSITORY":
            # close state repository (possible CouchDB connection)
            self.state_repository.close()

        elif capability == "RESOURCE_REGISTRY":
            # close resource registry (possible CouchDB connection)
            self.resource_registry.close()

        elif capability == "DIRECTORY":
            # Close directory (possible CouchDB connection)
            self.directory.close()

        elif capability == "DATASTORE_MANAGER":
            # close any open connections to datastores
            self.datastore_manager.stop()

        elif capability == "GOVERNANCE_CONTROLLER":
            self.governance_controller.stop()

        elif capability == "PID_FILE":
            self._cleanup_pid()

        elif capability == "SFLOW_MANAGER":
            self.sflow_manager.stop()

        else:
            raise ContainerError("Cannot stop capability: %s" % capability)
Example #5
    def start(self):
        # Check if this UNIX process already runs a Container.
        self.container.pidfile = "cc-pid-%d" % os.getpid()
        if os.path.exists(self.container.pidfile):
            raise ContainerError(
                "Container.on_start(): Container is a singleton per UNIX process. Existing pid file found: %s"
                % self.container.pidfile)

        # write out a PID file containing our agent messaging name
        with open(self.container.pidfile, 'w') as f:
            pid_contents = {
                'messaging': dict(CFG.server.amqp),
                'container-agent': self.container.name,
                'container-xp': bootstrap.get_sys_name()
            }
            f.write(msgpack.dumps(pid_contents))
            atexit.register(self.container._cleanup_pid)
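
To illustrate the on-disk format, the pid file written above can be read back with msgpack. A minimal sketch (the filename is illustrative; on newer msgpack versions raw=False decodes keys and values to str):

    import msgpack

    with open("cc-pid-12345", "rb") as f:
        pid_contents = msgpack.loads(f.read(), raw=False)

    print(pid_contents["container-agent"], pid_contents["container-xp"])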
Example #6
    def start(self):
        log.debug("Container starting...")
        if self._is_started:
            raise ContainerError("Container already started")

        start_order = self.cap_profile['start_order']
        for cap in start_order:
            if cap not in self._cap_instances:
                continue
            # First find the default enabled value if no CFG key exists
            enabled_default = self._cap_definitions.get_safe(
                "%s.enabled_default" % cap, True)
            # Then find CFG key where enabled flag is (default or override)
            enabled_config = self._cap_definitions.get_safe(
                "%s.enabled_config" % cap, "container.%s.enabled" % cap)
            # Then determine the enabled value
            enabled = CFG.get_safe(enabled_config, enabled_default)
            if enabled:
                log.debug("start(): Starting '%s'" % cap)
                try:
                    cap_obj = self._cap_instances[cap]
                    cap_obj.start()
                    self._capabilities.append(cap)
                except Exception as ex:
                    log.error("Container Capability %s start error: %s" %
                              (cap, ex))
                    raise
            else:
                log.debug("start(): Capability '%s' disabled by config '%s'",
                          cap, enabled_config)

        if self.has_capability(CCAP.EVENT_PUBLISHER):
            self.event_pub.publish_event(event_type="ContainerLifecycleEvent",
                                         origin=self.id,
                                         origin_type="CapabilityContainer",
                                         sub_type="START",
                                         state=ContainerStateEnum.START)

        self._is_started = True
        self._status = RUNNING

        log.info("Container (%s) started, OK.", self.id)
Example #7
    def _load_capabilities(self):
        self._cap_initialized = []  # List of capability constants initialized in container
        self._capabilities = []     # List of capability constants active in container
        self._cap_instances = {}    # Dict mapping capability -> manager instance

        self._cap_definitions = Config(
            ["res/config/container_capabilities.yml"]).data['capabilities']

        profile_filename = CFG.get_safe("container.profile", "development")
        if not profile_filename.endswith(".yml"):
            profile_filename = "res/profile/%s.yml" % profile_filename
        log.debug("Loading CC capability profile from file: %s",
                  profile_filename)
        profile_cfg = Config([profile_filename]).data
        if not isinstance(profile_cfg, dict) or profile_cfg.get('type') != "profile" \
                or "profile" not in profile_cfg:
            raise ContainerError("Container capability profile invalid: %s" %
                                 profile_filename)

        self.cap_profile = profile_cfg['profile']

        if "capabilities" in self.cap_profile and self.cap_profile[
                'capabilities']:
            dict_merge(self._cap_definitions, self.cap_profile['capabilities'],
                       True)

        CCAP.clear()
        cap_list = self._cap_definitions.keys()
        CCAP.update(zip(cap_list, cap_list))

        if "config" in self.cap_profile and self.cap_profile['config']:
            log.info("Container CFG was changed based on profile: %s",
                     profile_filename)
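
Taken together, the checks above require a profile file that parses to a mapping with type: profile and a profile section carrying at least start_order; capabilities and config are optional overrides. A hypothetical minimal profile, shown as the dict it would load into (entries are illustrative):

    profile_cfg = {
        "type": "profile",
        "profile": {
            "start_order": ["PID_FILE", "DATASTORE_MANAGER", "DIRECTORY",
                            "EXCHANGE_MANAGER", "PROC_MANAGER", "APP_MANAGER"],
            # optional per-capability overrides merged into the definitions
            "capabilities": {"SFLOW_MANAGER": {"enabled_default": False}},
            # optional CFG overrides applied when the profile is loaded
            "config": {"container": {"sflow": {"enabled": False}}},
        },
    }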
Example #8
    def start(self):
        log.debug("Container starting...")
        if self._is_started:
            raise ContainerError("Container already started")

        # Check if this UNIX process already runs a Container.
        self.pidfile = "cc-pid-%d" % os.getpid()
        if os.path.exists(self.pidfile):
            raise ContainerError("Container.on_start(): Container is a singleton per UNIX process. Existing pid file found: %s" % self.pidfile)

        # write out a PID file containing our agent messaging name
        with open(self.pidfile, 'w') as f:
            pid_contents = {'messaging': dict(CFG.server.amqp),
                            'container-agent': self.name,
                            'container-xp': bootstrap.get_sys_name() }
            f.write(msgpack.dumps(pid_contents))
            atexit.register(self._cleanup_pid)
            self._capabilities.append("PID_FILE")

        # set up abnormal termination handler for this container
        def handl(signum, frame):
            try:
                self._cleanup_pid()     # cleanup the pidfile first
                self.quit()             # now try to quit - will not error on second cleanup pidfile call
            finally:
                signal.signal(signal.SIGTERM, self._normal_signal)
                os.kill(os.getpid(), signal.SIGTERM)
        self._normal_signal = signal.signal(signal.SIGTERM, handl)

        self.datastore_manager.start()
        self._capabilities.append("DATASTORE_MANAGER")

        # Self-register with Directory
        self.directory.register("/Containers", self.id, cc_agent=self.name)
        self.directory.register("/Containers/%s" % self.id, "Processes")
        self._capabilities.append("DIRECTORY")

        # Event repository
        self.event_repository = EventRepository()
        self.event_pub = EventPublisher()

        self._capabilities.append("EVENT_REPOSITORY")

        # Local resource registry
        self.resource_registry = ResourceRegistry()
        self._capabilities.append("RESOURCE_REGISTRY")

        # Persistent objects
        self.datastore_manager.get_datastore("objects", DataStore.DS_PROFILE.OBJECTS)

        # State repository
        self.state_repository = StateRepository()
        self._capabilities.append("STATE_REPOSITORY")

        # Start ExchangeManager, which starts the node (broker connection)
        self.ex_manager.start()
        self._capabilities.append("EXCHANGE_MANAGER")

        self.proc_manager.start()
        self._capabilities.append("PROC_MANAGER")

        self.app_manager.start()
        self._capabilities.append("APP_MANAGER")

        self.governance_controller.start()
        self._capabilities.append("GOVERNANCE_CONTROLLER")

        if CFG.container.get('sflow', {}).get('enabled', False):
            self.sflow_manager.start()
            self._capabilities.append("SFLOW_MANAGER")

        # Start the CC-Agent API
        rsvc = ProcessRPCServer(node=self.node, from_name=self.name, service=self, process=self)

        # Start an ION process with the right kind of endpoint factory
        proc = self.proc_manager.proc_sup.spawn(name=self.name, listeners=[rsvc], service=self)
        self.proc_manager.proc_sup.ensure_ready(proc)
        self._capabilities.append("CONTAINER_AGENT")

        self.event_pub.publish_event(event_type="ContainerLifecycleEvent",
                                     origin=self.id, origin_type="CapabilityContainer",
                                     sub_type="START",
                                     state=ContainerStateEnum.START)

        self._is_started    = True
        self._status        = "RUNNING"

        log.info("Container started, OK.")
Example #9
File: process.py  Project: pkediyal/pyon
    def _control_flow(self):
        """
        Main process thread of execution method.

        This method runs inside a greenlet; one exists for each ION process. Listeners
        attached to the process, either RPC Servers or Subscribers, synchronize their
        calls to the process by enqueueing them via _routing_call.  This is all done
        automatically for you by the Container's Process Manager.

        This method blocks until there are calls waiting in the synchronized queue, and
        then makes them from within this greenlet.  Any exception raised is caught and re-raised
        in the greenlet that originally scheduled the call.  If successful, the AsyncResult
        created at scheduling time is set with the result of the call.
        """
        if self.name:
            svc_name = "unnamed-service"
            if self.service is not None and hasattr(self.service, 'name'):
                svc_name = self.service.name
            threading.current_thread().name = "%s-%s-ctrl" % (svc_name,
                                                              self.name)

        self._ready_control.set()

        for calltuple in self._ctrl_queue:
            calling_gl, ar, call, callargs, callkwargs, context = calltuple
            log.debug("control_flow making call: %s %s %s (has context: %s)",
                      call, callargs, callkwargs, context is not None)

            res = None
            start_proc_time = int(get_ion_ts())

            # check context for expiration
            if context is not None and 'reply-by' in context:
                if start_proc_time >= int(context['reply-by']):
                    log.info(
                        "control_flow: attempting to process message already exceeding reply-by, ignore"
                    )

                    # raise a timeout in the calling thread to allow endpoints to continue processing
                    e = IonTimeout(
                        "Reply-by time has already occurred (reply-by: %s, op start time: %s)"
                        % (context['reply-by'], start_proc_time))
                    calling_gl.kill(exception=e, block=False)

                    continue

            # also check whether ar is already set; if so, the call was cancelled
            if ar.ready():
                log.info(
                    "control_flow: attempting to process message that has been cancelled, ignore"
                )
                continue

            try:
                with self.service.push_context(context):
                    with self.service.container.context.push_context(context):
                        self._ctrl_current = ar
                        res = call(*callargs, **callkwargs)
            except OperationInterruptedException:
                # endpoint layer takes care of response as it's the one that caused this
                log.debug("Operation interrupted")
            except Exception as e:
                # raise the exception in the calling greenlet, and don't
                # wait for it to die - it's likely not going to do so.

                # try decorating the args of the exception with the true traceback
                # this should be reported by ThreadManager._child_failed
                exc = PyonThreadTraceback(
                    "IonProcessThread _control_flow caught an exception (call: %s, *args %s, **kwargs %s, context %s)\nTrue traceback captured by IonProcessThread' _control_flow:\n\n%s"
                    % (call, callargs, callkwargs, context,
                       traceback.format_exc()))
                e.args = e.args + (exc, )

                # HACK HACK HACK
                # we know that we only handle TypeError and IonException derived things, so only forward those if appropriate
                if isinstance(e, (TypeError, IonException)):
                    calling_gl.kill(exception=e, block=False)
                else:
                    # otherwise, swallow/record/report and hopefully we can continue on our way
                    self._errors.append(
                        (call, callargs, callkwargs, context, e, exc))

                    log.warn(exc)
                    log.warn("Attempting to continue...")

                    # have to raise something friendlier on the client side
                    calling_gl.kill(exception=ContainerError(str(exc)),
                                    block=False)
            finally:
                proc_time = int(get_ion_ts()) - start_proc_time
                self._proc_time += proc_time

                self._ctrl_current = None

            ar.set(res)
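
Stripped of pyon specifics, the docstring describes one consumer greenlet serializing calls scheduled by listener greenlets. A bare-bones sketch of that _routing_call/_control_flow handshake, assuming gevent (all names are illustrative):

    import gevent
    from gevent.event import AsyncResult
    from gevent.queue import Queue

    ctrl_queue = Queue()

    def routing_call(call, *args, **kwargs):
        # called from a listener greenlet: enqueue the call, block on the result
        ar = AsyncResult()
        ctrl_queue.put((gevent.getcurrent(), ar, call, args, kwargs))
        return ar.get(timeout=10)

    def control_flow():
        # single control greenlet: make the queued calls one at a time
        for calling_gl, ar, call, args, kwargs in ctrl_queue:
            try:
                ar.set(call(*args, **kwargs))
            except Exception as e:
                # re-raise in the greenlet that scheduled the call
                calling_gl.kill(exception=e, block=False)

    gevent.spawn(control_flow)
    caller = gevent.spawn(routing_call, lambda x: x * 2, 21)
    print(caller.get())  # 42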
Example #10
    def _control_flow(self):
        """
        Entry point for process control thread of execution.

        This method is run by the control greenlet for each ION process. Listeners attached
        to the process, either RPC Servers or Subscribers, synchronize calls to the process
        by placing call requests into the queue by calling _routing_call.

        This method blocks until there are calls waiting in the synchronized queue, and
        then makes them from within this greenlet.  Any exception raised is caught and re-raised
        in the greenlet that originally scheduled the call.  If successful, the AsyncResult
        created at scheduling time is set with the result of the call.
        """
        svc_name = getattr(self.service, "name", "unnamed-service") if self.service else "unnamed-service"
        proc_id = getattr(self.service, "id", "unknown-pid") if self.service else "unknown-pid"
        if self.name:
            threading.current_thread().name = "%s-%s" % (svc_name, self.name)
        thread_base_name = threading.current_thread().name

        self._ready_control.set()

        for calltuple in self._ctrl_queue:
            calling_gl, ar, call, callargs, callkwargs, context = calltuple
            request_id = (context or {}).get("request-id", None)
            if request_id:
                threading.current_thread().name = thread_base_name + "-" + str(request_id)
            #log.debug("control_flow making call: %s %s %s (has context: %s)", call, callargs, callkwargs, context is not None)

            res = None
            start_proc_time = get_ion_ts_millis()
            self._record_proc_time(start_proc_time)

            # check context for expiration
            if context is not None and 'reply-by' in context:
                if start_proc_time >= int(context['reply-by']):
                    log.info(
                        "control_flow: attempting to process message already exceeding reply-by, ignore"
                    )

                    # raise a timeout in the calling thread to allow endpoints to continue processing
                    e = IonTimeout(
                        "Reply-by time has already occurred (reply-by: %s, op start time: %s)"
                        % (context['reply-by'], start_proc_time))
                    calling_gl.kill(exception=e, block=False)

                    continue

            # If ar is already set, the call has been cancelled
            if ar.ready():
                log.info(
                    "control_flow: attempting to process message that has been cancelled, ignore"
                )
                continue

            init_db_stats()
            try:
                # ******************************************************************
                # ****** THIS IS WHERE THE RPC OPERATION/SERVICE CALL IS MADE ******

                with self.service.push_context(context), \
                     self.service.container.context.push_context(context):
                    self._ctrl_current = ar
                    res = call(*callargs, **callkwargs)

                # ****** END CALL, EXCEPTION HANDLING FOLLOWS                 ******
                # ******************************************************************

            except OperationInterruptedException:
                # endpoint layer takes care of response as it's the one that caused this
                log.debug("Operation interrupted")

            except Exception as e:
                if self._log_call_exception:
                    log.exception("PROCESS exception: %s" % e.message)

                # Raise the exception in the calling greenlet.
                # Try decorating the args of the exception with the true traceback -
                # this should be reported by ThreadManager._child_failed
                exc = PyonThreadTraceback(
                    "IonProcessThread _control_flow caught an exception "
                    "(call: %s, *args %s, **kwargs %s, context %s)\n"
                    "True traceback captured by IonProcessThread' _control_flow:\n\n%s"
                    % (call, callargs, callkwargs, context,
                       traceback.format_exc()))
                e.args = e.args + (exc, )

                if isinstance(e, (TypeError, IonException)):
                    # Pass through known process exceptions, in particular IonException
                    calling_gl.kill(exception=e, block=False)
                else:
                    # Otherwise, wrap unknown, forward and hopefully we can continue on our way
                    self._errors.append(
                        (call, callargs, callkwargs, context, e, exc))

                    log.warn(exc)
                    log.warn("Attempting to continue...")

                    # Note: too large an exception string will crash the container (when passed on as a msg header).
                    exception_str = str(exc)
                    if len(exception_str) > 10000:
                        exception_str = (
                            "Exception string representation too large. "
                            "Begin and end of the exception:\n" +
                            exception_str[:2000] + "\n...\n" +
                            exception_str[-2000:])
                    calling_gl.kill(exception=ContainerError(exception_str),
                                    block=False)
            finally:
                try:
                    # Compute statistics
                    self._compute_proc_stats(start_proc_time)

                    db_stats = get_db_stats()
                    if db_stats:
                        if self._warn_call_dbstmt_threshold > 0 and \
                                db_stats.get("count.all", 0) >= self._warn_call_dbstmt_threshold:
                            stats_str = ", ".join(
                                "{}={}".format(k, db_stats[k])
                                for k in sorted(db_stats.keys()))
                            log.warn(
                                "PROC_OP '%s.%s' EXCEEDED DB THRESHOLD. stats=%s",
                                svc_name, call.__name__, stats_str)
                        elif self._log_call_dbstats:
                            stats_str = ", ".join(
                                "{}={}".format(k, db_stats[k])
                                for k in sorted(db_stats.keys()))
                            log.info("PROC_OP '%s.%s' DB STATS: %s", svc_name,
                                     call.__name__, stats_str)
                    clear_db_stats()

                    if stats_callback:
                        stats_callback(proc_id=proc_id,
                                       proc_name=self.name,
                                       svc=svc_name,
                                       op=call.__name__,
                                       request_id=request_id,
                                       context=context,
                                       db_stats=db_stats,
                                       proc_stats=self.time_stats,
                                       result=res,
                                       exc=None)
                except Exception:
                    log.exception("Error computing process call stats")

                self._ctrl_current = None
                threading.current_thread().name = thread_base_name

            # Set response in AsyncEvent of caller (endpoint greenlet)
            ar.set(res)