def __init__(self, container):
    """
    Initialize the process manager state for the given container.

    Attaches spawn_process and terminate_process to the container under
    their own names, creates the (initially empty) process registries and
    builds the process supervisor.
    """
    self.container = container

    # Public API: bound methods that get attached to the container so they
    # can be invoked on the container directly
    self.container_api = [self.spawn_process, self.terminate_process]
    for api_call in self.container_api:
        setattr(self.container, api_call.__name__, api_call)

    self.proc_id_pool = IDPool()

    # Temporary registry of running processes
    self.procs_by_name = {}
    self.procs = {}

    # Maps greenlets we spawn to process instances, for error handling
    self._spawned_proc_to_process = {}

    # The pyon worker process supervisor; _spawned_proc_failed is handed
    # over as its failure notification callback
    heartbeat_secs = CFG.cc.timeout.heartbeat
    self.proc_sup = IonProcessThreadManager(heartbeat_secs=heartbeat_secs,
                                            failure_notify_callback=self._spawned_proc_failed)

    # Callbacks to invoke on process state changes
    self._proc_state_change_callbacks = []
def __init__(self, container):
    """
    Set up a new process manager bound to the given container.

    The container receives spawn_process and terminate_process as
    attributes; the process registries start out empty and a process
    supervisor is created (not started here).
    """
    self.container = container

    # Callables making up the Container public API contribution; each one
    # is set on the container using the callable's own __name__
    exported = [self.spawn_process, self.terminate_process]
    self.container_api = exported
    for func in exported:
        setattr(self.container, func.__name__, func)

    self.proc_id_pool = IDPool()

    # Temporary registry of running processes
    self.procs_by_name = {}
    self.procs = {}

    # Greenlet -> process instance mapping, used for error handling
    self._spawned_proc_to_process = {}

    # The pyon worker process supervisor
    supervisor_args = dict(heartbeat_secs=CFG.cc.timeout.heartbeat,
                           failure_notify_callback=self._spawned_proc_failed)
    self.proc_sup = IonProcessThreadManager(**supervisor_args)

    # Registered callbacks for process state changes
    self._proc_state_change_callbacks = []
def __init__(self, container):
    """
    Initialize the process manager state for the given container.

    Attaches spawn_process and terminate_process to the container, creates
    the empty process registries, resolves the effective execution engine
    and process dispatcher configuration, and builds the event publisher,
    the container level thread manager and the ION process supervisor.
    """
    self.container = container

    # Callables that become part of the Container public API; each is set
    # on the container under the callable's own name
    self.container_api = [self.spawn_process, self.terminate_process]
    for api_call in self.container_api:
        setattr(self.container, api_call.__name__, api_call)

    self.proc_id_pool = IDPool()

    # Registry of running processes
    self.procs = {}
    self.procs_by_name = {}    # BAD: This is not correct if procs have the same name

    # Maps greenlets we spawn to process instances, for error handling
    self._spawned_proc_to_process = {}

    # Effective execution engine config (after merging in child process overrides)
    self.ee_cfg = self._get_execution_engine_config()

    # Process dispatcher (if configured/enabled and not a child container process).
    # The is_child lookup is only evaluated when the dispatcher is configured as enabled.
    self.pd_cfg = CFG.get_safe("service.process_dispatcher") or {}
    pd_configured = self.pd_cfg.get("enabled", False) is True
    self.pd_enabled = pd_configured and not self.ee_cfg["container"]["is_child"]
    self.pd_core = None

    self.event_pub = EventPublisher()
    self.publish_events = CFG.get_safe("container.process.publish_events") is True

    # Passive manager for simple threads/greenlets, to keep them registered (these are not OS threads)
    # Note that each ION process has its own thread manager, so this is for container level threads
    self.thread_manager = ThreadManager(heartbeat_secs=None, failure_notify_callback=None)

    # Active supervisor for ION processes
    heartbeat_secs = CFG.get_safe("container.timeout.heartbeat")
    self.proc_sup = IonProcessThreadManager(heartbeat_secs=heartbeat_secs,
                                            failure_notify_callback=self._spawned_proc_failed)

    # Callbacks to invoke on process state changes
    self._proc_state_change_callbacks = []
class ProcManager(object): def __init__(self, container): self.container = container # Define the callables that can be added to Container public API, and add self.container_api = [self.spawn_process, self.terminate_process] for call in self.container_api: setattr(self.container, call.__name__, call) self.proc_id_pool = IDPool() # Registry of running processes self.procs = {} self.procs_by_name = {} # BAD: This is not correct if procs have the same name # mapping of greenlets we spawn to process_instances for error handling self._spawned_proc_to_process = {} # Effective execution engine config (after merging in child process overrides) self.ee_cfg = self._get_execution_engine_config() # Process dispatcher (if configured/enabled and not a child container process) self.pd_cfg = CFG.get_safe("service.process_dispatcher") or {} self.pd_enabled = self.pd_cfg.get("enabled", False) is True and not self.ee_cfg["container"]["is_child"] self.pd_core = None self.event_pub = EventPublisher() self.publish_events = CFG.get_safe("container.process.publish_events") is True # Passive manager for simple threads/greenlets, to keep them registered (these are not OS threads) # Note that each ION process has its own thread manager, so this is for container level threads self.thread_manager = ThreadManager(heartbeat_secs=None, failure_notify_callback=None) # Active supervisor for ION processes self.proc_sup = IonProcessThreadManager(heartbeat_secs=CFG.get_safe("container.timeout.heartbeat"), failure_notify_callback=self._spawned_proc_failed) # list of callbacks for process state changes self._proc_state_change_callbacks = [] def start(self): log.debug("ProcManager starting ...") if self.pd_enabled: self._start_process_dispatcher() self.proc_sup.start() if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY): # Register container as resource object cc_obj = self._get_capability_container_object() self.cc_id, _ = self.container.resource_registry.create(cc_obj) # Create an 
association to an Org object if not the rot ION org and only if found if CFG.get_safe("container.org_name") != CFG.get_safe("system.root_org"): org, _ = self.container.resource_registry.find_resources( restype=RT.Org, name=CFG.get_safe("container.org_name"), id_only=True) if org: self.container.resource_registry.create_association(org[0], PRED.hasResource, self.cc_id) # TODO - replace with proper association log.debug("ProcManager started, OK.") def stop(self): log.debug("ProcManager stopping ...") # Call quit on procs to give them ability to clean up in reverse order procs_list = sorted(self.procs.values(), key=lambda proc: proc._proc_start_time, reverse=True) for proc in procs_list: try: self.terminate_process(proc.id) except Exception as ex: log.warn("Failed to terminate process (%s): %s", proc.id, ex) # TODO: Have a choice of shutdown behaviors for waiting on children, timeouts, etc self.proc_sup.shutdown(CFG.get_safe("container.timeout.shutdown")) if self.procs: log.warn("ProcManager procs not empty: %s", self.procs) if self.procs_by_name: log.warn("ProcManager procs_by_name not empty: %s", self.procs_by_name) # Remove Resource registration if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY): try: self.container.resource_registry.delete(self.cc_id, del_associations=True) except NotFound: # already gone, this is ok pass if self.pd_enabled: self._stop_process_dispatcher() log.debug("ProcManager stopped, OK.") def _get_execution_engine_config(self): ee_base_cfg = CFG.get_safe("container.execution_engine") or {} if ee_base_cfg.get("type", None) != "scioncc": raise ContainerConfigError("Execution engine config invalid: %s", ee_base_cfg) ee_cfg = deepcopy(ee_base_cfg) # If we are a child process, merge in child config override proc_name = multiprocessing.current_process().name ee_cfg["container"] = dict(child_proc_name=proc_name, is_child=False) child_cfgs = ee_base_cfg.get("child_configs", None) or {} if proc_name.startswith("Container-child-"): 
ee_cfg["container"]["is_child"] = True if proc_name in child_cfgs: log.info("Applying execution engine config override for child: %s", proc_name) dict_merge(ee_cfg, child_cfgs[proc_name], inplace=True) else: for cfg_name, ch_cfg in child_cfgs.iteritems(): pattern = ch_cfg.get("name_pattern", None) if pattern and re.match(pattern, proc_name): log.info("Applying execution engine config override %s for child: %s", cfg_name, proc_name) dict_merge(ee_cfg, ch_cfg, inplace=True) break ee_cfg.pop("child_configs", None) return ee_cfg def _get_capability_container_object(self): container_info = dict(proc_name=multiprocessing.current_process().name, process_id=os.getpid(), parent_process_id=os.getppid(), hostname=socket.gethostname(), host=socket.gethostbyname(socket.gethostname()), platform=sys.platform, argv=sys.argv, python_version=sys.version, cwd=os.getcwd(), start_time=self.container.start_time, ) # Other possibilities: username, os package versions, IP address host_info = {k: v for (k, v) in zip(("os_sysname", "os_nodename", "os_release", "os_version", "os_machine"), os.uname())} container_info.update(host_info) container_info["env"] = {k: str(v) for (k,v) in os.environ.iteritems()} container_info["python_path"] = sys.path cc_obj = CapabilityContainer(name=self.container.id, version=self.container.version, cc_agent=self.container.name, container_info=container_info, execution_engine_config=self.ee_cfg) return cc_obj # ----------------------------------------------------------------- def spawn_process(self, name=None, module=None, cls=None, config=None, process_id=None): """ Spawn a process within the container. Processes can be of different type. """ if process_id and not is_valid_identifier(process_id, ws_sub='_'): raise BadRequest("Given process_id %s is not a valid identifier" % process_id) # PROCESS ID. 
Generate a new process id if not provided # TODO: Ensure it is system-wide unique process_id = process_id or "%s.%s" % (self.container.id, self.proc_id_pool.get_id()) log.debug("ProcManager.spawn_process(name=%s, module.cls=%s.%s, config=%s) as pid=%s", name, module, cls, config, process_id) # CONFIG process_cfg = self._create_process_config(config) try: service_cls = named_any("%s.%s" % (module, cls)) except AttributeError as ae: # Try to nail down the error import importlib importlib.import_module(module) raise # PROCESS TYPE. Determines basic process context (messaging, service interface) process_type = get_safe(process_cfg, "process.type") or getattr(service_cls, "process_type", PROCTYPE_SERVICE) process_start_mode = get_safe(config, "process.start_mode") process_instance = None # alert we have a spawning process, but we don't have the instance yet, so give the class instead (more accurate than name) # Note: this uses a str as first argument instead of a process instance self._call_proc_state_changed("%s.%s" % (module, cls), ProcessStateEnum.PENDING) try: # Additional attributes to set with the process instance proc_attr = {"_proc_type": process_type, "_proc_spawn_cfg": config } # SPAWN. 
Determined by type if process_type == PROCTYPE_SERVICE: process_instance = self._spawn_service_process(process_id, name, module, cls, process_cfg, proc_attr) elif process_type == PROCTYPE_STREAMPROC: process_instance = self._spawn_stream_process(process_id, name, module, cls, process_cfg, proc_attr) elif process_type == PROCTYPE_AGENT: process_instance = self._spawn_agent_process(process_id, name, module, cls, process_cfg, proc_attr) elif process_type == PROCTYPE_STANDALONE: process_instance = self._spawn_standalone_process(process_id, name, module, cls, process_cfg, proc_attr) elif process_type == PROCTYPE_IMMEDIATE: process_instance = self._spawn_immediate_process(process_id, name, module, cls, process_cfg, proc_attr) elif process_type == PROCTYPE_SIMPLE: process_instance = self._spawn_simple_process(process_id, name, module, cls, process_cfg, proc_attr) else: raise BadRequest("Unknown process type: %s" % process_type) # REGISTER. self._register_process(process_instance, name) process_instance.errcause = "OK" log.info("ProcManager.spawn_process: %s.%s -> pid=%s OK", module, cls, process_id) if process_type == PROCTYPE_IMMEDIATE: log.debug('Terminating immediate process: %s', process_instance.id) self.terminate_process(process_instance.id) # Terminate process also triggers TERMINATING/TERMINATED self._call_proc_state_changed(process_instance, ProcessStateEnum.EXITED) else: # Update local policies for the new process if self.container.has_capability(self.container.CCAP.GOVERNANCE_CONTROLLER): self.container.governance_controller.update_process_policies( process_instance, safe_mode=True, force_update=False) return process_instance.id except IonProcessError: errcause = process_instance.errcause if process_instance else "instantiating process" log.exception("Error spawning %s %s process (process_id: %s): %s", name, process_type, process_id, errcause) return None except Exception: errcause = process_instance.errcause if process_instance else "instantiating process" 
log.exception("Error spawning %s %s process (process_id: %s): %s", name, process_type, process_id, errcause) # trigger failed notification - catches problems in init/start self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise def _create_process_config(self, config): """ Prepare the config for the new process. Clone system config and apply process overrides. Support including config by reference of a resource attribute or object from object store. """ process_cfg = deepcopy(CFG) if config: # Use provided config. Must be dict or DotDict if not isinstance(config, DotDict): config = DotDict(config) if config.get_safe("process.config_ref"): # Use a reference config_ref = config.get_safe("process.config_ref") log.info("Enhancing new process spawn config from ref=%s" % config_ref) matches = re.match(r'^([A-Za-z]+):([A-Za-z0-9_\.]+)/(.*)$', config_ref) if matches: ref_type, ref_id, ref_ext = matches.groups() if ref_type == "resources": if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY): try: obj = self.container.resource_registry.read(ref_id) if obj and hasattr(obj, ref_ext): ref_config = getattr(obj, ref_ext) if isinstance(ref_config, dict): dict_merge(process_cfg, ref_config, inplace=True) else: raise BadRequest("config_ref %s exists but not dict" % config_ref) else: raise BadRequest("config_ref %s - attribute not found" % config_ref) except NotFound as nf: log.warn("config_ref %s - object not found" % config_ref) raise else: log.error("Container missing RESOURCE_REGISTRY capability to resolve process config ref %s" % config_ref) elif ref_type == "objects": if self.container.has_capability(self.container.CCAP.OBJECT_STORE): try: obj = self.container.object_store.read_doc(ref_id) ref_config = obj if ref_ext: ref_config = get_safe(obj, ref_ext, None) if ref_config is None: raise BadRequest("config_ref %s - attribute not found" % config_ref) if isinstance(ref_config, dict): dict_merge(process_cfg, ref_config, inplace=True) 
else: raise BadRequest("config_ref %s exists but not dict" % config_ref) except NotFound as nf: log.warn("config_ref %s - object not found" % config_ref) raise else: log.error("Container missing OBJECT_STORE capability to resolve process config ref %s" % config_ref) else: raise BadRequest("Unknown reference type in: %s" % config_ref) dict_merge(process_cfg, config, inplace=True) if self.container.spawn_args: # Override config with spawn args dict_merge(process_cfg, self.container.spawn_args, inplace=True) #log.debug("spawn_process() pid=%s process_cfg=%s", process_id, process_cfg) return process_cfg def list_local_processes(self, process_type=''): """ Returns a list of the running ION processes in the container or filtered by the process_type """ if not process_type: return self.procs.values() return [p for p in self.procs.itervalues() if p.process_type == process_type] def get_a_local_process(self, proc_name=''): """ Returns a running ION process in the container for the specified process name """ for p in self.procs.itervalues(): if p.name == proc_name: return p if p.process_type == PROCTYPE_AGENT and p.resource_type == proc_name: return p return None def get_local_service_processes(self, service_name=''): """ Returns a list of running ION processes in the container for the specified service name """ proc_list = [p for p in self.procs.itervalues() if p.process_type == PROCTYPE_SERVICE and p.name == service_name] return proc_list def is_local_service_process(self, service_name): local_services = self.list_local_processes(PROCTYPE_SERVICE) for p in local_services: if p.name == service_name: return True return False def is_local_agent_process(self, resource_type): local_agents = self.list_local_processes(PROCTYPE_AGENT) for p in local_agents: if p.resource_type == resource_type: return True return False def _spawned_proc_failed(self, gproc): log.error("ProcManager._spawned_proc_failed: %s, %s", gproc, gproc.exception) prc = self._spawned_proc_to_process.get(gproc, 
None) # stop the rest of the process if prc is not None: try: self.terminate_process(prc.id, False) except Exception as e: log.warn("Problem while stopping rest of failed process %s: %s", prc, e) finally: self._call_proc_state_changed(prc, ProcessStateEnum.FAILED) else: log.warn("No ION process found for failed proc manager child: %s", gproc) # Stop the container if this was the last process if not self.procs and CFG.get_safe("container.process.exit_once_empty", False): self.container.fail_fast("Terminating container after last process (%s) failed: %s" % (gproc, gproc.exception)) def add_proc_state_changed_callback(self, cb): """ Adds a callback to be called when a process' state changes. The callback should take three parameters: The process, the state, and the container. """ self._proc_state_change_callbacks.append(cb) def remove_proc_state_changed_callback(self, cb): """ Removes a callback from the process state change callback list. If the callback is not registered, this method does nothing. """ if cb in self._proc_state_change_callbacks: self._proc_state_change_callbacks.remove(cb) def _call_proc_state_changed(self, svc, state): """ Internal method to call all registered process state change callbacks. """ #log.debug("Proc State Changed (%s): %s", ProcessStateEnum._str_map.get(state, state), svc) for cb in self._proc_state_change_callbacks: cb(svc, state, self.container) # Trigger event if self.publish_events: self._publish_process_event(svc, state) def _create_listening_endpoint(self, **kwargs): """ Creates a listening endpoint for spawning processes. This method exists to be able to override the type created via configuration. In most cases it will create a ProcessRPCServer. 
""" eptypestr = CFG.get_safe('container.messaging.endpoint.proc_listening_type', None) if eptypestr is not None: module, cls = eptypestr.rsplit('.', 1) mod = __import__(module, fromlist=[cls]) eptype = getattr(mod, cls) ep = eptype(**kwargs) else: ep = ProcessRPCServer(**kwargs) return ep def _add_process_publishers(self, process_instance, config): # Add publishers if declared... publish_streams = get_safe(config, "process.publish_streams") pub_names = self._set_publisher_endpoints(process_instance, publish_streams) return pub_names # ----------------------------------------------------------------- # PROCESS TYPE: service # - has service listen binding/queue and RPC interface def _spawn_service_process(self, process_id, name, module, cls, config, proc_attr): """ Spawn a process acting as a service worker. Attach to service queue with service definition, attach to service pid """ process_instance = self._create_app_instance(process_id, name, module, cls, config, proc_attr) listen_name = get_safe(config, "process.listen_name") or process_instance.name listen_name_xo = self.container.create_service_xn(listen_name) log.debug("Service Process (%s) listen_name: %s", name, listen_name) process_instance._proc_listen_name = listen_name # Service RPC endpoint rsvc1 = self._create_listening_endpoint(node=self.container.node, from_name=listen_name_xo, process=process_instance) # Start an ION process with the right kind of endpoint factory proc = self.proc_sup.spawn(name=process_instance.id, service=process_instance, listeners=[rsvc1], proc_name=process_instance._proc_name) proc.proc._glname = "ION Proc %s" % process_instance._proc_name self.proc_sup.ensure_ready(proc, "_spawn_service_process for %s" % ",".join((str(listen_name), process_instance.id))) # map gproc to process_instance self._spawned_proc_to_process[proc.proc] = process_instance # set service's reference to process process_instance._process = proc self._process_init(process_instance) 
self._process_start(process_instance) try: proc.start_listeners() except IonProcessError: self._process_quit(process_instance) self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: stream process # - has stream listen binding/queue # - has publishers if declared def _spawn_stream_process(self, process_id, name, module, cls, config, proc_attr): """ Spawn a process acting as a data stream process. Attach to subscription queue with process function. """ process_instance = self._create_app_instance(process_id, name, module, cls, config, proc_attr) listeners = [] # Stream listener listen_name = get_safe(config, "process.listen_name") or name log.debug("Stream Process (%s) listen_name: %s", name, listen_name) process_instance._proc_listen_name = listen_name process_instance.stream_subscriber = StreamSubscriber(process=process_instance, exchange_name=listen_name, callback=process_instance.call_process) listeners.append(process_instance.stream_subscriber) pub_names = self._add_process_publishers(process_instance, config) # Private PID listener # pid_listener_xo = self.container.create_process_xn(process_instance.id) # rsvc = self._create_listening_endpoint(node=self.container.node, # from_name=pid_listener_xo, # process=process_instance) # listeners.append(rsvc) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) def cleanup(*args): for name in pub_names: p = getattr(process_instance, name) p.close() proc = self.proc_sup.spawn(name=process_instance.id, service=process_instance, listeners=listeners, proc_name=process_instance._proc_name, cleanup_method=cleanup) proc.proc._glname = "ION Proc %s" % process_instance._proc_name self.proc_sup.ensure_ready(proc, "_spawn_stream_process for %s" % process_instance._proc_name) # map gproc to process_instance self._spawned_proc_to_process[proc.proc] = process_instance # 
set service's reference to process process_instance._process = proc self._process_init(process_instance) self._process_start(process_instance) try: proc.start_listeners() except IonProcessError: self._process_quit(process_instance) self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: agent # - has resource ID (or if non-existent PID) listen binding/queue # - has RPC interface def _spawn_agent_process(self, process_id, name, module, cls, config, proc_attr): """ Spawn a process acting as agent process. Attach to service pid. """ process_instance = self._create_app_instance(process_id, name, module, cls, config, proc_attr) if not isinstance(process_instance, ResourceAgent): raise ContainerConfigError("Agent process must extend ResourceAgent") listeners = [] # Set the resource ID if we get it through the config resource_id = get_safe(process_instance.CFG, "agent.resource_id") if resource_id: process_instance.resource_id = resource_id # Resource ID listener resource_id_xo = self.container.create_process_xn(resource_id) alistener = self._create_listening_endpoint(node=self.container.node, from_name=resource_id_xo, process=process_instance) listeners.append(alistener) else: # Private PID listener pid_listener_xo = self.container.create_process_xn(process_instance.id) rsvc = self._create_listening_endpoint(node=self.container.node, from_name=pid_listener_xo, process=process_instance) listeners.append(rsvc) proc = self.proc_sup.spawn(name=process_instance.id, service=process_instance, listeners=listeners, proc_name=process_instance._proc_name) proc.proc._glname = "ION Proc %s" % process_instance._proc_name self.proc_sup.ensure_ready(proc, "_spawn_agent_process for %s" % process_instance.id) # map gproc to process_instance self._spawned_proc_to_process[proc.proc] = process_instance # set service's reference to process process_instance._process 
= proc # Now call the on_init of the agent. self._process_init(process_instance) if not process_instance.resource_id: log.warn("New agent pid=%s has no resource_id set" % process_id) self._process_start(process_instance) try: proc.start_listeners() except IonProcessError: self._process_quit(process_instance) self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise if not process_instance.resource_id: log.warn("Agent process id=%s does not define resource_id!!" % process_instance.id) return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: standalone # - has PID binding/queue with RPC interface # - has publishers if declared def _spawn_standalone_process(self, process_id, name, module, cls, config, proc_attr): """ Spawn a process acting as standalone process. Attach to service pid. """ process_instance = self._create_app_instance(process_id, name, module, cls, config, proc_attr) # Private PID listener pid_listener_xo = self.container.create_process_xn(process_instance.id, auto_delete=True) rsvc = self._create_listening_endpoint(node=self.container.node, from_name=pid_listener_xo, process=process_instance) pub_names = self._add_process_publishers(process_instance, config) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) def cleanup(*args): for name in pub_names: p = getattr(process_instance, name) p.close() proc = self.proc_sup.spawn(name=process_instance.id, service=process_instance, listeners=[rsvc], proc_name=process_instance._proc_name, cleanup_method=cleanup) proc.proc._glname = "ION Proc %s" % process_instance._proc_name self.proc_sup.ensure_ready(proc, "_spawn_standalone_process for %s" % process_instance.id) # map gproc to process_instance self._spawned_proc_to_process[proc.proc] = process_instance # set service's reference to process process_instance._process = proc self._process_init(process_instance) self._process_start(process_instance) try: 
proc.start_listeners() except IonProcessError: self._process_quit(process_instance) self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: simple # - has publishers if declared def _spawn_simple_process(self, process_id, name, module, cls, config, proc_attr): """ Spawn a process acting as simple process. No attachments. """ process_instance = self._create_app_instance(process_id, name, module, cls, config, proc_attr) pub_names = self._add_process_publishers(process_instance, config) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) def cleanup(*args): for name in pub_names: p = getattr(process_instance, name) p.close() proc = self.proc_sup.spawn(name=process_instance.id, service=process_instance, listeners=[], proc_name=process_instance._proc_name, cleanup_method=cleanup) proc.proc._glname = "ION Proc %s" % process_instance._proc_name self.proc_sup.ensure_ready(proc, "_spawn_simple_process for %s" % process_instance.id) # map gproc to process_instance self._spawned_proc_to_process[proc.proc] = process_instance # set service's reference to process process_instance._process = proc self._process_init(process_instance) self._process_start(process_instance) return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: immediate # - will not be registered # - will be terminated right after start def _spawn_immediate_process(self, process_id, name, module, cls, config, proc_attr): """ Spawn a process acting as immediate one off process. No messaging attachments. 
""" process_instance = self._create_app_instance(process_id, name, module, cls, config, proc_attr) self._process_init(process_instance) self._process_start(process_instance) return process_instance # ----------------------------------------------------------------- def _create_app_instance(self, process_id, name, module, cls, config, proc_attr): """ Creates an instance of a BaseService, representing the app logic of a ION process. This is independent of the process type service, agent, standalone, etc. """ # APP INSTANCE. app_instance = for_name(module, cls) if not isinstance(app_instance, BaseService): raise ContainerConfigError("Instantiated service not a BaseService %r" % app_instance) # Set BaseService instance common attributes app_instance.errcause = "" app_instance.id = process_id app_instance.container = self.container app_instance.CFG = config app_instance._proc_name = name app_instance._proc_start_time = time.time() for att, att_val in proc_attr.iteritems(): setattr(app_instance, att, att_val) # Unless the process has been started as part of another Org, default to the container Org or the ION Org if 'org_governance_name' in config: app_instance.org_governance_name = config['org_governance_name'] else: app_instance.org_governance_name = CFG.get_safe('container.org_name', CFG.get_safe('system.root_org', 'ION')) # Add process state management, if applicable self._add_process_state(app_instance) # Check dependencies (RPC clients) self._check_process_dependencies(app_instance) return app_instance def _add_process_state(self, process_instance): """ Add stateful process operations, if applicable """ # Only applies if the process implements stateful interface if hasattr(process_instance, "_flush_state"): def _flush_state(): with process_instance._state_lock: state_obj = process_instance.container.state_repository.put_state(process_instance.id, process_instance._proc_state, state_obj=process_instance._proc_state_obj) state_obj.state = None # Make sure memory 
footprint is low for larger states process_instance._proc_state_obj = state_obj process_instance._proc_state_changed = False def _load_state(): if not hasattr(process_instance, "_proc_state"): process_instance._proc_state = {} try: with process_instance._state_lock: new_state, state_obj = process_instance.container.state_repository.get_state(process_instance.id) process_instance._proc_state.clear() process_instance._proc_state.update(new_state) process_instance._proc_state_obj = state_obj process_instance._proc_state_changed = False except NotFound as nf: log.debug("No persisted state available for process %s", process_instance.id) except Exception as ex: log.warn("Process %s load state failed: %s", process_instance.id, str(ex)) process_instance._flush_state = _flush_state process_instance._load_state = _load_state process_instance._state_lock = RLock() process_instance._proc_state = {} process_instance._proc_state_obj = None process_instance._proc_state_changed = False # PROCESS RESTART: Need to check whether this process had persisted state. # Note: This could happen anytime during a system run, not just on RESTART boot log.debug("Loading persisted state for process %s", process_instance.id) process_instance._load_state() def _check_process_dependencies(self, app_instance): app_instance.errcause = "setting service dependencies" log.debug("spawn_process dependencies: %s", app_instance.dependencies) # TODO: Service dependency != process dependency for dependency in app_instance.dependencies: client = getattr(app_instance.clients, dependency) assert client, "Client for dependency not found: %s" % dependency # @TODO: should be in a start_client in RPCClient chain client.process = app_instance client.node = self.container.node # Ensure that dep actually exists and is running? 
def _process_init(self, process_instance):
    """
    Initialize the process by calling its init() (per the original note,
    this primarily results in on_init() being run).

    @param process_instance  The process instance to initialize. Its errcause
                             is set first so a failure can be attributed.
    """
    process_instance.errcause = "initializing service"
    process_instance.init()

def _process_start(self, process_instance):
    """
    Start the process by calling its start() (per the original note,
    this primarily results in on_start() being run).

    @param process_instance  The process instance to start.
    """
    # Open questions carried over from the original author:
    # Should this be after spawn_process?
    # Should we check for timeout?
    process_instance.errcause = "starting service"
    process_instance.start()

def _process_quit(self, process_instance):
    """
    Common method to handle process stopping.

    Calls quit() on the process instance and then, if the instance has an
    attached `_process` (not every process has one, e.g. simple processes),
    notifies it to stop and stops it.

    @param process_instance  The process instance to quit.
    """
    process_instance.errcause = "quitting process"

    # Give the process notice to quit doing stuff.
    process_instance.quit()

    # Terminate IonProcessThread (may not have one, i.e. simple process)
    # @TODO: move this into process' on_quit()
    ion_proc = getattr(process_instance, '_process', None)
    if ion_proc:
        ion_proc.notify_stop()
        ion_proc.stop()

def _set_publisher_endpoints(self, process_instance, publisher_streams=None):
    """
    Creates and attaches named stream publishers.

    For every (name, stream_id) pair a StreamPublisher is created and set as
    attribute `name` on the process instance.

    @param process_instance   The process instance receiving the publishers.
    @param publisher_streams  Mapping of attribute name -> stream id. None or
                              empty means no publishers are created.
    @return  List of the attribute names that were set.
    """
    publisher_streams = publisher_streams or {}
    names = []
    # items() instead of iteritems(): same pairs on Python 2, and does not
    # break on interpreters where dict has no iteritems().
    # NOTE(review): the original author flagged this loop with "problem is here"
    # without further detail.
    for name, stream_id in publisher_streams.items():
        pub = StreamPublisher(process=process_instance, stream_id=stream_id)
        setattr(process_instance, name, pub)
        names.append(name)
    return names

def _register_process(self, process_instance, name):
    """
    Performs all actions related to registering the new process in the system.
    Also performs process type specific registration, such as for services and agents.

    @param process_instance  The spawned process instance. Must already have
                             _proc_type set.
    @param name              The name the process is registered under locally.
    """
    # Add process instance to container's process dict
    if name in self.procs_by_name:
        log.warn("Process name already registered in container: %s", name)
    self.procs_by_name[name] = process_instance
    self.procs[process_instance.id] = process_instance

    # Add Process to resource registry
    process_instance.errcause = "registering"

    if process_instance._proc_type != PROCTYPE_IMMEDIATE:
        if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
            proc_obj = Process(
                name=process_instance.id,
                label=name,
                process_type=process_instance._proc_type,
                service_name=getattr(process_instance, "name", None) or "",
                process_state=ProcessStateEnum.RUNNING)
            proc_id, _ = self.container.resource_registry.create(proc_obj)
            # Remember the resource id before associating, so that it is
            # available for cleanup even if the association fails.
            process_instance._proc_res_id = proc_id

            # Associate process with container resource
            self.container.resource_registry.create_association(
                self.cc_id, PRED.hasProcess, proc_id)
    else:
        process_instance._proc_res_id = None

    # Process type specific registration
    if process_instance._proc_type == PROCTYPE_SERVICE:
        if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
            # Registration of SERVICE process: in resource registry
            service_list, _ = self.container.resource_registry.find_resources(
                restype=RT.Service, name=process_instance.name, id_only=True)
            if service_list:
                process_instance._proc_svc_id = service_list[0]
                if len(service_list) > 1:
                    log.warn("More than 1 Service resource found with name %s: %s",
                             process_instance.name, service_list)
            else:
                # We are starting the first process of a service instance
                # TODO: This should be created by the HA Service agent in the future
                svc_obj = Service(
                    name=process_instance.name,
                    exchange_name=process_instance._proc_listen_name,
                    state=ServiceStateEnum.READY)
                process_instance._proc_svc_id, _ = self.container.resource_registry.create(svc_obj)

                # Create association to service definition resource
                svcdef_list, _ = self.container.resource_registry.find_resources(
                    restype=RT.ServiceDefinition, name=process_instance.name, id_only=True)
                if svcdef_list:
                    if len(svcdef_list) > 1:
                        log.warn("More than 1 ServiceDefinition resource found with name %s: %s",
                                 process_instance.name, svcdef_list)
                    self.container.resource_registry.create_association(
                        process_instance._proc_svc_id, PRED.hasServiceDefinition, svcdef_list[0])
                else:
                    log.error("Cannot find ServiceDefinition resource for %s", process_instance.name)

            # Link the (found or newly created) service to this process
            self.container.resource_registry.create_association(
                process_instance._proc_svc_id, PRED.hasProcess, proc_id)

    elif process_instance._proc_type == PROCTYPE_AGENT:
        if self.container.has_capability(self.container.CCAP.DIRECTORY):
            # Registration of AGENT process: in Directory
            caps = process_instance.get_capabilities()
            self.container.directory.register(
                "/Agents", process_instance.id,
                **dict(name=process_instance._proc_name,
                       container=process_instance.container.id,
                       resource_id=process_instance.resource_id,
                       agent_id=process_instance.agent_id,
                       def_id=process_instance.agent_def_id,
                       capabilities=caps))

    self._call_proc_state_changed(process_instance, ProcessStateEnum.RUNNING)

def terminate_process(self, process_id, do_notifications=True):
    """
    Terminates a process and all its resources. Termination is graceful with timeout.

    @param process_id        The id of the process to terminate. Should exist in the container's
                             list of processes or this will raise.
    @param do_notifications  If True, emits process state changes for TERMINATING and TERMINATED.
                             If False, suppresses any state changes. Used near EXITED and FAILED.
    @throws BadRequest  if process_id is not a known process of this container.
    """
    process_instance = self.procs.get(process_id, None)
    if not process_instance:
        raise BadRequest("Cannot terminate. Process id='%s' unknown on container id='%s'" % (
            process_id, self.container.id))

    log.info("ProcManager.terminate_process: %s -> pid=%s", process_instance._proc_name, process_id)

    if do_notifications:
        self._call_proc_state_changed(process_instance, ProcessStateEnum.TERMINATING)

    self._process_quit(process_instance)
    self._unregister_process(process_id, process_instance)

    if do_notifications:
        self._call_proc_state_changed(process_instance, ProcessStateEnum.TERMINATED)

def _unregister_process(self, process_id, process_instance):
    """
    Removes all registrations of a process: its Process resource, process
    type specific registrations (service resource, agent directory entry)
    and the container-local registry entries.

    @param process_id        Key of the process in self.procs.
    @param process_instance  The process instance being unregistered.
    """
    # Remove process registration in resource registry.
    # getattr: _register_process does not set _proc_res_id for a non-immediate
    # process when the container has no resource registry capability.
    proc_res_id = getattr(process_instance, "_proc_res_id", None)
    if proc_res_id and self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
        try:
            self.container.resource_registry.delete(proc_res_id, del_associations=True)
        except NotFound:
            # OK if already gone
            pass
        except Exception as ex:
            # Best effort: log and continue with the remaining cleanup
            log.exception(ex)

    # Cleanup for specific process types
    if process_instance._proc_type == PROCTYPE_SERVICE:
        if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
            # Check if this is the last process for this service and do auto delete service resources here
            svcproc_list, _ = self.container.resource_registry.find_objects(
                process_instance._proc_svc_id, PRED.hasProcess, RT.Process, id_only=True)
            if not svcproc_list:
                try:
                    self.container.resource_registry.delete(
                        process_instance._proc_svc_id, del_associations=True)
                except NotFound:
                    # OK if already gone
                    pass
                except Exception as ex:
                    log.exception(ex)

    elif process_instance._proc_type == PROCTYPE_AGENT:
        if self.container.has_capability(self.container.CCAP.DIRECTORY):
            self.container.directory.unregister_safe("/Agents", process_instance.id)

    # Remove internal registration in container
    del self.procs[process_id]

    proc_name = process_instance._proc_name
    if proc_name not in self.procs_by_name:
        log.warn("Process name %s not in local registry", proc_name)
    elif self.procs_by_name[proc_name] is process_instance:
        del self.procs_by_name[proc_name]
    else:
        # Another process was registered later under the same name and owns
        # the entry now; removing it would unregister a still-running process.
        log.debug("Process name %s is registered to another process; keeping entry", proc_name)

def _publish_process_event(self, proc_inst, state):
    """
    Publishes a ProcessLifecycleEvent for a process state change.

    @param proc_inst  The process instance, or a plain string for a process
                      that does not have an instance yet (nothing is
                      published in that case).
    @param state      The new process state; also mapped to the event
                      sub_type via ProcessStateEnum._str_map.
    """
    sub_type = ProcessStateEnum._str_map.get(state, state)

    if isinstance(proc_inst, basestring):
        # This is a PENDING process without process_id: no event is published
        return

    try:
        self.event_pub.publish_event(
            event_type=OT.ProcessLifecycleEvent,
            origin=getattr(proc_inst, "id", "PD"),
            origin_type=RT.Process,
            sub_type=sub_type,
            state=state,
            container_id=self.container.id,
            process_type=getattr(proc_inst, "_proc_type", ""),
            process_name=getattr(proc_inst, "_proc_name", ""),
            process_resource_id=getattr(proc_inst, "_proc_res_id", ""),
            service_name=getattr(proc_inst, "name", ""))
    except Exception:
        # Event publishing must not break process lifecycle handling
        log.exception("Could not publish process event")

# -----------------------------------------------------------------

def _start_process_dispatcher(self):
    """
    Creates the process dispatcher core from self.pd_cfg, keeps it in
    self.pd_core and starts it.
    """
    # Imported at call time, as in the original; the module is only needed
    # when the dispatcher is actually started.
    from ion.core.process.pd_core import ProcessDispatcher
    self.pd_core = ProcessDispatcher(container=self.container, config=self.pd_cfg)
    self.pd_core.start()

def _stop_process_dispatcher(self):
    """ Stops the process dispatcher core, if one is set. """
    if self.pd_core:
        self.pd_core.stop()
class ProcManager(object): def __init__(self, container): self.container = container # Define the callables that can be added to Container public API self.container_api = [self.spawn_process, self.terminate_process] # Add the public callables to Container for call in self.container_api: setattr(self.container, call.__name__, call) self.proc_id_pool = IDPool() # Temporary registry of running processes self.procs_by_name = {} self.procs = {} # mapping of greenlets we spawn to process_instances for error handling self._spawned_proc_to_process = {} # The pyon worker process supervisor self.proc_sup = IonProcessThreadManager( heartbeat_secs=CFG.cc.timeout.heartbeat, failure_notify_callback=self._spawned_proc_failed) # list of callbacks for process state changes self._proc_state_change_callbacks = [] def start(self): log.debug("ProcManager starting ...") self.proc_sup.start() # Register container as resource object cc_obj = CapabilityContainer(name=self.container.id, cc_agent=self.container.name) self.cc_id, _ = self.container.resource_registry.create(cc_obj) #Create an association to an Org object if not the rot ION org and only if found if CFG.container.org_name and CFG.container.org_name != CFG.system.root_org: org, _ = self.container.resource_registry.find_resources( restype=RT.Org, name=CFG.container.org_name, id_only=True) if org: self.container.resource_registry.create_association( org[0], PRED.hasResource, self.cc_id) # TODO - replace with proper association log.debug("ProcManager started, OK.") def stop(self): log.debug("ProcManager stopping ...") from pyon.datastore.couchdb.couchdb_datastore import CouchDB_DataStore stats1 = CouchDB_DataStore._stats.get_stats() # Call quit on procs to give them ability to clean up # @TODO terminate_process is not gl-safe # gls = map(lambda k: spawn(self.terminate_process, k), self.procs.keys()) # join(gls) procs_list = sorted(self.procs.values(), key=lambda proc: proc._proc_start_time, reverse=True) for proc in procs_list: try: 
self.terminate_process(proc.id) except Exception as ex: log.warn("Failed to terminate process (%s): %s", proc.id, ex) # TODO: Have a choice of shutdown behaviors for waiting on children, timeouts, etc self.proc_sup.shutdown(CFG.cc.timeout.shutdown) if self.procs: log.warn("ProcManager procs not empty: %s", self.procs) if self.procs_by_name: log.warn("ProcManager procs_by_name not empty: %s", self.procs_by_name) # Remove Resource registration try: self.container.resource_registry.delete(self.cc_id, del_associations=True) except NotFound: # already gone, this is ok pass # TODO: Check associations to processes stats2 = CouchDB_DataStore._stats.get_stats() stats3 = CouchDB_DataStore._stats.diff_stats(stats2, stats1) log.debug("Datastore stats difference during stop(): %s", stats3) log.debug("ProcManager stopped, OK.") def spawn_process(self, name=None, module=None, cls=None, config=None, process_id=None): """ Spawn a process within the container. Processes can be of different type. """ if process_id and not is_valid_identifier(process_id, ws_sub='_'): raise BadRequest("Given process_id %s is not a valid identifier" % process_id) # Generate a new process id if not provided # TODO: Ensure it is system-wide unique process_id = process_id or "%s.%s" % (self.container.id, self.proc_id_pool.get_id()) log.debug( "ProcManager.spawn_process(name=%s, module.cls=%s.%s, config=%s) as pid=%s", name, module, cls, config, process_id) process_cfg = deepcopy(CFG) if config: # Use provided config. Must be dict or DotDict if not isinstance(config, DotDict): config = DotDict(config) dict_merge(process_cfg, config, inplace=True) if self.container.spawn_args: # Override config with spawn args dict_merge(process_cfg, self.container.spawn_args, inplace=True) #log.debug("spawn_process() pid=%s process_cfg=%s", process_id, process_cfg) # PROCESS TYPE. 
Determines basic process context (messaging, service interface) # One of: service, stream_process, agent, simple, immediate service_cls = named_any("%s.%s" % (module, cls)) process_type = get_safe(process_cfg, "process.type") or getattr( service_cls, "process_type", "service") process_start_mode = get_safe(config, "process.start_mode") process_instance = None # alert we have a spawning process, but we don't have the instance yet, so give the class instead (more accurate than name) self._call_proc_state_changed("%s.%s" % (module, cls), ProcessStateEnum.PENDING) try: # spawn service by type if process_type == "service": process_instance = self._spawn_service_process( process_id, name, module, cls, process_cfg) elif process_type == "stream_process": process_instance = self._spawn_stream_process( process_id, name, module, cls, process_cfg) elif process_type == "agent": process_instance = self._spawn_agent_process( process_id, name, module, cls, process_cfg) elif process_type == "standalone": process_instance = self._spawn_standalone_process( process_id, name, module, cls, process_cfg) elif process_type == "immediate": process_instance = self._spawn_immediate_process( process_id, name, module, cls, process_cfg) elif process_type == "simple": process_instance = self._spawn_simple_process( process_id, name, module, cls, process_cfg) else: raise BadRequest("Unknown process type: %s" % process_type) process_instance._proc_type = process_type self._register_process(process_instance, name) process_instance.errcause = "OK" log.info("ProcManager.spawn_process: %s.%s -> pid=%s OK", module, cls, process_id) if process_type == 'immediate': log.info('Terminating immediate process: %s', process_instance.id) self.terminate_process(process_instance.id) # terminate process also triggers TERMINATING/TERMINATED self._call_proc_state_changed(process_instance, ProcessStateEnum.EXITED) else: #Shouldn't be any policies for immediate processes self.update_container_policies(process_instance) 
return process_instance.id except IonProcessError: errcause = process_instance.errcause if process_instance else "instantiating process" log.exception("Error spawning %s %s process (process_id: %s): %s", name, process_type, process_id, errcause) return None except Exception: errcause = process_instance.errcause if process_instance else "instantiating process" log.exception("Error spawning %s %s process (process_id: %s): %s", name, process_type, process_id, errcause) # trigger failed notification - catches problems in init/start self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise #This must be called after registering the process def update_container_policies(self, process_instance): if not self.container.governance_controller: return if process_instance._proc_type == "service": # look to load any existing policies for this service self.container.governance_controller.safe_update_service_access_policy( process_instance._proc_listen_name) if process_instance._proc_type == "agent": # look to load any existing policies for this agent service if process_instance.resource_type is None: self.container.governance_controller.safe_update_service_access_policy( process_instance.name) else: self.container.governance_controller.safe_update_service_access_policy( process_instance.resource_type) if process_instance.resource_id: # look to load any existing policies for this resource self.container.governance_controller.safe_update_resource_access_policy( process_instance.resource_id) def list_local_processes(self, process_type=''): """ Returns a list of the running ION processes in the container or filtered by the process_type """ if not process_type: return self.procs.values() return [ p for p in self.procs.itervalues() if p.process_type == process_type ] def get_a_local_process(self, proc_name=''): """ Returns a running ION processes in the container for the specified name """ for p in self.procs.itervalues(): if p.name == proc_name: return p if 
p.process_type == 'agent' and p.resource_type == proc_name: return p return None def is_local_service_process(self, service_name): local_services = self.list_local_processes('service') for p in local_services: if p.name == service_name: return True return False def is_local_agent_process(self, resource_type): local_agents = self.list_local_processes('agent') for p in local_agents: if p.resource_type == resource_type: return True return False def _spawned_proc_failed(self, gproc): log.error("ProcManager._spawned_proc_failed: %s, %s", gproc, gproc.exception) # for now - don't worry about the mapping, if we get a failure, just kill the container. # leave the mapping in place for potential expansion later. # # look it up in mapping # if not gproc in self._spawned_proc_to_process: # log.warn("No record of gproc %s in our map (%s)", gproc, self._spawned_proc_to_process) # return # prc = self._spawned_proc_to_process.get(gproc, None) # # # make sure prc is in our list # if not prc in self.procs.values(): # log.warn("prc %s not found in procs list", prc) # return # stop the rest of the process if prc is not None: try: self.terminate_process(prc.id, False) except Exception as e: log.warn( "Problem while stopping rest of failed process %s: %s", prc, e) finally: self._call_proc_state_changed(prc, ProcessStateEnum.FAILED) else: log.warn("No ION process found for failed proc manager child: %s", gproc) #self.container.fail_fast("Container process (%s) failed: %s" % (svc, gproc.exception)) def _cleanup_method(self, queue_name, ep=None): """ Common method to be passed to each spawned ION process to clean up their process-queue. @TODO Leaks implementation detail, should be using XOs """ if ep._chan is not None and not ep._chan._queue_auto_delete: # only need to delete if AMQP didn't handle it for us already! 
# @TODO this will not work with XOs (future) try: ch = self.container.node.channel(RecvChannel) ch._recv_name = NameTrio( get_sys_name(), "%s.%s" % (get_sys_name(), queue_name)) ch._destroy_queue() except TransportError as ex: log.warn("Cleanup method triggered an error, ignoring: %s", ex) def add_proc_state_changed_callback(self, cb): """ Adds a callback to be called when a process' state changes. The callback should take three parameters: The process, the state, and the container. """ self._proc_state_change_callbacks.append(cb) def remove_proc_state_changed_callback(self, cb): """ Removes a callback from the process state change callback list. If the callback is not registered, this method does nothing. """ if cb in self._proc_state_change_callbacks: self._proc_state_change_callbacks.remove(cb) def _call_proc_state_changed(self, svc, state): """ Internal method to call all registered process state change callbacks. """ log.debug("Proc State Changed (%s): %s", ProcessStateEnum._str_map.get(state, state), svc) for cb in self._proc_state_change_callbacks: cb(svc, state, self.container) def _create_listening_endpoint(self, **kwargs): """ Creates a listening endpoint for spawning processes. This method exists to be able to override the type created via configuration. In most cases it will create a ConversationRPCServer. """ eptypestr = CFG.get_safe( 'container.messaging.endpoint.proc_listening_type', None) if eptypestr is not None: module, cls = eptypestr.rsplit('.', 1) mod = __import__(module, fromlist=[cls]) eptype = getattr(mod, cls) ep = eptype(**kwargs) else: conv_enabled = CFG.get_safe( 'container.messaging.endpoint.rpc_conversation_enabled', False) if conv_enabled: ep = ConversationRPCServer(**kwargs) else: ep = ProcessRPCServer(**kwargs) return ep # ----------------------------------------------------------------- # PROCESS TYPE: service def _spawn_service_process(self, process_id, name, module, cls, config): """ Spawn a process acting as a service worker. 
Attach to service queue with service definition, attach to service pid """ process_instance = self._create_process_instance( process_id, name, module, cls, config) listen_name = get_safe(config, "process.listen_name") or process_instance.name log.debug("Service Process (%s) listen_name: %s", name, listen_name) process_instance._proc_listen_name = listen_name # Service RPC endpoint rsvc1 = self._create_listening_endpoint(node=self.container.node, from_name=listen_name, service=process_instance, process=process_instance) # Named local RPC endpoint rsvc2 = self._create_listening_endpoint(node=self.container.node, from_name=process_instance.id, service=process_instance, process=process_instance) # cleanup method to delete process queue cleanup = lambda _: self._cleanup_method(process_instance.id, rsvc2) # Start an ION process with the right kind of endpoint factory proc = self.proc_sup.spawn(name=process_instance.id, service=process_instance, listeners=[rsvc1, rsvc2], proc_name=process_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready( proc, "_spawn_service_process for %s" % ",".join( (listen_name, process_instance.id))) # map gproc to process_instance self._spawned_proc_to_process[proc.proc] = process_instance # set service's reference to process process_instance._process = proc self._process_init(process_instance) self._process_start(process_instance) try: proc.start_listeners() except IonProcessError: self._process_quit(process_instance) self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: stream process def _spawn_stream_process(self, process_id, name, module, cls, config): """ Spawn a process acting as a data stream process. Attach to subscription queue with process function. 
""" process_instance = self._create_process_instance( process_id, name, module, cls, config) listen_name = get_safe(config, "process.listen_name") or name log.debug("Stream Process (%s) listen_name: %s", name, listen_name) process_instance._proc_listen_name = listen_name process_instance.stream_subscriber = StreamSubscriber( process=process_instance, exchange_name=listen_name, callback=process_instance.call_process) # Add publishers if any... publish_streams = get_safe(config, "process.publish_streams") pub_names = self._set_publisher_endpoints(process_instance, publish_streams) rsvc = self._create_listening_endpoint(node=self.container.node, from_name=process_instance.id, service=process_instance, process=process_instance) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) def cleanup(*args): self._cleanup_method(process_instance.id, rsvc) for name in pub_names: p = getattr(process_instance, name) p.close() proc = self.proc_sup.spawn( name=process_instance.id, service=process_instance, listeners=[rsvc, process_instance.stream_subscriber], proc_name=process_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready( proc, "_spawn_stream_process for %s" % process_instance._proc_name) # map gproc to process_instance self._spawned_proc_to_process[proc.proc] = process_instance # set service's reference to process process_instance._process = proc self._process_init(process_instance) self._process_start(process_instance) try: proc.start_listeners() except IonProcessError: self._process_quit(process_instance) self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: agent def _spawn_agent_process(self, process_id, name, module, cls, config): """ Spawn a process acting as agent process. Attach to service pid. 
""" process_instance = self._create_process_instance( process_id, name, module, cls, config) if not isinstance(process_instance, ResourceAgent) and not isinstance( process_instance, SimpleResourceAgent): raise ContainerConfigError( "Agent process must extend ResourceAgent") listeners = [] # Set the resource ID if we get it through the config resource_id = get_safe(process_instance.CFG, "agent.resource_id") if resource_id: process_instance.resource_id = resource_id alistener = self._create_listening_endpoint( node=self.container.node, from_name=resource_id, service=process_instance, process=process_instance) listeners.append(alistener) rsvc = self._create_listening_endpoint(node=self.container.node, from_name=process_instance.id, service=process_instance, process=process_instance) listeners.append(rsvc) # cleanup method to delete process/agent queue (@TODO: leaks a bit here - should use XOs) def agent_cleanup(x): self._cleanup_method(process_instance.id, rsvc) if resource_id: pass #self._cleanup_method(resource_id, alistener) # disabled, it's probably not architecturally correct to delete this queue proc = self.proc_sup.spawn(name=process_instance.id, service=process_instance, listeners=listeners, proc_name=process_instance._proc_name, cleanup_method=agent_cleanup) self.proc_sup.ensure_ready( proc, "_spawn_agent_process for %s" % process_instance.id) # map gproc to process_instance self._spawned_proc_to_process[proc.proc] = process_instance # set service's reference to process process_instance._process = proc # Now call the on_init of the agent. 
self._process_init(process_instance) if not process_instance.resource_id: log.warn("New agent pid=%s has no resource_id set" % process_id) self._process_start(process_instance) try: proc.start_listeners() except IonProcessError: self._process_quit(process_instance) self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise if not process_instance.resource_id: log.warn("Agent process id=%s does not define resource_id!!" % process_instance.id) return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: standalone def _spawn_standalone_process(self, process_id, name, module, cls, config): """ Spawn a process acting as standalone process. Attach to service pid. """ process_instance = self._create_process_instance( process_id, name, module, cls, config) rsvc = self._create_listening_endpoint(node=self.container.node, from_name=process_instance.id, service=process_instance, process=process_instance) # Add publishers if any... 
publish_streams = get_safe(config, "process.publish_streams") pub_names = self._set_publisher_endpoints(process_instance, publish_streams) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) def cleanup(*args): self._cleanup_method(process_instance.id, rsvc) for name in pub_names: p = getattr(process_instance, name) p.close() proc = self.proc_sup.spawn(name=process_instance.id, service=process_instance, listeners=[rsvc], proc_name=process_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready( proc, "_spawn_standalone_process for %s" % process_instance.id) # map gproc to process_instance self._spawned_proc_to_process[proc.proc] = process_instance # set service's reference to process process_instance._process = proc self._process_init(process_instance) self._process_start(process_instance) try: proc.start_listeners() except IonProcessError: self._process_quit(process_instance) self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED) raise return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: simple def _spawn_simple_process(self, process_id, name, module, cls, config): """ Spawn a process acting as simple process. No attachments. """ process_instance = self._create_process_instance( process_id, name, module, cls, config) # Add publishers if any... 
publish_streams = get_safe(config, "process.publish_streams") pub_names = self._set_publisher_endpoints(process_instance, publish_streams) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) def cleanup(*args): for name in pub_names: p = getattr(process_instance, name) p.close() proc = self.proc_sup.spawn(name=process_instance.id, service=process_instance, listeners=[], proc_name=process_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready( proc, "_spawn_simple_process for %s" % process_instance.id) self._process_init(process_instance) self._process_start(process_instance) return process_instance # ----------------------------------------------------------------- # PROCESS TYPE: immediate def _spawn_immediate_process(self, process_id, name, module, cls, config): """ Spawn a process acting as immediate one off process. No attachments. """ process_instance = self._create_process_instance( process_id, name, module, cls, config) self._process_init(process_instance) self._process_start(process_instance) return process_instance def _create_process_instance(self, process_id, name, module, cls, config): """ Creates an instance of a "service", be it a Service, Agent, Stream, etc. @rtype BaseService @return An instance of a "service" """ # SERVICE INSTANCE. 
process_instance = for_name(module, cls) if not isinstance(process_instance, BaseService): raise ContainerConfigError( "Instantiated service not a BaseService %r" % process_instance) # Prepare service instance process_instance.errcause = "" process_instance.id = process_id process_instance.container = self.container process_instance.CFG = config process_instance._proc_name = name process_instance._proc_start_time = time.time() #Unless the process has been started as part of another Org, default to the container Org or the ION Org if config.has_key('org_name'): process_instance.org_name = config['org_name'] else: process_instance.org_name = CFG.get_safe( 'container.org_name', CFG.get_safe('system.root_org', 'ION')) # Add stateful process operations if hasattr(process_instance, "_flush_state"): def _flush_state(): if not hasattr(process_instance, "_proc_state"): process_instance._proc_state = {} process_instance._proc_state_changed = False return process_instance.container.state_repository.put_state( process_instance.id, process_instance._proc_state) process_instance._proc_state_changed = False def _load_state(): if not hasattr(process_instance, "_proc_state"): process_instance._proc_state = {} try: new_state = process_instance.container.state_repository.get_state( process_instance.id) process_instance._proc_state.clear() process_instance._proc_state.update(new_state) process_instance._proc_state_changed = False except Exception as ex: log.warn("Process %s load state failed: %s", process_instance.id, str(ex)) process_instance._flush_state = _flush_state process_instance._load_state = _load_state process_start_mode = get_safe(config, "process.start_mode") if process_start_mode == "RESTART": if hasattr(process_instance, "_load_state"): process_instance._load_state() # start service dependencies (RPC clients) self._start_process_dependencies(process_instance) return process_instance def _start_process_dependencies(self, process_instance): process_instance.errcause = 
"setting service dependencies" log.debug("spawn_process dependencies: %s", process_instance.dependencies) # TODO: Service dependency != process dependency for dependency in process_instance.dependencies: client = getattr(process_instance.clients, dependency) assert client, "Client for dependency not found: %s" % dependency # @TODO: should be in a start_client in RPCClient chain client.process = process_instance client.node = self.container.node # ensure that dep actually exists and is running # MM: commented out - during startup (init actually), we don't need to check for service dependencies # MM: TODO: split on_init from on_start; start consumer in on_start; check for full queues on restart # if process_instance.name != 'bootstrap' or (process_instance.name == 'bootstrap' and process_instance.CFG.level == dependency): # svc_de = self.container.resource_registry.find_resources(restype="Service", name=dependency, id_only=True) # if not svc_de: # raise ContainerConfigError("Dependency for service %s not running: %s" % (process_instance.name, dependency)) def _process_init(self, process_instance): # Init process process_instance.errcause = "initializing service" process_instance.init() def _process_start(self, process_instance): # Start process # THIS SHOULD BE CALLED LATER THAN SPAWN # TODO: Check for timeout process_instance.errcause = "starting service" process_instance.start() def _process_quit(self, process_instance): """ Common method to handle process stopping. """ process_instance.errcause = "quitting process" # Give the process notice to quit doing stuff. process_instance.quit() # Terminate IonProcessThread (may not have one, i.e. 
simple process) # @TODO: move this into process' on_quit() if getattr(process_instance, '_process', None) is not None and process_instance._process: process_instance._process.notify_stop() process_instance._process.stop() def _set_publisher_endpoints(self, process_instance, publisher_streams=None): publisher_streams = publisher_streams or {} names = [] for name, stream_id in publisher_streams.iteritems(): # problem is here pub = StreamPublisher(process=process_instance, stream_id=stream_id) setattr(process_instance, name, pub) names.append(name) return names def _register_process(self, process_instance, name): """ Performs all actions related to registering the new process in the system. Also performs process type specific registration, such as for services and agents """ # Add process instance to container's process dict if name in self.procs_by_name: log.warn("Process name already registered in container: %s" % name) self.procs_by_name[name] = process_instance self.procs[process_instance.id] = process_instance # Add Process to resource registry # Note: In general the Process resource should be created by the CEI PD, but not all processes are CEI # processes. How to deal with this? 
process_instance.errcause = "registering" if process_instance._proc_type != "immediate": proc_obj = Process(name=process_instance.id, label=name, proctype=process_instance._proc_type) proc_id, _ = self.container.resource_registry.create(proc_obj) process_instance._proc_res_id = proc_id # Associate process with container resource self.container.resource_registry.create_association( self.cc_id, "hasProcess", proc_id) else: process_instance._proc_res_id = None # Process type specific registration # TODO: Factor out into type specific handler functions if process_instance._proc_type == "service": # Registration of SERVICE process: in resource registry service_list, _ = self.container.resource_registry.find_resources( restype="Service", name=process_instance.name) if service_list: process_instance._proc_svc_id = service_list[0]._id else: # We are starting the first process of a service instance # TODO: This should be created by the HA Service agent in the future svc_obj = Service( name=process_instance.name, exchange_name=process_instance._proc_listen_name, state=ServiceStateEnum.READY) process_instance._proc_svc_id, _ = self.container.resource_registry.create( svc_obj) # Create association to service definition resource svcdef_list, _ = self.container.resource_registry.find_resources( restype="ServiceDefinition", name=process_instance.name) if svcdef_list: self.container.resource_registry.create_association( process_instance._proc_svc_id, "hasServiceDefinition", svcdef_list[0]._id) else: log.error("Cannot find ServiceDefinition resource for %s", process_instance.name) self.container.resource_registry.create_association( process_instance._proc_svc_id, "hasProcess", proc_id) elif process_instance._proc_type == "agent": # Registration of AGENT process: in Directory caps = process_instance.get_capabilities() self.container.directory.register( "/Agents", process_instance.id, **dict(name=process_instance._proc_name, container=process_instance.container.id, 
resource_id=process_instance.resource_id, agent_id=process_instance.agent_id, def_id=process_instance.agent_def_id, capabilities=caps)) self._call_proc_state_changed(process_instance, ProcessStateEnum.RUNNING) def terminate_process(self, process_id, do_notifications=True): """ Terminates a process and all its resources. Termination is graceful with timeout. @param process_id The id of the process to terminate. Should exist in the container's list of processes or this will raise. @param do_notifications If True, emits process state changes for TERMINATING and TERMINATED. If False, supresses any state changes. Used near EXITED and FAILED. """ process_instance = self.procs.get(process_id, None) if not process_instance: raise BadRequest( "Cannot terminate. Process id='%s' unknown on container id='%s'" % (process_id, self.container.id)) log.info("ProcManager.terminate_process: %s -> pid=%s", process_instance._proc_name, process_id) if do_notifications: self._call_proc_state_changed(process_instance, ProcessStateEnum.TERMINATING) self._process_quit(process_instance) self._unregister_process(process_id, process_instance) if do_notifications: self._call_proc_state_changed(process_instance, ProcessStateEnum.TERMINATED) def _unregister_process(self, process_id, process_instance): # Remove process registration in resource registry if process_instance._proc_res_id: try: self.container.resource_registry.delete( process_instance._proc_res_id, del_associations=True) except NotFound: #, HTTPException): # if it's already gone, it's already gone! 
pass except Exception, ex: log.exception(ex) pass # Cleanup for specific process types if process_instance._proc_type == "service": # Check if this is the last process for this service and do auto delete service resources here svcproc_list = [] try: svcproc_list, _ = self.container.resource_registry.find_objects( process_instance._proc_svc_id, "hasProcess", "Process", id_only=True) except ResourceNotFound: # if it's already gone, it's already gone! pass if not svcproc_list: try: self.container.resource_registry.delete( process_instance._proc_svc_id, del_associations=True) except NotFound: # if it's already gone, it's already gone! pass except Exception, ex: log.exception(ex) pass
class ProcManager(object):
    """
    Manages the ION processes running inside one container: spawning them by
    process type, registering/unregistering them, terminating them and
    notifying registered callbacks about process state changes.
    """

    def __init__(self, container):
        """
        @param container    The container this manager belongs to. The public
                            callables spawn_process and terminate_process are
                            set as attributes on it.
        """
        self.container = container

        # Define the callables that can be added to Container public API
        self.container_api = [self.spawn_process, self.terminate_process]

        # Add the public callables to Container
        for call in self.container_api:
            setattr(self.container, call.__name__, call)

        self.proc_id_pool = IDPool()

        # Temporary registry of running processes
        self.procs_by_name = {}
        self.procs = {}

        # mapping of greenlets we spawn to process_instances for error handling
        self._spawned_proc_to_process = {}

        # The pyon worker process supervisor
        self.proc_sup = IonProcessThreadManager(heartbeat_secs=CFG.cc.timeout.heartbeat,
                                                failure_notify_callback=self._spawned_proc_failed)

        # list of callbacks for process state changes
        self._proc_state_change_callbacks = []

    def start(self):
        """
        Starts the process supervisor and, if the container has the resource
        registry capability, registers the container as a CapabilityContainer
        resource (stored in self.cc_id) and associates it with its Org.
        """
        log.debug("ProcManager starting ...")
        self.proc_sup.start()

        if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
            # Register container as resource object
            cc_obj = CapabilityContainer(name=self.container.id, cc_agent=self.container.name)
            self.cc_id, _ = self.container.resource_registry.create(cc_obj)

            # Create an association to an Org object if not the root ION org and only if found
            if CFG.container.org_name and CFG.container.org_name != CFG.system.root_org:
                org, _ = self.container.resource_registry.find_resources(
                    restype=RT.Org, name=CFG.container.org_name, id_only=True)
                if org:
                    # TODO - replace with proper association
                    self.container.resource_registry.create_association(org[0], PRED.hasResource, self.cc_id)

        log.debug("ProcManager started, OK.")

    def stop(self):
        """
        Terminates all registered processes (most recently started first),
        shuts down the process supervisor and removes the container resource
        registration. A failure to terminate one process is logged and does
        not prevent the remaining processes from being terminated.
        """
        log.debug("ProcManager stopping ...")

        # Call quit on procs to give them ability to clean up
        # @TODO terminate_process is not gl-safe
        procs_list = sorted(self.procs.values(), key=lambda proc: proc._proc_start_time, reverse=True)

        for proc in procs_list:
            try:
                self.terminate_process(proc.id)
            except Exception as ex:
                log.warn("Failed to terminate process (%s): %s", proc.id, ex)

        # TODO: Have a choice of shutdown behaviors for waiting on children, timeouts, etc
        self.proc_sup.shutdown(CFG.cc.timeout.shutdown)

        if self.procs:
            log.warn("ProcManager procs not empty: %s", self.procs)
        if self.procs_by_name:
            log.warn("ProcManager procs_by_name not empty: %s", self.procs_by_name)

        # Remove Resource registration
        if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
            try:
                self.container.resource_registry.delete(self.cc_id, del_associations=True)
            except NotFound:
                # already gone, this is ok
                pass
        # TODO: Check associations to processes

        log.debug("ProcManager stopped, OK.")

    def spawn_process(self, name=None, module=None, cls=None, config=None, process_id=None):
        """
        Spawn a process within the container. Processes can be of different type.

        @param name         Name of the process.
        @param module       Module name of the process class.
        @param cls          Class name of the process within module.
        @param config       Optional dict or DotDict merged over a copy of the
                            container CFG to form the process config.
        @param process_id   Optional process id; generated if not provided.
        @return The id of the spawned process, or None if an IonProcessError
                occurred while spawning.
        @throws BadRequest  if process_id is not a valid identifier, the config
                            reference cannot be resolved or the process type is
                            unknown. Any other exception is re-raised after a
                            FAILED state notification.
        """
        if process_id and not is_valid_identifier(process_id, ws_sub='_'):
            raise BadRequest("Given process_id %s is not a valid identifier" % process_id)

        # Generate a new process id if not provided
        # TODO: Ensure it is system-wide unique
        process_id = process_id or "%s.%s" % (self.container.id, self.proc_id_pool.get_id())
        log.debug("ProcManager.spawn_process(name=%s, module.cls=%s.%s, config=%s) as pid=%s",
                  name, module, cls, config, process_id)

        process_cfg = deepcopy(CFG)
        if config:
            # Use provided config. Must be dict or DotDict
            if not isinstance(config, DotDict):
                config = DotDict(config)
            if config.get_safe("process.config_ref"):
                # Use a reference
                config_ref = config.get_safe("process.config_ref")
                log.info("Enhancing new process spawn config from ref=%s" % config_ref)
                matches = re.match(r'^([A-Za-z]+):([A-Za-z0-9]+)/(.+)$', config_ref)
                if matches:
                    ref_type, ref_id, ref_ext = matches.groups()
                    if ref_type == "resources":
                        if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
                            try:
                                obj = self.container.resource_registry.read(ref_id)
                                if obj and hasattr(obj, ref_ext):
                                    ref_config = getattr(obj, ref_ext)
                                    if isinstance(ref_config, dict):
                                        dict_merge(process_cfg, ref_config, inplace=True)
                                    else:
                                        raise BadRequest("config_ref %s exists but not dict" % config_ref)
                                else:
                                    raise BadRequest("config_ref %s - attribute not found" % config_ref)
                            except NotFound:
                                log.warn("config_ref %s - object not found" % config_ref)
                                raise
                        else:
                            log.error("Container missing RESOURCE_REGISTRY capability to resolve process config ref %s" % config_ref)
                    else:
                        raise BadRequest("Unknown reference type in: %s" % config_ref)

            dict_merge(process_cfg, config, inplace=True)
            if self.container.spawn_args:
                # Override config with spawn args
                dict_merge(process_cfg, self.container.spawn_args, inplace=True)

        # PROCESS TYPE. Determines basic process context (messaging, service interface)
        # One of the constants defined at the top of this file
        service_cls = named_any("%s.%s" % (module, cls))
        process_type = get_safe(process_cfg, "process.type") or getattr(service_cls, "process_type", "service")

        process_instance = None

        # alert we have a spawning process, but we don't have the instance yet,
        # so give the class instead (more accurate than name)
        self._call_proc_state_changed("%s.%s" % (module, cls), ProcessStateEnum.PENDING)

        try:
            # spawn service by type
            if process_type == SERVICE_PROCESS_TYPE:
                process_instance = self._spawn_service_process(process_id, name, module, cls, process_cfg)

            elif process_type == STREAM_PROCESS_TYPE:
                process_instance = self._spawn_stream_process(process_id, name, module, cls, process_cfg)

            elif process_type == AGENT_PROCESS_TYPE:
                process_instance = self._spawn_agent_process(process_id, name, module, cls, process_cfg)

            elif process_type == STANDALONE_PROCESS_TYPE:
                process_instance = self._spawn_standalone_process(process_id, name, module, cls, process_cfg)

            elif process_type == IMMEDIATE_PROCESS_TYPE:
                process_instance = self._spawn_immediate_process(process_id, name, module, cls, process_cfg)

            elif process_type == SIMPLE_PROCESS_TYPE:
                process_instance = self._spawn_simple_process(process_id, name, module, cls, process_cfg)

            else:
                raise BadRequest("Unknown process type: %s" % process_type)

            process_instance._proc_type = process_type
            self._register_process(process_instance, name)

            process_instance.errcause = "OK"
            log.info("ProcManager.spawn_process: %s.%s -> pid=%s OK", module, cls, process_id)

            if process_type == IMMEDIATE_PROCESS_TYPE:
                log.info('Terminating immediate process: %s', process_instance.id)
                self.terminate_process(process_instance.id)

                # terminate process also triggers TERMINATING/TERMINATED
                self._call_proc_state_changed(process_instance, ProcessStateEnum.EXITED)
            else:
                # Update local policies for the new process
                if self.container.has_capability(self.container.CCAP.GOVERNANCE_CONTROLLER):
                    self.container.governance_controller.update_container_policies(process_instance, safe_mode=True)

            return process_instance.id

        except IonProcessError:
            errcause = process_instance.errcause if process_instance else "instantiating process"
            log.exception("Error spawning %s %s process (process_id: %s): %s", name, process_type, process_id, errcause)
            return None

        except Exception:
            errcause = process_instance.errcause if process_instance else "instantiating process"
            log.exception("Error spawning %s %s process (process_id: %s): %s", name, process_type, process_id, errcause)

            # trigger failed notification - catches problems in init/start
            self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED)

            raise

    def list_local_processes(self, process_type=''):
        """
        Returns a list of the running ION processes in the container or filtered by the process_type
        """
        if not process_type:
            return list(self.procs.values())

        return [p for p in self.procs.values() if p.process_type == process_type]

    def get_a_local_process(self, proc_name=''):
        """
        Returns a running ION process in the container for the specified name.
        Agent processes also match if their resource_type equals proc_name.
        Returns None if no process matches.
        """
        for p in self.procs.values():
            if p.name == proc_name:
                return p

            if p.process_type == AGENT_PROCESS_TYPE and p.resource_type == proc_name:
                return p

        return None

    def is_local_service_process(self, service_name):
        """
        Returns True if a service process with the given name is registered in this container.
        """
        return any(p.name == service_name
                   for p in self.list_local_processes(SERVICE_PROCESS_TYPE))

    def is_local_agent_process(self, resource_type):
        """
        Returns True if an agent process for the given resource_type is registered in this container.
        """
        return any(p.resource_type == resource_type
                   for p in self.list_local_processes(AGENT_PROCESS_TYPE))

    def _spawned_proc_failed(self, gproc):
        """
        Failure callback given to the process supervisor. Terminates the ION
        process mapped to the failed gproc (without TERMINATING/TERMINATED
        notifications), emits FAILED for it and, if configured, fails the
        container once no processes are left.
        """
        log.error("ProcManager._spawned_proc_failed: %s, %s", gproc, gproc.exception)

        # The mapping is left in place (entries are not removed) for potential expansion later.
        prc = self._spawned_proc_to_process.get(gproc, None)

        # stop the rest of the process
        if prc is not None:
            try:
                self.terminate_process(prc.id, False)
            except Exception as e:
                log.warn("Problem while stopping rest of failed process %s: %s", prc, e)
            finally:
                self._call_proc_state_changed(prc, ProcessStateEnum.FAILED)
        else:
            log.warn("No ION process found for failed proc manager child: %s", gproc)

        # Stop the container if this was the last process
        if not self.procs and CFG.get_safe("container.processes.exit_once_empty", False):
            self.container.fail_fast("Terminating container after last process (%s) failed: %s" % (gproc, gproc.exception))

    def _cleanup_method(self, queue_name, ep=None):
        """
        Common method to be passed to each spawned ION process to clean up their process-queue.

        Does nothing if no endpoint is given, if the endpoint has no channel or
        if the channel's queue is auto-deleted. TransportErrors are logged and ignored.

        @TODO Leaks implementation detail, should be using XOs
        """
        # ep defaults to None, so guard before touching its channel
        if ep is None or ep._chan is None or ep._chan._queue_auto_delete:
            return

        # only need to delete if AMQP didn't handle it for us already!
        # @TODO this will not work with XOs (future)
        try:
            ch = self.container.node.channel(RecvChannel)
            ch._recv_name = NameTrio(get_sys_name(), "%s.%s" % (get_sys_name(), queue_name))
            ch._destroy_queue()
        except TransportError as ex:
            log.warn("Cleanup method triggered an error, ignoring: %s", ex)

    def add_proc_state_changed_callback(self, cb):
        """
        Adds a callback to be called when a process' state changes.

        The callback should take three parameters: The process, the state, and the container.
        """
        self._proc_state_change_callbacks.append(cb)

    def remove_proc_state_changed_callback(self, cb):
        """
        Removes a callback from the process state change callback list.

        If the callback is not registered, this method does nothing.
        """
        if cb in self._proc_state_change_callbacks:
            self._proc_state_change_callbacks.remove(cb)

    def _call_proc_state_changed(self, svc, state):
        """
        Internal method to call all registered process state change callbacks.
        """
        log.debug("Proc State Changed (%s): %s", ProcessStateEnum._str_map.get(state, state), svc)
        for cb in self._proc_state_change_callbacks:
            cb(svc, state, self.container)

    def _create_listening_endpoint(self, **kwargs):
        """
        Creates a listening endpoint for spawning processes.

        This method exists to be able to override the type created via configuration.
        If container.messaging.endpoint.proc_listening_type is set, that class is
        imported and instantiated. Otherwise a ConversationRPCServer is created if
        container.messaging.endpoint.rpc_conversation_enabled is set, else a
        ProcessRPCServer.
        """
        eptypestr = CFG.get_safe('container.messaging.endpoint.proc_listening_type', None)
        if eptypestr is not None:
            module, cls = eptypestr.rsplit('.', 1)
            mod = __import__(module, fromlist=[cls])
            eptype = getattr(mod, cls)
            ep = eptype(**kwargs)
        else:
            conv_enabled = CFG.get_safe('container.messaging.endpoint.rpc_conversation_enabled', False)
            if conv_enabled:
                ep = ConversationRPCServer(**kwargs)
            else:
                ep = ProcessRPCServer(**kwargs)
        return ep

    # -----------------------------------------------------------------
    # PROCESS TYPE: service

    def _spawn_service_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as a service worker.
        Attach to service queue with service definition, attach to service pid
        """
        process_instance = self._create_process_instance(process_id, name, module, cls, config)

        listen_name = get_safe(config, "process.listen_name") or process_instance.name
        log.debug("Service Process (%s) listen_name: %s", name, listen_name)
        process_instance._proc_listen_name = listen_name

        # Service RPC endpoint
        rsvc1 = self._create_listening_endpoint(node=self.container.node,
                                                from_name=listen_name,
                                                process=process_instance)
        # Named local RPC endpoint
        rsvc2 = self._create_listening_endpoint(node=self.container.node,
                                                from_name=process_instance.id,
                                                process=process_instance)

        # cleanup method to delete process queue
        cleanup = lambda _: self._cleanup_method(process_instance.id, rsvc2)

        # Start an ION process with the right kind of endpoint factory
        proc = self.proc_sup.spawn(name=process_instance.id,
                                   service=process_instance,
                                   listeners=[rsvc1, rsvc2],
                                   proc_name=process_instance._proc_name,
                                   cleanup_method=cleanup)
        self.proc_sup.ensure_ready(proc, "_spawn_service_process for %s" % ",".join((listen_name, process_instance.id)))

        # map gproc to process_instance
        self._spawned_proc_to_process[proc.proc] = process_instance

        # set service's reference to process
        process_instance._process = proc

        self._process_init(process_instance)
        self._process_start(process_instance)

        try:
            proc.start_listeners()
        except IonProcessError:
            self._process_quit(process_instance)
            self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED)
            raise

        return process_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: stream process

    def _spawn_stream_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as a data stream process.
        Attach to subscription queue with process function.
        """
        process_instance = self._create_process_instance(process_id, name, module, cls, config)

        listen_name = get_safe(config, "process.listen_name") or name
        log.debug("Stream Process (%s) listen_name: %s", name, listen_name)
        process_instance._proc_listen_name = listen_name

        process_instance.stream_subscriber = StreamSubscriber(process=process_instance,
                                                              exchange_name=listen_name,
                                                              callback=process_instance.call_process)

        # Add publishers if any...
        publish_streams = get_safe(config, "process.publish_streams")
        pub_names = self._set_publisher_endpoints(process_instance, publish_streams)

        rsvc = self._create_listening_endpoint(node=self.container.node,
                                               from_name=process_instance.id,
                                               process=process_instance)

        # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs)
        def cleanup(*args):
            self._cleanup_method(process_instance.id, rsvc)
            for pub_name in pub_names:
                getattr(process_instance, pub_name).close()

        proc = self.proc_sup.spawn(name=process_instance.id,
                                   service=process_instance,
                                   listeners=[rsvc, process_instance.stream_subscriber],
                                   proc_name=process_instance._proc_name,
                                   cleanup_method=cleanup)
        self.proc_sup.ensure_ready(proc, "_spawn_stream_process for %s" % process_instance._proc_name)

        # map gproc to process_instance
        self._spawned_proc_to_process[proc.proc] = process_instance

        # set service's reference to process
        process_instance._process = proc

        self._process_init(process_instance)
        self._process_start(process_instance)

        try:
            proc.start_listeners()
        except IonProcessError:
            self._process_quit(process_instance)
            self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED)
            raise

        return process_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: agent

    def _spawn_agent_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as agent process.
        Attach to service pid.

        @throws ContainerConfigError if the instantiated process is neither a
                ResourceAgent nor a SimpleResourceAgent.
        """
        process_instance = self._create_process_instance(process_id, name, module, cls, config)
        if not isinstance(process_instance, (ResourceAgent, SimpleResourceAgent)):
            raise ContainerConfigError("Agent process must extend ResourceAgent")
        listeners = []

        # Set the resource ID if we get it through the config
        resource_id = get_safe(process_instance.CFG, "agent.resource_id")
        if resource_id:
            process_instance.resource_id = resource_id

            alistener = self._create_listening_endpoint(node=self.container.node,
                                                        from_name=resource_id,
                                                        process=process_instance)
            listeners.append(alistener)

        rsvc = self._create_listening_endpoint(node=self.container.node,
                                               from_name=process_instance.id,
                                               process=process_instance)
        listeners.append(rsvc)

        # cleanup method to delete process/agent queue (@TODO: leaks a bit here - should use XOs)
        def agent_cleanup(x):
            self._cleanup_method(process_instance.id, rsvc)
            if resource_id:
                # alistener only exists when a resource_id was configured
                self._cleanup_method(resource_id, alistener)

        proc = self.proc_sup.spawn(name=process_instance.id,
                                   service=process_instance,
                                   listeners=listeners,
                                   proc_name=process_instance._proc_name,
                                   cleanup_method=agent_cleanup)
        self.proc_sup.ensure_ready(proc, "_spawn_agent_process for %s" % process_instance.id)

        # map gproc to process_instance
        self._spawned_proc_to_process[proc.proc] = process_instance

        # set service's reference to process
        process_instance._process = proc

        # Now call the on_init of the agent.
        self._process_init(process_instance)

        if not process_instance.resource_id:
            log.warn("New agent pid=%s has no resource_id set" % process_id)

        self._process_start(process_instance)

        try:
            proc.start_listeners()
        except IonProcessError:
            self._process_quit(process_instance)
            self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED)
            raise

        if not process_instance.resource_id:
            log.warn("Agent process id=%s does not define resource_id!!" % process_instance.id)

        return process_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: standalone

    def _spawn_standalone_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as standalone process.
        Attach to service pid.
        """
        process_instance = self._create_process_instance(process_id, name, module, cls, config)
        rsvc = self._create_listening_endpoint(node=self.container.node,
                                               from_name=process_instance.id,
                                               process=process_instance)

        # Add publishers if any...
        publish_streams = get_safe(config, "process.publish_streams")
        pub_names = self._set_publisher_endpoints(process_instance, publish_streams)

        # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs)
        def cleanup(*args):
            self._cleanup_method(process_instance.id, rsvc)
            for pub_name in pub_names:
                getattr(process_instance, pub_name).close()

        proc = self.proc_sup.spawn(name=process_instance.id,
                                   service=process_instance,
                                   listeners=[rsvc],
                                   proc_name=process_instance._proc_name,
                                   cleanup_method=cleanup)
        self.proc_sup.ensure_ready(proc, "_spawn_standalone_process for %s" % process_instance.id)

        # map gproc to process_instance
        self._spawned_proc_to_process[proc.proc] = process_instance

        # set service's reference to process
        process_instance._process = proc

        self._process_init(process_instance)
        self._process_start(process_instance)

        try:
            proc.start_listeners()
        except IonProcessError:
            self._process_quit(process_instance)
            self._call_proc_state_changed(process_instance, ProcessStateEnum.FAILED)
            raise

        return process_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: simple

    def _spawn_simple_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as simple process.
        No attachments.
        """
        process_instance = self._create_process_instance(process_id, name, module, cls, config)

        # Add publishers if any...
        publish_streams = get_safe(config, "process.publish_streams")
        pub_names = self._set_publisher_endpoints(process_instance, publish_streams)

        # cleanup method to close the publishers (no process queue for this type)
        def cleanup(*args):
            for pub_name in pub_names:
                getattr(process_instance, pub_name).close()

        proc = self.proc_sup.spawn(name=process_instance.id,
                                   service=process_instance,
                                   listeners=[],
                                   proc_name=process_instance._proc_name,
                                   cleanup_method=cleanup)
        self.proc_sup.ensure_ready(proc, "_spawn_simple_process for %s" % process_instance.id)

        # map gproc to process_instance
        self._spawned_proc_to_process[proc.proc] = process_instance

        # set service's reference to process
        process_instance._process = proc

        self._process_init(process_instance)
        self._process_start(process_instance)

        return process_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: immediate

    def _spawn_immediate_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as immediate one off process.
        No attachments.
        """
        process_instance = self._create_process_instance(process_id, name, module, cls, config)
        self._process_init(process_instance)
        self._process_start(process_instance)
        return process_instance

    def _create_process_instance(self, process_id, name, module, cls, config):
        """
        Creates an instance of a "service", be it a Service, Agent, Stream, etc.

        Sets id, container, CFG, name, start time and org governance name on the
        instance. If the instance has a _flush_state attribute, installs state
        flush/load operations and loads any persisted state.

        @rtype BaseService
        @return An instance of a "service"
        @throws ContainerConfigError if the instance is not a BaseService
        """
        # SERVICE INSTANCE.
        process_instance = for_name(module, cls)
        if not isinstance(process_instance, BaseService):
            raise ContainerConfigError("Instantiated service not a BaseService %r" % process_instance)

        # Prepare service instance
        process_instance.errcause = ""
        process_instance.id = process_id
        process_instance.container = self.container
        process_instance.CFG = config
        process_instance._proc_name = name
        process_instance._proc_start_time = time.time()

        # Unless the process has been started as part of another Org, default to the container Org or the ION Org
        if 'org_governance_name' in config:
            process_instance.org_governance_name = config['org_governance_name']
        else:
            process_instance.org_governance_name = CFG.get_safe('container.org_name',
                                                                CFG.get_safe('system.root_org', 'ION'))

        # Add stateful process operations
        if hasattr(process_instance, "_flush_state"):
            def _flush_state():
                with process_instance._state_lock:
                    state_obj = process_instance.container.state_repository.put_state(
                        process_instance.id, process_instance._proc_state,
                        state_obj=process_instance._proc_state_obj)
                    state_obj.state = None   # Make sure memory footprint is low for larger states
                    process_instance._proc_state_obj = state_obj
                    process_instance._proc_state_changed = False

            def _load_state():
                if not hasattr(process_instance, "_proc_state"):
                    process_instance._proc_state = {}
                try:
                    with process_instance._state_lock:
                        new_state, state_obj = process_instance.container.state_repository.get_state(process_instance.id)
                        process_instance._proc_state.clear()
                        process_instance._proc_state.update(new_state)
                        process_instance._proc_state_obj = state_obj
                        process_instance._proc_state_changed = False
                except NotFound:
                    log.debug("No persisted state available for process %s", process_instance.id)
                except Exception as ex:
                    log.warn("Process %s load state failed: %s", process_instance.id, str(ex))

            process_instance._flush_state = _flush_state
            process_instance._load_state = _load_state
            process_instance._state_lock = RLock()
            process_instance._proc_state = {}
            process_instance._proc_state_obj = None
            process_instance._proc_state_changed = False

            # PROCESS RESTART: Need to check whether this process had persisted state.
            # Note: This could happen anytime during a system run, not just on RESTART boot
            log.debug("Loading persisted state for process %s", process_id)
            process_instance._load_state()

        # start service dependencies (RPC clients)
        self._start_process_dependencies(process_instance)

        return process_instance

    def _start_process_dependencies(self, process_instance):
        """
        Binds each dependency client of the process to the process and to the container's node.
        """
        process_instance.errcause = "setting service dependencies"
        log.debug("spawn_process dependencies: %s", process_instance.dependencies)
        # TODO: Service dependency != process dependency
        for dependency in process_instance.dependencies:
            client = getattr(process_instance.clients, dependency)
            assert client, "Client for dependency not found: %s" % dependency

            # @TODO: should be in a start_client in RPCClient chain
            client.process = process_instance
            client.node = self.container.node

            # MM: during startup (init actually), we don't need to check for service dependencies
            # MM: TODO: split on_init from on_start; start consumer in on_start; check for full queues on restart

    def _process_init(self, process_instance):
        """
        Calls init() on the process, recording the phase in errcause.
        """
        process_instance.errcause = "initializing service"
        process_instance.init()

    def _process_start(self, process_instance):
        """
        Calls start() on the process, recording the phase in errcause.
        """
        # THIS SHOULD BE CALLED LATER THAN SPAWN
        # TODO: Check for timeout
        process_instance.errcause = "starting service"
        process_instance.start()

    def _process_quit(self, process_instance):
        """
        Common method to handle process stopping.
        """
        process_instance.errcause = "quitting process"

        # Give the process notice to quit doing stuff.
        process_instance.quit()

        # Terminate IonProcessThread (may not have one, i.e. simple process)
        # @TODO: move this into process' on_quit()
        if getattr(process_instance, '_process', None) is not None and process_instance._process:
            process_instance._process.notify_stop()
            process_instance._process.stop()

    def _set_publisher_endpoints(self, process_instance, publisher_streams=None):
        """
        Creates a StreamPublisher for each entry of publisher_streams (a mapping of
        attribute name to stream id) and sets it as attribute on the process.

        @return List of the attribute names set; empty if no streams were given.
        """
        publisher_streams = publisher_streams or {}
        names = []

        for name, stream_id in publisher_streams.items():
            pub = StreamPublisher(process=process_instance, stream_id=stream_id)

            setattr(process_instance, name, pub)
            names.append(name)

        return names

    def _register_process(self, process_instance, name):
        """
        Performs all actions related to registering the new process in the system.
        Also performs process type specific registration, such as for services and agents
        """
        # Add process instance to container's process dict
        if name in self.procs_by_name:
            log.warn("Process name already registered in container: %s" % name)
        self.procs_by_name[name] = process_instance
        self.procs[process_instance.id] = process_instance

        # Add Process to resource registry
        # Note: In general the Process resource should be created by the CEI PD, but not all processes are CEI
        # processes. How to deal with this?
        process_instance.errcause = "registering"

        # Always define the attribute: _unregister_process reads it for every process,
        # including immediate processes and containers without a resource registry.
        process_instance._proc_res_id = None

        if process_instance._proc_type != IMMEDIATE_PROCESS_TYPE:
            if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
                proc_obj = Process(name=process_instance.id, label=name, proctype=process_instance._proc_type)
                proc_id, _ = self.container.resource_registry.create(proc_obj)
                process_instance._proc_res_id = proc_id

                # Associate process with container resource
                self.container.resource_registry.create_association(self.cc_id, "hasProcess", proc_id)

        # Process type specific registration
        # TODO: Factor out into type specific handler functions
        if process_instance._proc_type == SERVICE_PROCESS_TYPE:
            if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
                # Registration of SERVICE process: in resource registry
                service_list, _ = self.container.resource_registry.find_resources(
                    restype="Service", name=process_instance.name, id_only=True)
                if service_list:
                    process_instance._proc_svc_id = service_list[0]
                    if len(service_list) > 1:
                        log.warn("More than 1 Service resource found with name %s: %s", process_instance.name, service_list)
                else:
                    # We are starting the first process of a service instance
                    # TODO: This should be created by the HA Service agent in the future
                    svc_obj = Service(name=process_instance.name,
                                      exchange_name=process_instance._proc_listen_name,
                                      state=ServiceStateEnum.READY)
                    process_instance._proc_svc_id, _ = self.container.resource_registry.create(svc_obj)

                    # Create association to service definition resource
                    svcdef_list, _ = self.container.resource_registry.find_resources(
                        restype="ServiceDefinition", name=process_instance.name, id_only=True)
                    if svcdef_list:
                        if len(svcdef_list) > 1:
                            log.warn("More than 1 ServiceDefinition resource found with name %s: %s", process_instance.name, svcdef_list)
                        self.container.resource_registry.create_association(
                            process_instance._proc_svc_id, "hasServiceDefinition", svcdef_list[0])
                    else:
                        log.error("Cannot find ServiceDefinition resource for %s", process_instance.name)

                self.container.resource_registry.create_association(process_instance._proc_svc_id, "hasProcess", proc_id)

        elif process_instance._proc_type == AGENT_PROCESS_TYPE:
            if self.container.has_capability(self.container.CCAP.DIRECTORY):
                # Registration of AGENT process: in Directory
                caps = process_instance.get_capabilities()
                self.container.directory.register("/Agents", process_instance.id,
                                                  **dict(name=process_instance._proc_name,
                                                         container=process_instance.container.id,
                                                         resource_id=process_instance.resource_id,
                                                         agent_id=process_instance.agent_id,
                                                         def_id=process_instance.agent_def_id,
                                                         capabilities=caps))

        self._call_proc_state_changed(process_instance, ProcessStateEnum.RUNNING)

    def terminate_process(self, process_id, do_notifications=True):
        """
        Terminates a process and all its resources. Termination is graceful with timeout.

        @param  process_id          The id of the process to terminate. Should exist in the container's
                                    list of processes or this will raise.
        @param  do_notifications    If True, emits process state changes for TERMINATING and TERMINATED.
                                    If False, suppresses any state changes. Used near EXITED and FAILED.
        @throws BadRequest          if process_id is not known in this container.
        """
        process_instance = self.procs.get(process_id, None)
        if not process_instance:
            raise BadRequest("Cannot terminate. Process id='%s' unknown on container id='%s'" % (
                process_id, self.container.id))

        log.info("ProcManager.terminate_process: %s -> pid=%s", process_instance._proc_name, process_id)

        if do_notifications:
            self._call_proc_state_changed(process_instance, ProcessStateEnum.TERMINATING)

        self._process_quit(process_instance)

        self._unregister_process(process_id, process_instance)

        if do_notifications:
            self._call_proc_state_changed(process_instance, ProcessStateEnum.TERMINATED)

    def _unregister_process(self, process_id, process_instance):
        """
        Removes the registrations of the process from the resource registry: its
        Process resource and, for service processes, the Service resource once no
        Process is associated with it anymore. Missing resources are ignored,
        other errors are logged.
        """
        # Remove process registration in resource registry
        if process_instance._proc_res_id:
            if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
                try:
                    self.container.resource_registry.delete(process_instance._proc_res_id, del_associations=True)
                except NotFound:
                    # if it's already gone, it's already gone!
                    pass
                except Exception as ex:
                    # best effort: log and continue with the remaining cleanup
                    log.exception(ex)

        # Cleanup for specific process types
        if process_instance._proc_type == SERVICE_PROCESS_TYPE:
            if self.container.has_capability(self.container.CCAP.RESOURCE_REGISTRY):
                # Check if this is the last process for this service and do auto delete service resources here
                svcproc_list = []
                try:
                    svcproc_list, _ = self.container.resource_registry.find_objects(
                        process_instance._proc_svc_id, "hasProcess", "Process", id_only=True)
                except ResourceNotFound:
                    # if it's already gone, it's already gone!
                    pass

                if not svcproc_list:
                    try:
                        self.container.resource_registry.delete(process_instance._proc_svc_id, del_associations=True)
                    except NotFound:
                        # if it's already gone, it's already gone!
                        pass
                    except Exception as ex:
                        # best effort: log and continue
                        log.exception(ex)
class ProcManager(object):
    """
    Manages the processes running inside one container.

    Spawns processes by process type, keeps them in local registries
    (``procs`` by process id, ``procs_by_name`` by process name), registers
    them in the container directory and terminates them again. The
    ``spawn_process`` and ``terminate_process`` methods are also attached to
    the container object so they are reachable as container API.
    """

    def __init__(self, container):
        """
        @param container    The container this manager belongs to. The public
                            callables of this manager are set as attributes on it.
        """
        self.container = container

        # Define the callables that can be added to Container public API
        self.container_api = [self.spawn_process, self.terminate_process]

        # Add the public callables to Container
        for call in self.container_api:
            setattr(self.container, call.__name__, call)

        self.proc_id_pool = IDPool()

        # Temporary registry of running processes
        self.procs_by_name = {}
        self.procs = {}

        # mapping of greenlets we spawn to service_instances for error handling
        self._spawned_proc_to_process = {}

        # The pyon worker process supervisor
        self.proc_sup = IonProcessThreadManager(heartbeat_secs=CFG.cc.timeout.heartbeat,
                                                failure_notify_callback=self._spawned_proc_failed)

    def start(self):
        """Starts the process supervisor."""
        log.debug("ProcManager starting ...")
        self.proc_sup.start()
        log.debug("ProcManager started, OK.")

    def stop(self):
        """
        Terminates all registered processes, then shuts down the process supervisor.
        """
        log.debug("ProcManager stopping ...")

        # Call quit on procs to give them ability to clean up
        # @TODO terminate_process is not gl-safe
        # Iterate over a snapshot of the ids: terminate_process removes entries
        # from self.procs while we loop. An explicit loop (instead of map()) makes
        # sure the calls really happen even where map() is lazy.
        for process_id in list(self.procs.keys()):
            self.terminate_process(process_id)

        # TODO: Have a choice of shutdown behaviors for waiting on children, timeouts, etc
        self.proc_sup.shutdown(CFG.cc.timeout.shutdown)

        log.debug("ProcManager stopped, OK.")

    def spawn_process(self, name=None, module=None, cls=None, config=None, process_id=None):
        """
        Spawn a process within the container. Processes can be of different type.

        @param name         Name of the process; used as key in procs_by_name
        @param module       Module path of the process class
        @param cls          Class name of the process within module
        @param config       Optional dict or DotDict layered over the system CFG
        @param process_id   Optional process id; generated from the container id
                            and the id pool if not given
        @return             The id of the spawned process
        @raise BadRequest   If process_id is not a valid identifier or the
                            process type is unknown
        """
        if process_id and not is_valid_identifier(process_id, ws_sub='_'):
            raise BadRequest("Given process_id %s is not a valid identifier" % process_id)

        # Generate a new process id if not provided
        # TODO: Ensure it is system-wide unique
        process_id = process_id or "%s.%s" % (self.container.id, self.proc_id_pool.get_id())
        log.debug("ProcManager.spawn_process(name=%s, module.cls=%s.%s) as pid=%s", name, module, cls, process_id)

        if not config:
            # Use system CFG. It has the command line args in it
            config = DictModifier(CFG)
        else:
            # Use provided config. Must be dict or DotDict
            if not isinstance(config, DotDict):
                config = DotDict(config)
            config = DictModifier(CFG, config)

        if self.container.spawn_args:
            # Override config with spawn args
            dict_merge(config, self.container.spawn_args, inplace=True)

        # PROCESS TYPE. Determines basic process context (messaging, service interface)
        # One of: service, stream_process, agent, standalone, simple, immediate
        service_cls = named_any("%s.%s" % (module, cls))
        process_type = get_safe(config, "process.type") or getattr(service_cls, "process_type", "service")

        # Process type -> method that builds and starts a process of that type
        spawners = {
            "service": self._spawn_service_process,
            "stream_process": self._spawn_stream_process,
            "agent": self._spawn_agent_process,
            "standalone": self._spawn_standalone_process,
            "immediate": self._spawn_immediate_process,
            "simple": self._spawn_simple_process,
        }

        service_instance = None
        try:
            # spawn service by type
            spawner = spawners.get(process_type)
            if spawner is None:
                raise BadRequest("Unknown process type: %s" % process_type)
            service_instance = spawner(process_id, name, module, cls, config)

            service_instance._proc_type = process_type
            self._register_process(service_instance, name)

            service_instance.errcause = "OK"
            log.info("ProcManager.spawn_process: %s.%s -> pid=%s OK", module, cls, process_id)

            if process_type == 'immediate':
                # Immediate processes are one-off: terminate right after registration
                log.info('Terminating immediate process: %s', service_instance.id)
                self.terminate_process(service_instance.id)

            return service_instance.id

        except Exception:
            # Log what stage failed (errcause is updated as spawning progresses), then re-raise
            errcause = service_instance.errcause if service_instance is not None else "instantiating service"
            log.exception("Error spawning %s %s process (process_id: %s): %s", name, process_type, process_id, errcause)
            raise

    def _spawned_proc_failed(self, gproc):
        """
        Failure callback given to the process supervisor. Fails the container.

        @param gproc    The failed spawned greenlet/process; its ``exception``
                        attribute is reported in the failure message.
        """
        log.error("ProcManager._spawned_proc_failed: %s", gproc)

        # for now - if we get a failure, just kill the container.
        # The mapping is only used to name the failed process in the message; a
        # failure of an unmapped gproc is reported as "Unknown" instead of raising.
        svc = self._spawned_proc_to_process.get(gproc, "Unknown")

        self.container.fail_fast("Container process (%s) failed: %s" % (svc, gproc.exception))

    # -----------------------------------------------------------------
    # PROCESS TYPE: service

    def _spawn_service_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as a service worker.
        Attach to service queue with service definition, attach to service pid

        @return The initialized and started service instance
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)

        self._service_start(service_instance)

        listen_name = get_safe(config, "process.listen_name") or service_instance.name
        log.debug("Service Process (%s) listen_name: %s", name, listen_name)

        # Service RPC endpoint
        rsvc1 = ProcessRPCServer(node=self.container.node,
                                 from_name=listen_name,
                                 service=service_instance,
                                 process=service_instance)
        # Named local RPC endpoint
        rsvc2 = ProcessRPCServer(node=self.container.node,
                                 from_name=service_instance.id,
                                 service=service_instance,
                                 process=service_instance)

        # Start an ION process with the right kind of endpoint factory
        proc = self.proc_sup.spawn(name=service_instance.id,
                                   service=service_instance,
                                   listeners=[rsvc1, rsvc2],
                                   proc_name=service_instance._proc_name)
        self.proc_sup.ensure_ready(proc, "_spawn_service_process for %s" % ",".join((listen_name, service_instance.id)))

        # map gproc to service_instance
        self._spawned_proc_to_process[proc.proc] = service_instance

        # set service's reference to process
        service_instance._process = proc

        # Directory registration
        self.container.directory.register_safe("/Services", listen_name, interface=service_instance.name)
        self.container.directory.register_safe("/Services/%s" % listen_name, service_instance.id)

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: stream process

    def _spawn_stream_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as a data stream process.
        Attach to subscription queue with process function.

        @return The initialized and started service instance
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)

        # Start the service
        self._service_start(service_instance)

        listen_name = get_safe(config, "process.listen_name") or name

        service_instance.stream_subscriber_registrar = StreamSubscriberRegistrar(process=service_instance,
                                                                                 node=self.container.node)
        sub = service_instance.stream_subscriber_registrar.create_subscriber(exchange_name=listen_name)

        # Add publishers if any...
        publish_streams = get_safe(config, "process.publish_streams")
        self._set_publisher_endpoints(service_instance, publish_streams)

        rsvc = ProcessRPCServer(node=self.container.node,
                                from_name=service_instance.id,
                                service=service_instance,
                                process=service_instance)

        proc = self.proc_sup.spawn(name=service_instance.id,
                                   service=service_instance,
                                   listeners=[rsvc, sub],
                                   proc_name=service_instance._proc_name)
        self.proc_sup.ensure_ready(proc, "_spawn_stream_process for %s" % service_instance._proc_name)

        # map gproc to service_instance
        self._spawned_proc_to_process[proc.proc] = service_instance

        # set service's reference to process
        service_instance._process = proc

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: agent

    def _spawn_agent_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as agent process.
        Attach to service pid.

        @return The initialized and started agent instance
        @raise ContainerConfigError If the instance is not a ResourceAgent
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        if not isinstance(service_instance, ResourceAgent):
            raise ContainerConfigError("Agent process must extend ResourceAgent")

        # Set the resource ID if we get it through the config
        resource_id = get_safe(service_instance.CFG, "agent.resource_id")
        if resource_id:
            service_instance.resource_id = resource_id

        # Now call the on_init of the agent.
        self._service_init(service_instance)

        if not service_instance.resource_id:
            log.warn("New agent pid=%s has no resource_id set", process_id)

        self._service_start(service_instance)

        rsvc = ProcessRPCServer(node=self.container.node,
                                from_name=service_instance.id,
                                service=service_instance,
                                process=service_instance)

        proc = self.proc_sup.spawn(name=service_instance.id,
                                   service=service_instance,
                                   listeners=[rsvc],
                                   proc_name=service_instance._proc_name)
        self.proc_sup.ensure_ready(proc, "_spawn_agent_process for %s" % service_instance.id)

        # map gproc to service_instance
        self._spawned_proc_to_process[proc.proc] = service_instance

        # set service's reference to process
        service_instance._process = proc

        # Directory registration
        caps = service_instance.get_capabilities()
        self.container.directory.register("/Agents", service_instance.id,
                                          name=service_instance._proc_name,
                                          container=service_instance.container.id,
                                          resource_id=service_instance.resource_id,
                                          agent_id=service_instance.agent_id,
                                          def_id=service_instance.agent_def_id,
                                          capabilities=caps)

        if not service_instance.resource_id:
            log.warn("Agent process id=%s does not define resource_id!!", service_instance.id)

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: standalone

    def _spawn_standalone_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as standalone process.
        Attach to service pid.

        @return The initialized and started service instance
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)

        self._service_start(service_instance)

        rsvc = ProcessRPCServer(node=self.container.node,
                                from_name=service_instance.id,
                                service=service_instance,
                                process=service_instance)

        proc = self.proc_sup.spawn(name=service_instance.id,
                                   service=service_instance,
                                   listeners=[rsvc],
                                   proc_name=service_instance._proc_name)
        self.proc_sup.ensure_ready(proc, "_spawn_standalone_process for %s" % service_instance.id)

        # map gproc to service_instance
        self._spawned_proc_to_process[proc.proc] = service_instance

        # set service's reference to process
        service_instance._process = proc

        # Add publishers if any...
        publish_streams = get_safe(config, "process.publish_streams")
        self._set_publisher_endpoints(service_instance, publish_streams)

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: simple

    def _spawn_simple_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as simple process.
        No attachments.

        @return The initialized and started service instance
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)

        self._service_start(service_instance)

        # Add publishers if any...
        publish_streams = get_safe(config, "process.publish_streams")
        self._set_publisher_endpoints(service_instance, publish_streams)

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: immediate

    def _spawn_immediate_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as immediate one off process.
        No attachments.

        @return The initialized and started service instance
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)
        self._service_start(service_instance)
        return service_instance

    def _create_service_instance(self, process_id, name, module, cls, config):
        """
        Creates an instance of a "service", be it a Service, Agent, Stream, etc.

        @rtype BaseService
        @return An instance of a "service"
        @raise ContainerConfigError If the instantiated object is not a BaseService
        """
        # SERVICE INSTANCE.
        service_instance = for_name(module, cls)
        if not isinstance(service_instance, BaseService):
            raise ContainerConfigError("Instantiated service not a BaseService %r" % service_instance)

        # Prepare service instance
        service_instance.errcause = ""
        service_instance.id = process_id
        service_instance.container = self.container
        service_instance.CFG = config
        service_instance._proc_name = name

        # start service dependencies (RPC clients)
        self._start_service_dependencies(service_instance)

        return service_instance

    def _start_service_dependencies(self, service_instance):
        """
        Wires the dependency clients of the instance to this process and the
        container node, and checks that each dependency is registered under
        /Services in the directory.

        @raise ContainerConfigError If a checked dependency has no directory entry
        """
        service_instance.errcause = "setting service dependencies"
        log.debug("spawn_process dependencies: %s", service_instance.dependencies)
        # TODO: Service dependency != process dependency
        for dependency in service_instance.dependencies:
            client = getattr(service_instance.clients, dependency)
            assert client, "Client for dependency not found: %s" % dependency

            # @TODO: should be in a start_client in RPCClient chain
            client.process = service_instance
            client.node = self.container.node

            # ensure that dep actually exists and is running
            # For the 'bootstrap' service only the dependency equal to CFG.level is checked
            if service_instance.name != 'bootstrap' or service_instance.CFG.level == dependency:
                svc_de = self.container.directory.lookup("/Services/%s" % dependency)
                if svc_de is None:
                    raise ContainerConfigError("Dependency for service %s not running: %s" % (service_instance.name, dependency))

    def _service_init(self, service_instance):
        """Marks the instance as initializing and calls its init()."""
        # Init process
        service_instance.errcause = "initializing service"
        service_instance.init()

    def _service_start(self, service_instance):
        """Marks the instance as starting and calls its start()."""
        # Start process
        # TODO: Check for timeout
        service_instance.errcause = "starting service"
        service_instance.start()

    def _set_publisher_endpoints(self, service_instance, publisher_streams=None):
        """
        Creates a stream publisher registrar on the instance and one publisher
        attribute per configured stream.

        @param publisher_streams    Mapping of attribute name -> stream id; may be None
        """
        service_instance.stream_publisher_registrar = StreamPublisherRegistrar(process=service_instance,
                                                                               node=self.container.node)

        publisher_streams = publisher_streams or {}

        # NOTE(review): the original code flagged this loop with "problem is here"
        # without further detail - confirm whether an issue is still open.
        for name, stream_id in publisher_streams.items():
            pub = service_instance.stream_publisher_registrar.create_publisher(stream_id)
            setattr(service_instance, name, pub)

    def _register_process(self, service_instance, name):
        """
        Adds the instance to the local registries and the container directory
        and publishes a SPAWN ProcessLifecycleEvent.
        """
        # Add to local process dict
        self.procs_by_name[name] = service_instance
        self.procs[service_instance.id] = service_instance

        # Add to directory
        service_instance.errcause = "registering"
        self.container.directory.register_safe("/Containers/%s/Processes" % self.container.id,
                                               service_instance.id, name=name)

        self.container.event_pub.publish_event(event_type="ProcessLifecycleEvent",
                                               origin=service_instance.id,
                                               origin_type="ContainerProcess",
                                               sub_type="SPAWN",
                                               container_id=self.container.id,
                                               process_type=service_instance._proc_type,
                                               process_name=service_instance._proc_name,
                                               state=ProcessStateEnum.SPAWN)

    def terminate_process(self, process_id):
        """
        Terminates a process: calls its quit(), stops its ION process if it has
        one, removes it from the local registries and the directory and
        publishes a TERMINATE ProcessLifecycleEvent.

        @param process_id   The id of the process to terminate
        @raise BadRequest   If process_id is not registered in this container
        """
        service_instance = self.procs.get(process_id, None)
        if not service_instance:
            raise BadRequest("Cannot terminate. Process id='%s' unknown on container id='%s'" % (
                process_id, self.container.id))

        service_instance.quit()

        # terminate IonProcessThread (may not have one, i.e. simple process).
        # Simple and immediate processes never get _process assigned by this
        # manager, so do not assume the attribute exists.
        ion_process = getattr(service_instance, '_process', None)
        if ion_process:
            ion_process.notify_stop()
            ion_process.stop()

        del self.procs[process_id]

        # Also drop the by-name entry, but only if it still refers to this
        # instance (a later process with the same name may have replaced it).
        proc_name = getattr(service_instance, '_proc_name', None)
        if self.procs_by_name.get(proc_name) is service_instance:
            del self.procs_by_name[proc_name]

        self.container.directory.unregister_safe("/Containers/%s/Processes" % self.container.id, service_instance.id)

        # Cleanup for specific process types
        if service_instance._proc_type == "service":
            # Resolve the listen name exactly as _spawn_service_process did when registering
            listen_name = get_safe(service_instance.CFG, "process.listen_name") or service_instance.name
            self.container.directory.unregister_safe("/Services/%s" % listen_name, service_instance.id)
            remaining_workers = self.container.directory.find_entries("/Services/%s" % listen_name)
            # NOTE(review): the "== 2" threshold is kept from the original; what
            # find_entries returns for a service without workers is not visible
            # here - confirm against the directory implementation.
            if remaining_workers and len(remaining_workers) == 2:
                self.container.directory.unregister_safe("/Services", listen_name)

        elif service_instance._proc_type == "agent":
            self.container.directory.unregister_safe("/Agents", service_instance.id)

        self.container.event_pub.publish_event(event_type="ProcessLifecycleEvent",
                                               origin=service_instance.id,
                                               origin_type="ContainerProcess",
                                               sub_type="TERMINATE",
                                               container_id=self.container.id,
                                               process_type=service_instance._proc_type,
                                               process_name=service_instance._proc_name,
                                               state=ProcessStateEnum.TERMINATE)
class ProcManager(object): def __init__(self, container): self.container = container # Define the callables that can be added to Container public API self.container_api = [self.spawn_process, self.terminate_process] # Add the public callables to Container for call in self.container_api: setattr(self.container, call.__name__, call) self.proc_id_pool = IDPool() # Temporary registry of running processes self.procs_by_name = {} self.procs = {} # mapping of greenlets we spawn to service_instances for error handling self._spawned_proc_to_process = {} # The pyon worker process supervisor self.proc_sup = IonProcessThreadManager(heartbeat_secs=CFG.cc.timeout.heartbeat, failure_notify_callback=self._spawned_proc_failed) def start(self): log.debug("ProcManager starting ...") self.proc_sup.start() # Register container as resource object cc_obj = CapabilityContainer(name=self.container.id, cc_agent=self.container.name) self.cc_id, _ = self.container.resource_registry.create(cc_obj) #Create an association to an Org object if not the rot ION org and only if found if CFG.container.org_name and CFG.container.org_name != CFG.system.root_org: org,_ = self.container.resource_registry.find_resources(restype=RT.Org,name=CFG.container.org_name, id_only=True ) if org: self.container.resource_registry.create_association(org[0],PRED.hasResource, self.cc_id) #TODO - replace with proper association log.debug("ProcManager started, OK.") def stop(self): log.debug("ProcManager stopping ...") from pyon.datastore.couchdb.couchdb_datastore import CouchDB_DataStore stats1 = CouchDB_DataStore._stats.get_stats() # Call quit on procs to give them ability to clean up # @TODO terminate_process is not gl-safe # gls = map(lambda k: spawn(self.terminate_process, k), self.procs.keys()) # join(gls) procs_list = sorted(self.procs.values(), key=lambda proc: proc._proc_start_time, reverse=True) for proc in procs_list: self.terminate_process(proc.id) # TODO: Have a choice of shutdown behaviors for waiting on 
children, timeouts, etc self.proc_sup.shutdown(CFG.cc.timeout.shutdown) if self.procs: log.warn("ProcManager procs not empty: %s", self.procs) if self.procs_by_name: log.warn("ProcManager procs_by_name not empty: %s", self.procs_by_name) # Remove Resource registration self.container.resource_registry.delete(self.cc_id, del_associations=True) # TODO: Check associations to processes stats2 = CouchDB_DataStore._stats.get_stats() stats3 = CouchDB_DataStore._stats.diff_stats(stats2, stats1) log.debug("Datastore stats difference during stop(): %s", stats3) log.debug("ProcManager stopped, OK.") def spawn_process(self, name=None, module=None, cls=None, config=None, process_id=None): """ Spawn a process within the container. Processes can be of different type. """ if process_id and not is_valid_identifier(process_id, ws_sub='_'): raise BadRequest("Given process_id %s is not a valid identifier" % process_id) # Generate a new process id if not provided # TODO: Ensure it is system-wide unique process_id = process_id or "%s.%s" % (self.container.id, self.proc_id_pool.get_id()) log.debug("ProcManager.spawn_process(name=%s, module.cls=%s.%s, config=%s) as pid=%s", name, module, cls, config, process_id) process_cfg = CFG.copy() if config: # Use provided config. Must be dict or DotDict if not isinstance(config, DotDict): config = DotDict(config) dict_merge(process_cfg, config, inplace=True) if self.container.spawn_args: # Override config with spawn args dict_merge(process_cfg, self.container.spawn_args, inplace=True) #log.debug("spawn_process() pid=%s process_cfg=%s", process_id, process_cfg) # PROCESS TYPE. 
Determines basic process context (messaging, service interface) # One of: service, stream_process, agent, simple, immediate service_cls = named_any("%s.%s" % (module, cls)) process_type = get_safe(process_cfg, "process.type") or getattr(service_cls, "process_type", "service") service_instance = None try: # spawn service by type if process_type == "service": service_instance = self._spawn_service_process(process_id, name, module, cls, process_cfg) elif process_type == "stream_process": service_instance = self._spawn_stream_process(process_id, name, module, cls, process_cfg) elif process_type == "agent": service_instance = self._spawn_agent_process(process_id, name, module, cls, process_cfg) elif process_type == "standalone": service_instance = self._spawn_standalone_process(process_id, name, module, cls, process_cfg) elif process_type == "immediate": service_instance = self._spawn_immediate_process(process_id, name, module, cls, process_cfg) elif process_type == "simple": service_instance = self._spawn_simple_process(process_id, name, module, cls, process_cfg) else: raise BadRequest("Unknown process type: %s" % process_type) service_instance._proc_type = process_type self._register_process(service_instance, name) service_instance.errcause = "OK" log.info("ProcManager.spawn_process: %s.%s -> pid=%s OK", module, cls, process_id) if process_type == 'immediate': log.info('Terminating immediate process: %s', service_instance.id) self.terminate_process(service_instance.id) return service_instance.id except Exception: errcause = service_instance.errcause if service_instance else "instantiating service" log.exception("Error spawning %s %s process (process_id: %s): %s", name, process_type, process_id, errcause) raise def list_local_processes(self, process_type=''): ''' Returns a list of the running ION processes in the container or filtered by the process_type ''' ret = list() for p in self.procs.values(): if process_type and p.process_type != process_type: continue 
ret.append(p) return ret def list_local_process_names(self, process_type=''): ''' Returns a list of the running ION processes in the container or filtered by the process_type ''' ret = list() for p in self.procs.values(): if process_type and p.process_type != process_type: continue ret.append(p.name) return ret def is_local_service_process(self, service_name): local_services = self.list_local_process_names('service') if service_name in local_services: return True return False def _spawned_proc_failed(self, gproc): log.error("ProcManager._spawned_proc_failed: %s, %s", gproc, gproc.exception) # for now - don't worry about the mapping, if we get a failure, just kill the container. # leave the mapping in place for potential expansion later. # # look it up in mapping # if not gproc in self._spawned_proc_to_process: # log.warn("No record of gproc %s in our map (%s)", gproc, self._spawned_proc_to_process) # return # svc = self._spawned_proc_to_process.get(gproc, "Unknown") # # # make sure svc is in our list # if not svc in self.procs.values(): # log.warn("svc %s not found in procs list", svc) # return self.container.fail_fast("Container process (%s) failed: %s" % (svc, gproc.exception)) def _cleanup_method(self, queue_name, ep=None): """ Common method to be passed to each spawned ION process to clean up their process-queue. @TODO Leaks implementation detail, should be using XOs """ if not ep._chan._queue_auto_delete: # only need to delete if AMQP didn't handle it for us already! # @TODO this will not work with XOs (future) ch = self.container.node.channel(RecvChannel) ch._recv_name = NameTrio(get_sys_name(), "%s.%s" % (get_sys_name(), queue_name)) ch._destroy_queue() #TODO - check with Michael if this is acceptable or if there is a better way. def _is_policy_management_service_available(self): """ Method to verify if the Policy Management Service is running in the system. 
""" policy_services, _ = self.container.resource_registry.find_resources(restype=RT.Service,name='policy_management') if policy_services: return True return False # ----------------------------------------------------------------- # PROCESS TYPE: service def _spawn_service_process(self, process_id, name, module, cls, config): """ Spawn a process acting as a service worker. Attach to service queue with service definition, attach to service pid """ service_instance = self._create_service_instance(process_id, name, module, cls, config) listen_name = get_safe(config, "process.listen_name") or service_instance.name log.debug("Service Process (%s) listen_name: %s", name, listen_name) service_instance._proc_listen_name = listen_name # Service RPC endpoint rsvc1 = ProcessRPCServer(node=self.container.node, from_name=listen_name, service=service_instance, process=service_instance) # Named local RPC endpoint rsvc2 = ProcessRPCServer(node=self.container.node, from_name=service_instance.id, service=service_instance, process=service_instance) # cleanup method to delete process queue cleanup = lambda _: self._cleanup_method(service_instance.id, rsvc2) # Start an ION process with the right kind of endpoint factory proc = self.proc_sup.spawn(name=service_instance.id, service=service_instance, listeners=[rsvc1, rsvc2], proc_name=service_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready(proc, "_spawn_service_process for %s" % ",".join((listen_name, service_instance.id))) # map gproc to service_instance self._spawned_proc_to_process[proc.proc] = service_instance # set service's reference to process service_instance._process = proc self._service_init(service_instance) self._service_start(service_instance) proc.start_listeners() # look to load any existing policies for this service if self._is_policy_management_service_available() and self.container.governance_controller: 
self.container.governance_controller.update_service_access_policy(service_instance._proc_listen_name) return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: stream process def _spawn_stream_process(self, process_id, name, module, cls, config): """ Spawn a process acting as a data stream process. Attach to subscription queue with process function. """ service_instance = self._create_service_instance(process_id, name, module, cls, config) listen_name = get_safe(config, "process.listen_name") or name service_instance._proc_listen_name = listen_name service_instance.stream_subscriber_registrar = StreamSubscriberRegistrar(process=service_instance, container=self.container) sub = service_instance.stream_subscriber_registrar.create_subscriber(exchange_name=listen_name) # Add publishers if any... publish_streams = get_safe(config, "process.publish_streams") self._set_publisher_endpoints(service_instance, publish_streams) rsvc = ProcessRPCServer(node=self.container.node, from_name=service_instance.id, service=service_instance, process=service_instance) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) cleanup = lambda _: self._cleanup_method(service_instance.id, rsvc) proc = self.proc_sup.spawn(name=service_instance.id, service=service_instance, listeners=[rsvc, sub], proc_name=service_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready(proc, "_spawn_stream_process for %s" % service_instance._proc_name) # map gproc to service_instance self._spawned_proc_to_process[proc.proc] = service_instance # set service's reference to process service_instance._process = proc self._service_init(service_instance) self._service_start(service_instance) proc.start_listeners() return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: agent def _spawn_agent_process(self, process_id, name, module, cls, config): """ Spawn a process acting as 
agent process. Attach to service pid. """ service_instance = self._create_service_instance(process_id, name, module, cls, config) if not isinstance(service_instance, ResourceAgent): raise ContainerConfigError("Agent process must extend ResourceAgent") # Set the resource ID if we get it through the config resource_id = get_safe(service_instance.CFG, "agent.resource_id") if resource_id: service_instance.resource_id = resource_id rsvc = ProcessRPCServer(node=self.container.node, from_name=service_instance.id, service=service_instance, process=service_instance) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) cleanup = lambda _: self._cleanup_method(service_instance.id, rsvc) proc = self.proc_sup.spawn(name=service_instance.id, service=service_instance, listeners=[rsvc], proc_name=service_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready(proc, "_spawn_agent_process for %s" % service_instance.id) # map gproc to service_instance self._spawned_proc_to_process[proc.proc] = service_instance # set service's reference to process service_instance._process = proc # Now call the on_init of the agent. self._service_init(service_instance) if not service_instance.resource_id: log.warn("New agent pid=%s has no resource_id set" % process_id) self._service_start(service_instance) proc.start_listeners() if service_instance.resource_id: # look to load any existing policies for this resource if self._is_policy_management_service_available() and self.container.governance_controller: self.container.governance_controller.update_resource_access_policy(service_instance.resource_id) else: log.warn("Agent process id=%s does not define resource_id!!" % service_instance.id) return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: standalone def _spawn_standalone_process(self, process_id, name, module, cls, config): """ Spawn a process acting as standalone process. Attach to service pid. 
""" service_instance = self._create_service_instance(process_id, name, module, cls, config) rsvc = ProcessRPCServer(node=self.container.node, from_name=service_instance.id, service=service_instance, process=service_instance) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) cleanup = lambda _: self._cleanup_method(service_instance.id, rsvc) proc = self.proc_sup.spawn(name=service_instance.id, service=service_instance, listeners=[rsvc], proc_name=service_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready(proc, "_spawn_standalone_process for %s" % service_instance.id) # map gproc to service_instance self._spawned_proc_to_process[proc.proc] = service_instance # set service's reference to process service_instance._process = proc self._service_init(service_instance) self._service_start(service_instance) # Add publishers if any... publish_streams = get_safe(config, "process.publish_streams") self._set_publisher_endpoints(service_instance, publish_streams) proc.start_listeners() return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: simple def _spawn_simple_process(self, process_id, name, module, cls, config): """ Spawn a process acting as simple process. No attachments. """ service_instance = self._create_service_instance(process_id, name, module, cls, config) self._service_init(service_instance) self._service_start(service_instance) # Add publishers if any... publish_streams = get_safe(config, "process.publish_streams") self._set_publisher_endpoints(service_instance, publish_streams) return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: immediate def _spawn_immediate_process(self, process_id, name, module, cls, config): """ Spawn a process acting as immediate one off process. No attachments. 
""" service_instance = self._create_service_instance(process_id, name, module, cls, config) self._service_init(service_instance) self._service_start(service_instance) return service_instance def _create_service_instance(self, process_id, name, module, cls, config): """ Creates an instance of a "service", be it a Service, Agent, Stream, etc. @rtype BaseService @return An instance of a "service" """ # SERVICE INSTANCE. service_instance = for_name(module, cls) if not isinstance(service_instance, BaseService): raise ContainerConfigError("Instantiated service not a BaseService %r" % service_instance) # Prepare service instance service_instance.errcause = "" service_instance.id = process_id service_instance.container = self.container service_instance.CFG = config service_instance._proc_name = name service_instance._proc_start_time = time.time() # start service dependencies (RPC clients) self._start_service_dependencies(service_instance) return service_instance def _start_service_dependencies(self, service_instance): service_instance.errcause = "setting service dependencies" log.debug("spawn_process dependencies: %s", service_instance.dependencies) # TODO: Service dependency != process dependency for dependency in service_instance.dependencies: client = getattr(service_instance.clients, dependency) assert client, "Client for dependency not found: %s" % dependency # @TODO: should be in a start_client in RPCClient chain client.process = service_instance client.node = self.container.node # ensure that dep actually exists and is running # MM: commented out - during startup (init actually), we don't need to check for service dependencies # MM: TODO: split on_init from on_start; start consumer in on_start; check for full queues on restart # if service_instance.name != 'bootstrap' or (service_instance.name == 'bootstrap' and service_instance.CFG.level == dependency): # svc_de = self.container.resource_registry.find_resources(restype="Service", name=dependency, id_only=True) # if 
not svc_de: # raise ContainerConfigError("Dependency for service %s not running: %s" % (service_instance.name, dependency)) def _service_init(self, service_instance): # Init process service_instance.errcause = "initializing service" service_instance.init() def _service_start(self, service_instance): # Start process # TODO: Check for timeout service_instance.errcause = "starting service" service_instance.start() def _set_publisher_endpoints(self, service_instance, publisher_streams=None): service_instance.stream_publisher_registrar = StreamPublisherRegistrar(process=service_instance, container=self.container) publisher_streams = publisher_streams or {} for name, stream_id in publisher_streams.iteritems(): # problem is here pub = service_instance.stream_publisher_registrar.create_publisher(stream_id) setattr(service_instance, name, pub) def _register_process(self, service_instance, name): """ Performs all actions related to registering the new process in the system. Also performs process type specific registration, such as for services and agents """ # Add process instance to container's process dict if name in self.procs_by_name: log.warn("Process name already registered in container: %s" % name) self.procs_by_name[name] = service_instance self.procs[service_instance.id] = service_instance # Add Process to resource registry # Note: In general the Process resource should be created by the CEI PD, but not all processes are CEI # processes. How to deal with this? 
service_instance.errcause = "registering" if service_instance._proc_type != "immediate": proc_obj = Process(name=service_instance.id, label=name, proctype=service_instance._proc_type) proc_id, _ = self.container.resource_registry.create(proc_obj) service_instance._proc_res_id = proc_id # Associate process with container resource self.container.resource_registry.create_association(self.cc_id, "hasProcess", proc_id) else: service_instance._proc_res_id = None # Process type specific registration # TODO: Factor out into type specific handler functions if service_instance._proc_type == "service": # Registration of SERVICE process: in resource registry service_list, _ = self.container.resource_registry.find_resources(restype="Service", name=service_instance.name) if service_list: service_instance._proc_svc_id = service_list[0]._id else: # We are starting the first process of a service instance # TODO: This should be created by the HA Service agent in the future svc_obj = Service(name=service_instance.name, exchange_name=service_instance._proc_listen_name) service_instance._proc_svc_id, _ = self.container.resource_registry.create(svc_obj) # Create association to service definition resource svcdef_list, _ = self.container.resource_registry.find_resources(restype="ServiceDefinition", name=service_instance.name) if svcdef_list: self.container.resource_registry.create_association(service_instance._proc_svc_id, "hasServiceDefinition", svcdef_list[0]._id) else: log.error("Cannot find ServiceDefinition resource for %s", service_instance.name) self.container.resource_registry.create_association(service_instance._proc_svc_id, "hasProcess", proc_id) elif service_instance._proc_type == "agent": # Registration of AGENT process: in Directory caps = service_instance.get_capabilities() self.container.directory.register("/Agents", service_instance.id, **dict(name=service_instance._proc_name, container=service_instance.container.id, resource_id=service_instance.resource_id, 
agent_id=service_instance.agent_id, def_id=service_instance.agent_def_id, capabilities=caps)) # Trigger a real-time event. At this time, everything persistent has to be completed and consistent. self.container.event_pub.publish_event(event_type="ProcessLifecycleEvent", origin=service_instance.id, origin_type="ContainerProcess", sub_type="SPAWN", container_id=self.container.id, process_type=service_instance._proc_type, process_name=service_instance._proc_name, state=ProcessStateEnum.SPAWN) def terminate_process(self, process_id): """ Terminates a process and all its resources. Termination is graceful with timeout. """ log.debug("terminate_process: %s", process_id) service_instance = self.procs.get(process_id, None) if not service_instance: raise BadRequest("Cannot terminate. Process id='%s' unknown on container id='%s'" % ( process_id, self.container.id)) # Give the process notice to quit doing stuff. service_instance.quit() # Terminate IonProcessThread (may not have one, i.e. simple process) if service_instance._process: service_instance._process.notify_stop() service_instance._process.stop() self._unregister_process(process_id, service_instance) # Send out real-time notice that process was terminated. At this point, everything persistent # has to be consistent. 
self.container.event_pub.publish_event(event_type="ProcessLifecycleEvent", origin=service_instance.id, origin_type="ContainerProcess", sub_type="TERMINATE", container_id=self.container.id, process_type=service_instance._proc_type, process_name=service_instance._proc_name, state=ProcessStateEnum.TERMINATE) def _unregister_process(self, process_id, service_instance): # Remove process registration in resource registry if service_instance._proc_res_id: self.container.resource_registry.delete(service_instance._proc_res_id, del_associations=True) # Cleanup for specific process types if service_instance._proc_type == "service": # Check if this is the last process for this service and do auto delete service resources here svcproc_list, _ = self.container.resource_registry.find_objects(service_instance._proc_svc_id, "hasProcess", "Process", id_only=True) if not svcproc_list: self.container.resource_registry.delete(service_instance._proc_svc_id, del_associations=True) elif service_instance._proc_type == "agent": self.container.directory.unregister_safe("/Agents", service_instance.id) # Remove internal registration in container del self.procs[process_id] if service_instance._proc_name in self.procs_by_name: del self.procs_by_name[service_instance._proc_name] else: log.warn("Process name %s not in local registry", service_instance.name)
class ProcManager(object):
    """
    Manages the ION processes running inside one capability container.

    Spawns processes of the supported types (service, stream_process, agent,
    standalone, simple, immediate), keeps them in local registries
    (``procs`` keyed by process id, ``procs_by_name`` keyed by process name),
    registers them in the container directory and terminates them again.
    """

    def __init__(self, container):
        """
        @param container  The container this manager belongs to. The manager's
                          public API callables are attached to it as attributes.
        """
        self.container = container

        # Define the callables that can be added to Container public API
        self.container_api = [self.spawn_process, self.terminate_process]

        # Add the public callables to Container
        for call in self.container_api:
            setattr(self.container, call.__name__, call)

        self.proc_id_pool = IDPool()

        # Temporary registry of running processes
        self.procs_by_name = {}
        self.procs = {}

        # mapping of greenlets we spawn to service_instances for error handling
        self._spawned_proc_to_process = {}

        # The pyon worker process supervisor
        self.proc_sup = IonProcessThreadManager(
            heartbeat_secs=CFG.cc.timeout.heartbeat,
            failure_notify_callback=self._spawned_proc_failed)

    def start(self):
        """ Starts the process supervisor. """
        log.debug("ProcManager starting ...")
        self.proc_sup.start()
        log.debug("ProcManager started, OK.")

    def stop(self):
        """
        Terminates all locally registered processes, then shuts down the
        process supervisor with the configured shutdown timeout.
        """
        log.debug("ProcManager stopping ...")

        # Call quit on procs to give them ability to clean up
        # @TODO terminate_process is not gl-safe
        # Iterate over a snapshot of the ids: terminate_process() deletes from
        # self.procs. An explicit loop is used (rather than map()) so that the
        # terminations are guaranteed to run even where map() is lazy.
        for process_id in list(self.procs.keys()):
            self.terminate_process(process_id)

        # TODO: Have a choice of shutdown behaviors for waiting on children, timeouts, etc
        self.proc_sup.shutdown(CFG.cc.timeout.shutdown)

        log.debug("ProcManager stopped, OK.")

    def spawn_process(self, name=None, module=None, cls=None, config=None, process_id=None):
        """
        Spawn a process within the container. Processes can be of different type.

        @param name        Name of the process
        @param module      Module name containing the process class
        @param cls         Name of the process class within module
        @param config      Optional dict/DotDict overriding the system CFG for this process
        @param process_id  Optional process id; generated from the container id
                           and the id pool if not given
        @retval The id of the spawned process
        @throws BadRequest if process_id is not a valid identifier or the
                process type is unknown. Any other exception raised while
                spawning is logged and re-raised.
        """
        if process_id and not is_valid_identifier(process_id, ws_sub='_'):
            raise BadRequest("Given process_id %s is not a valid identifier" % process_id)

        # Generate a new process id if not provided
        # TODO: Ensure it is system-wide unique
        process_id = process_id or "%s.%s" % (self.container.id, self.proc_id_pool.get_id())
        log.debug("ProcManager.spawn_process(name=%s, module.cls=%s.%s) as pid=%s",
                  name, module, cls, process_id)

        if not config:
            # Use system CFG. It has the command line args in it
            config = DictModifier(CFG)
        else:
            # Use provided config. Must be dict or DotDict
            if not isinstance(config, DotDict):
                config = DotDict(config)
            config = DictModifier(CFG, config)

        if self.container.spawn_args:
            # Override config with spawn args
            dict_merge(config, self.container.spawn_args, inplace=True)

        #log.debug("spawn_process() pid=%s config=%s", process_id, config)

        # PROCESS TYPE. Determines basic process context (messaging, service interface)
        # One of: service, stream_process, agent, standalone, simple, immediate
        # The config value wins over the class attribute; default is "service".
        service_cls = named_any("%s.%s" % (module, cls))
        process_type = get_safe(config, "process.type") or getattr(service_cls, "process_type", "service")

        service_instance = None
        try:
            # spawn service by type
            if process_type == "service":
                service_instance = self._spawn_service_process(process_id, name, module, cls, config)
            elif process_type == "stream_process":
                service_instance = self._spawn_stream_process(process_id, name, module, cls, config)
            elif process_type == "agent":
                service_instance = self._spawn_agent_process(process_id, name, module, cls, config)
            elif process_type == "standalone":
                service_instance = self._spawn_standalone_process(process_id, name, module, cls, config)
            elif process_type == "immediate":
                service_instance = self._spawn_immediate_process(process_id, name, module, cls, config)
            elif process_type == "simple":
                service_instance = self._spawn_simple_process(process_id, name, module, cls, config)
            else:
                raise BadRequest("Unknown process type: %s" % process_type)

            service_instance._proc_type = process_type
            self._register_process(service_instance, name)

            service_instance.errcause = "OK"
            log.info("ProcManager.spawn_process: %s.%s -> pid=%s OK", module, cls, process_id)

            if process_type == 'immediate':
                # Immediate processes are one-off: terminate right after spawning
                log.info('Terminating immediate process: %s', service_instance.id)
                self.terminate_process(service_instance.id)

            return service_instance.id

        except Exception:
            # service_instance is only bound once a _spawn_* call has returned
            errcause = service_instance.errcause if service_instance else "instantiating service"
            log.exception("Error spawning %s %s process (process_id: %s): %s",
                          name, process_type, process_id, errcause)
            raise

    def _spawned_proc_failed(self, gproc):
        """
        Failure callback given to the process supervisor: fails the container.

        @param gproc  The failed spawned greenlet/process; its ``exception``
                      attribute is included in the failure message.
        """
        log.error("ProcManager._spawned_proc_failed: %s", gproc)

        # for now - don't worry about the mapping, if we get a failure, just kill the container.
        # leave the mapping in place for potential expansion later.
        # The mapping is only consulted to name the failed process in the
        # message; an unmapped gproc is reported as "Unknown".
        svc = self._spawned_proc_to_process.get(gproc, "Unknown")

        self.container.fail_fast("Container process (%s) failed: %s" % (svc, gproc.exception))

    # -----------------------------------------------------------------
    # Helpers shared by the process types that run an ION process

    def _new_rpc_server(self, service_instance, from_name):
        """ Creates an RPC server endpoint for the process, listening on from_name. """
        return ProcessRPCServer(node=self.container.node,
                                from_name=from_name,
                                service=service_instance,
                                process=service_instance)

    def _start_ion_process(self, service_instance, listeners, ready_msg):
        """
        Spawns the supervised process for service_instance with the given
        listeners, waits until it is ready and links process and instance.
        Returns the spawned process.
        """
        proc = self.proc_sup.spawn(name=service_instance.id,
                                   service=service_instance,
                                   listeners=listeners,
                                   proc_name=service_instance._proc_name)
        self.proc_sup.ensure_ready(proc, ready_msg)

        # map gproc to service_instance
        self._spawned_proc_to_process[proc.proc] = service_instance

        # set service's reference to process
        service_instance._process = proc
        return proc

    # -----------------------------------------------------------------
    # PROCESS TYPE: service

    def _spawn_service_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as a service worker.
        Attach to service queue with service definition, attach to service pid
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)
        self._service_start(service_instance)

        listen_name = get_safe(config, "process.listen_name") or service_instance.name
        log.debug("Service Process (%s) listen_name: %s", name, listen_name)

        # Service RPC endpoint
        rsvc1 = self._new_rpc_server(service_instance, listen_name)
        # Named local RPC endpoint
        rsvc2 = self._new_rpc_server(service_instance, service_instance.id)

        # Start an ION process with the right kind of endpoint factory
        self._start_ion_process(
            service_instance, [rsvc1, rsvc2],
            "_spawn_service_process for %s" % ",".join((listen_name, service_instance.id)))

        # Directory registration
        self.container.directory.register_safe("/Services", listen_name, interface=service_instance.name)
        self.container.directory.register_safe("/Services/%s" % listen_name, service_instance.id)

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: stream process

    def _spawn_stream_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as a data stream process.
        Attach to subscription queue with process function.
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)

        # Start the service
        self._service_start(service_instance)

        listen_name = get_safe(config, "process.listen_name") or name

        service_instance.stream_subscriber_registrar = StreamSubscriberRegistrar(
            process=service_instance, node=self.container.node)
        sub = service_instance.stream_subscriber_registrar.create_subscriber(exchange_name=listen_name)

        # Add publishers if any...
        publish_streams = get_safe(config, "process.publish_streams")
        self._set_publisher_endpoints(service_instance, publish_streams)

        rsvc = self._new_rpc_server(service_instance, service_instance.id)

        self._start_ion_process(
            service_instance, [rsvc, sub],
            "_spawn_stream_process for %s" % service_instance._proc_name)

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: agent

    def _spawn_agent_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as agent process.
        Attach to service pid.

        @throws ContainerConfigError if the instantiated class is not a ResourceAgent
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        if not isinstance(service_instance, ResourceAgent):
            raise ContainerConfigError("Agent process must extend ResourceAgent")

        # Set the resource ID if we get it through the config
        resource_id = get_safe(service_instance.CFG, "agent.resource_id")
        if resource_id:
            service_instance.resource_id = resource_id

        # Now call the on_init of the agent.
        self._service_init(service_instance)

        if not service_instance.resource_id:
            log.warn("New agent pid=%s has no resource_id set", process_id)

        self._service_start(service_instance)

        rsvc = self._new_rpc_server(service_instance, service_instance.id)

        self._start_ion_process(
            service_instance, [rsvc],
            "_spawn_agent_process for %s" % service_instance.id)

        # Directory registration
        caps = service_instance.get_capabilities()
        self.container.directory.register(
            "/Agents", service_instance.id,
            **dict(name=service_instance._proc_name,
                   container=service_instance.container.id,
                   resource_id=service_instance.resource_id,
                   agent_id=service_instance.agent_id,
                   def_id=service_instance.agent_def_id,
                   capabilities=caps))

        if not service_instance.resource_id:
            log.warn("Agent process id=%s does not define resource_id!!", service_instance.id)

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: standalone

    def _spawn_standalone_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as standalone process.
        Attach to service pid.
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)
        self._service_start(service_instance)

        rsvc = self._new_rpc_server(service_instance, service_instance.id)

        self._start_ion_process(
            service_instance, [rsvc],
            "_spawn_standalone_process for %s" % service_instance.id)

        # Add publishers if any...
        publish_streams = get_safe(config, "process.publish_streams")
        self._set_publisher_endpoints(service_instance, publish_streams)

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: simple

    def _spawn_simple_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as simple process.
        No attachments.
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)
        self._service_start(service_instance)

        # Add publishers if any...
        publish_streams = get_safe(config, "process.publish_streams")
        self._set_publisher_endpoints(service_instance, publish_streams)

        return service_instance

    # -----------------------------------------------------------------
    # PROCESS TYPE: immediate

    def _spawn_immediate_process(self, process_id, name, module, cls, config):
        """
        Spawn a process acting as immediate one off process.
        No attachments.
        """
        service_instance = self._create_service_instance(process_id, name, module, cls, config)
        self._service_init(service_instance)
        self._service_start(service_instance)
        return service_instance

    def _create_service_instance(self, process_id, name, module, cls, config):
        """
        Creates an instance of a "service", be it a Service, Agent, Stream, etc.

        @rtype BaseService
        @return An instance of a "service"
        @throws ContainerConfigError if the instantiated object is not a BaseService
        """
        # SERVICE INSTANCE.
        service_instance = for_name(module, cls)
        if not isinstance(service_instance, BaseService):
            raise ContainerConfigError("Instantiated service not a BaseService %r" % service_instance)

        # Prepare service instance
        service_instance.errcause = ""
        service_instance.id = process_id
        service_instance.container = self.container
        service_instance.CFG = config
        service_instance._proc_name = name

        # start service dependencies (RPC clients)
        self._start_service_dependencies(service_instance)

        return service_instance

    def _start_service_dependencies(self, service_instance):
        """
        Binds the client of each declared dependency to this process and the
        container node, and checks that the dependency is registered in the
        directory under /Services.

        @throws ContainerConfigError if a checked dependency is not registered
        """
        service_instance.errcause = "setting service dependencies"
        log.debug("spawn_process dependencies: %s", service_instance.dependencies)
        # TODO: Service dependency != process dependency
        for dependency in service_instance.dependencies:
            client = getattr(service_instance.clients, dependency)
            assert client, "Client for dependency not found: %s" % dependency

            # @TODO: should be in a start_client in RPCClient chain
            client.process = service_instance
            client.node = self.container.node

            # ensure that dep actually exists and is running
            # For the 'bootstrap' service, only the dependency equal to its
            # configured CFG.level is checked; all other services check every
            # dependency. (CFG.level is only read for 'bootstrap'.)
            if service_instance.name != 'bootstrap' or service_instance.CFG.level == dependency:
                svc_de = self.container.directory.lookup("/Services/%s" % dependency)
                if svc_de is None:
                    raise ContainerConfigError("Dependency for service %s not running: %s" % (
                        service_instance.name, dependency))

    def _service_init(self, service_instance):
        """ Marks the instance as initializing and calls its init(). """
        # Init process
        service_instance.errcause = "initializing service"
        service_instance.init()

    def _service_start(self, service_instance):
        """ Marks the instance as starting and calls its start(). """
        # Start process
        # TODO: Check for timeout
        service_instance.errcause = "starting service"
        service_instance.start()

    def _set_publisher_endpoints(self, service_instance, publisher_streams=None):
        """
        Attaches a stream publisher registrar to the instance and creates one
        publisher per configured stream, set as attribute on the instance.

        @param publisher_streams  Optional dict mapping attribute name to stream id
        """
        service_instance.stream_publisher_registrar = StreamPublisherRegistrar(
            process=service_instance, node=self.container.node)

        publisher_streams = publisher_streams or {}

        # items() (not the Python 2 only iteritems()) so any dict works
        for name, stream_id in publisher_streams.items():
            # problem is here
            pub = service_instance.stream_publisher_registrar.create_publisher(stream_id)
            setattr(service_instance, name, pub)

    def _register_process(self, service_instance, name):
        """
        Registers the process in the local registries and in the container's
        directory, then publishes a SPAWN ProcessLifecycleEvent.
        """
        # Add to local process dict
        self.procs_by_name[name] = service_instance
        self.procs[service_instance.id] = service_instance

        # Add to directory
        service_instance.errcause = "registering"
        self.container.directory.register_safe("/Containers/%s/Processes" % self.container.id,
                                               service_instance.id, name=name)

        self.container.event_pub.publish_event(event_type="ProcessLifecycleEvent",
                                               origin=service_instance.id,
                                               origin_type="ContainerProcess",
                                               sub_type="SPAWN",
                                               container_id=self.container.id,
                                               process_type=service_instance._proc_type,
                                               process_name=service_instance._proc_name,
                                               state=ProcessStateEnum.SPAWN)

    def terminate_process(self, process_id):
        """
        Terminates a locally registered process: calls its quit(), stops its
        supervised process if it has one, removes all registrations and
        publishes a TERMINATE ProcessLifecycleEvent.

        @param process_id  Id of a process registered in this container
        @throws BadRequest if the process id is unknown in this container
        """
        service_instance = self.procs.get(process_id, None)
        if not service_instance:
            raise BadRequest("Cannot terminate. Process id='%s' unknown on container id='%s'" % (
                process_id, self.container.id))

        service_instance.quit()

        # terminate IonProcessThread (may not have one, i.e. simple process)
        if service_instance._process:
            service_instance._process.notify_stop()
            service_instance._process.stop()

        del self.procs[process_id]

        # Also drop the by-name registration made in _register_process, but
        # only if it still points to this instance (a later process with the
        # same name may have replaced the entry).
        proc_name = service_instance._proc_name
        if self.procs_by_name.get(proc_name) is service_instance:
            del self.procs_by_name[proc_name]

        self.container.directory.unregister_safe("/Containers/%s/Processes" % self.container.id,
                                                 service_instance.id)

        # Cleanup for specific process types
        if service_instance._proc_type == "service":
            # Resolve the listen name exactly as _spawn_service_process does,
            # so the same directory path is unregistered that was registered.
            listen_name = get_safe(service_instance.CFG, "process.listen_name") or service_instance.name
            self.container.directory.unregister_safe("/Services/%s" % listen_name, service_instance.id)
            remaining_workers = self.container.directory.find_entries("/Services/%s" % listen_name)
            # NOTE(review): the service entry is removed when exactly 2 entries
            # remain - presumably these are non-worker entries; confirm against
            # the directory's find_entries semantics.
            if remaining_workers and len(remaining_workers) == 2:
                self.container.directory.unregister_safe("/Services", listen_name)

        elif service_instance._proc_type == "agent":
            self.container.directory.unregister_safe("/Agents", service_instance.id)

        self.container.event_pub.publish_event(event_type="ProcessLifecycleEvent",
                                               origin=service_instance.id,
                                               origin_type="ContainerProcess",
                                               sub_type="TERMINATE",
                                               container_id=self.container.id,
                                               process_type=service_instance._proc_type,
                                               process_name=service_instance._proc_name,
                                               state=ProcessStateEnum.TERMINATE)
class ProcManager(object): def __init__(self, container): self.container = container # Define the callables that can be added to Container public API self.container_api = [self.spawn_process, self.terminate_process] # Add the public callables to Container for call in self.container_api: setattr(self.container, call.__name__, call) self.proc_id_pool = IDPool() # Temporary registry of running processes self.procs_by_name = {} self.procs = {} # mapping of greenlets we spawn to service_instances for error handling self._spawned_proc_to_process = {} # The pyon worker process supervisor self.proc_sup = IonProcessThreadManager( heartbeat_secs=CFG.cc.timeout.heartbeat, failure_notify_callback=self._spawned_proc_failed) def start(self): log.debug("ProcManager starting ...") self.proc_sup.start() # Register container as resource object cc_obj = CapabilityContainer(name=self.container.id, cc_agent=self.container.name) self.cc_id, _ = self.container.resource_registry.create(cc_obj) #Create an association to an Org object if not the rot ION org and only if found if CFG.container.org_name and CFG.container.org_name != CFG.system.root_org: org, _ = self.container.resource_registry.find_resources( restype=RT.Org, name=CFG.container.org_name, id_only=True) if org: self.container.resource_registry.create_association( org[0], PRED.hasResource, self.cc_id) #TODO - replace with proper association log.debug("ProcManager started, OK.") def stop(self): log.debug("ProcManager stopping ...") from pyon.datastore.couchdb.couchdb_datastore import CouchDB_DataStore stats1 = CouchDB_DataStore._stats.get_stats() # Call quit on procs to give them ability to clean up # @TODO terminate_process is not gl-safe # gls = map(lambda k: spawn(self.terminate_process, k), self.procs.keys()) # join(gls) procs_list = sorted(self.procs.values(), key=lambda proc: proc._proc_start_time, reverse=True) for proc in procs_list: self.terminate_process(proc.id) # TODO: Have a choice of shutdown behaviors for waiting on 
children, timeouts, etc self.proc_sup.shutdown(CFG.cc.timeout.shutdown) if self.procs: log.warn("ProcManager procs not empty: %s", self.procs) if self.procs_by_name: log.warn("ProcManager procs_by_name not empty: %s", self.procs_by_name) # Remove Resource registration self.container.resource_registry.delete(self.cc_id, del_associations=True) # TODO: Check associations to processes stats2 = CouchDB_DataStore._stats.get_stats() stats3 = CouchDB_DataStore._stats.diff_stats(stats2, stats1) log.debug("Datastore stats difference during stop(): %s", stats3) log.debug("ProcManager stopped, OK.") def spawn_process(self, name=None, module=None, cls=None, config=None, process_id=None): """ Spawn a process within the container. Processes can be of different type. """ if process_id and not is_valid_identifier(process_id, ws_sub='_'): raise BadRequest("Given process_id %s is not a valid identifier" % process_id) # Generate a new process id if not provided # TODO: Ensure it is system-wide unique process_id = process_id or "%s.%s" % (self.container.id, self.proc_id_pool.get_id()) log.debug( "ProcManager.spawn_process(name=%s, module.cls=%s.%s, config=%s) as pid=%s", name, module, cls, config, process_id) process_cfg = CFG.copy() if config: # Use provided config. Must be dict or DotDict if not isinstance(config, DotDict): config = DotDict(config) dict_merge(process_cfg, config, inplace=True) if self.container.spawn_args: # Override config with spawn args dict_merge(process_cfg, self.container.spawn_args, inplace=True) #log.debug("spawn_process() pid=%s process_cfg=%s", process_id, process_cfg) # PROCESS TYPE. 
Determines basic process context (messaging, service interface) # One of: service, stream_process, agent, simple, immediate service_cls = named_any("%s.%s" % (module, cls)) process_type = get_safe(process_cfg, "process.type") or getattr( service_cls, "process_type", "service") service_instance = None try: # spawn service by type if process_type == "service": service_instance = self._spawn_service_process( process_id, name, module, cls, process_cfg) elif process_type == "stream_process": service_instance = self._spawn_stream_process( process_id, name, module, cls, process_cfg) elif process_type == "agent": service_instance = self._spawn_agent_process( process_id, name, module, cls, process_cfg) elif process_type == "standalone": service_instance = self._spawn_standalone_process( process_id, name, module, cls, process_cfg) elif process_type == "immediate": service_instance = self._spawn_immediate_process( process_id, name, module, cls, process_cfg) elif process_type == "simple": service_instance = self._spawn_simple_process( process_id, name, module, cls, process_cfg) else: raise BadRequest("Unknown process type: %s" % process_type) service_instance._proc_type = process_type self._register_process(service_instance, name) service_instance.errcause = "OK" log.info("ProcManager.spawn_process: %s.%s -> pid=%s OK", module, cls, process_id) if process_type == 'immediate': log.info('Terminating immediate process: %s', service_instance.id) self.terminate_process(service_instance.id) return service_instance.id except Exception: errcause = service_instance.errcause if service_instance else "instantiating service" log.exception("Error spawning %s %s process (process_id: %s): %s", name, process_type, process_id, errcause) raise def list_local_processes(self, process_type=''): ''' Returns a list of the running ION processes in the container or filtered by the process_type ''' ret = list() for p in self.procs.values(): if process_type and p.process_type != process_type: continue 
def list_local_process_names(self, process_type=''):
    """
    List the names of the ION processes registered in this container.

    @param process_type  optional filter; when empty (the default) no
                         filtering is applied
    @retval list with the `name` attribute of every matching process in self.procs
    """
    return [proc.name for proc in self.procs.values()
            if not process_type or proc.process_type == process_type]


def is_local_service_process(self, service_name):
    """
    Tell whether a process of type 'service' with the given name is registered
    in this container.

    @retval True or False
    """
    return service_name in self.list_local_process_names('service')


def _spawned_proc_failed(self, gproc):
    """
    Failure callback for spawned processes: logs the failure and fails the
    whole container fast.

    @param gproc  the failed spawned process; its `exception` attribute is reported
    """
    log.error("ProcManager._spawned_proc_failed: %s, %s", gproc, gproc.exception)

    # For now any failure kills the container; the gproc -> process mapping is
    # consulted only to name the failed process in the message. `svc` used to be
    # left unassigned here, so this callback died with a NameError instead of
    # ever reaching fail_fast.
    svc = self._spawned_proc_to_process.get(gproc, "Unknown")

    self.container.fail_fast("Container process (%s) failed: %s" % (svc, gproc.exception))


def _cleanup_method(self, queue_name, ep=None):
    """
    Common method to be passed to each spawned ION process to clean up their process-queue.

    @param queue_name  name of the process queue, prefixed with the sys name before deletion
    @param ep          endpoint whose channel tells whether the queue is auto-deleted.
                       Despite the None default it is dereferenced unconditionally,
                       so callers have to supply it.

    @TODO Leaks implementation detail, should be using XOs
    """
    if ep._chan._queue_auto_delete:
        # AMQP already handles the deletion for us
        return

    # @TODO this will not work with XOs (future)
    ch = self.container.node.channel(RecvChannel)
    ch._recv_name = NameTrio(get_sys_name(), "%s.%s" % (get_sys_name(), queue_name))
    ch._destroy_queue()


# TODO - check with Michael if this is acceptable or if there is a better way.
def _is_policy_management_service_available(self):
    """
    Method to verify if the Policy Management Service is running in the system.

    @retval True if the resource registry returns at least one Service resource
            named 'policy_management', False otherwise
    """
    policy_services, _ = self.container.resource_registry.find_resources(
        restype=RT.Service, name='policy_management')
    return bool(policy_services)


# -----------------------------------------------------------------
# PROCESS TYPE: service

def _spawn_service_process(self, process_id, name, module, cls, config):
    """
    Spawn a process acting as a service worker.
    Attach to service queue with service definition, attach to service pid

    @retval the initialized and started service instance
    """
    service_instance = self._create_service_instance(process_id, name, module, cls, config)

    # The listen name can be overridden via config; it defaults to the service name
    listen_name = get_safe(config, "process.listen_name") or service_instance.name
    log.debug("Service Process (%s) listen_name: %s", name, listen_name)
    service_instance._proc_listen_name = listen_name

    # Service RPC endpoint
    service_rpc = ProcessRPCServer(node=self.container.node,
                                   from_name=listen_name,
                                   service=service_instance,
                                   process=service_instance)
    # Named local RPC endpoint (listens on the process id)
    pid_rpc = ProcessRPCServer(node=self.container.node,
                               from_name=service_instance.id,
                               service=service_instance,
                               process=service_instance)

    # cleanup method to delete process queue
    cleanup = lambda _: self._cleanup_method(service_instance.id, pid_rpc)

    # Start an ION process with the right kind of endpoint factory
    proc = self.proc_sup.spawn(name=service_instance.id,
                               service=service_instance,
                               listeners=[service_rpc, pid_rpc],
                               proc_name=service_instance._proc_name,
                               cleanup_method=cleanup)
    self.proc_sup.ensure_ready(
        proc, "_spawn_service_process for %s" % ",".join((listen_name, service_instance.id)))

    # map gproc to service_instance
    self._spawned_proc_to_process[proc.proc] = service_instance

    # set service's reference to process
    service_instance._process = proc

    self._service_init(service_instance)
    self._service_start(service_instance)

    # Listeners are started only after the service has been initialized and started
    proc.start_listeners()

    # look to load any existing policies for this service
    if self._is_policy_management_service_available() and self.container.governance_controller:
        self.container.governance_controller.update_service_access_policy(
            service_instance._proc_listen_name)

    return service_instance
service_instance._proc_listen_name) return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: stream process def _spawn_stream_process(self, process_id, name, module, cls, config): """ Spawn a process acting as a data stream process. Attach to subscription queue with process function. """ service_instance = self._create_service_instance( process_id, name, module, cls, config) listen_name = get_safe(config, "process.listen_name") or name service_instance._proc_listen_name = listen_name service_instance.stream_subscriber_registrar = StreamSubscriberRegistrar( process=service_instance, container=self.container) sub = service_instance.stream_subscriber_registrar.create_subscriber( exchange_name=listen_name) # Add publishers if any... publish_streams = get_safe(config, "process.publish_streams") self._set_publisher_endpoints(service_instance, publish_streams) rsvc = ProcessRPCServer(node=self.container.node, from_name=service_instance.id, service=service_instance, process=service_instance) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) cleanup = lambda _: self._cleanup_method(service_instance.id, rsvc) proc = self.proc_sup.spawn(name=service_instance.id, service=service_instance, listeners=[rsvc, sub], proc_name=service_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready( proc, "_spawn_stream_process for %s" % service_instance._proc_name) # map gproc to service_instance self._spawned_proc_to_process[proc.proc] = service_instance # set service's reference to process service_instance._process = proc self._service_init(service_instance) self._service_start(service_instance) proc.start_listeners() return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: agent def _spawn_agent_process(self, process_id, name, module, cls, config): """ Spawn a process acting as agent process. Attach to service pid. 
""" service_instance = self._create_service_instance( process_id, name, module, cls, config) if not isinstance(service_instance, ResourceAgent): raise ContainerConfigError( "Agent process must extend ResourceAgent") # Set the resource ID if we get it through the config resource_id = get_safe(service_instance.CFG, "agent.resource_id") if resource_id: service_instance.resource_id = resource_id rsvc = ProcessRPCServer(node=self.container.node, from_name=service_instance.id, service=service_instance, process=service_instance) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) cleanup = lambda _: self._cleanup_method(service_instance.id, rsvc) proc = self.proc_sup.spawn(name=service_instance.id, service=service_instance, listeners=[rsvc], proc_name=service_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready( proc, "_spawn_agent_process for %s" % service_instance.id) # map gproc to service_instance self._spawned_proc_to_process[proc.proc] = service_instance # set service's reference to process service_instance._process = proc # Now call the on_init of the agent. self._service_init(service_instance) if not service_instance.resource_id: log.warn("New agent pid=%s has no resource_id set" % process_id) self._service_start(service_instance) proc.start_listeners() if service_instance.resource_id: # look to load any existing policies for this resource if self._is_policy_management_service_available( ) and self.container.governance_controller: self.container.governance_controller.update_resource_access_policy( service_instance.resource_id) else: log.warn("Agent process id=%s does not define resource_id!!" % service_instance.id) return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: standalone def _spawn_standalone_process(self, process_id, name, module, cls, config): """ Spawn a process acting as standalone process. Attach to service pid. 
""" service_instance = self._create_service_instance( process_id, name, module, cls, config) rsvc = ProcessRPCServer(node=self.container.node, from_name=service_instance.id, service=service_instance, process=service_instance) # cleanup method to delete process queue (@TODO: leaks a bit here - should use XOs) cleanup = lambda _: self._cleanup_method(service_instance.id, rsvc) proc = self.proc_sup.spawn(name=service_instance.id, service=service_instance, listeners=[rsvc], proc_name=service_instance._proc_name, cleanup_method=cleanup) self.proc_sup.ensure_ready( proc, "_spawn_standalone_process for %s" % service_instance.id) # map gproc to service_instance self._spawned_proc_to_process[proc.proc] = service_instance # set service's reference to process service_instance._process = proc self._service_init(service_instance) self._service_start(service_instance) # Add publishers if any... publish_streams = get_safe(config, "process.publish_streams") self._set_publisher_endpoints(service_instance, publish_streams) proc.start_listeners() return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: simple def _spawn_simple_process(self, process_id, name, module, cls, config): """ Spawn a process acting as simple process. No attachments. """ service_instance = self._create_service_instance( process_id, name, module, cls, config) self._service_init(service_instance) self._service_start(service_instance) # Add publishers if any... publish_streams = get_safe(config, "process.publish_streams") self._set_publisher_endpoints(service_instance, publish_streams) return service_instance # ----------------------------------------------------------------- # PROCESS TYPE: immediate def _spawn_immediate_process(self, process_id, name, module, cls, config): """ Spawn a process acting as immediate one off process. No attachments. 
""" service_instance = self._create_service_instance( process_id, name, module, cls, config) self._service_init(service_instance) self._service_start(service_instance) return service_instance def _create_service_instance(self, process_id, name, module, cls, config): """ Creates an instance of a "service", be it a Service, Agent, Stream, etc. @rtype BaseService @return An instance of a "service" """ # SERVICE INSTANCE. service_instance = for_name(module, cls) if not isinstance(service_instance, BaseService): raise ContainerConfigError( "Instantiated service not a BaseService %r" % service_instance) # Prepare service instance service_instance.errcause = "" service_instance.id = process_id service_instance.container = self.container service_instance.CFG = config service_instance._proc_name = name service_instance._proc_start_time = time.time() # start service dependencies (RPC clients) self._start_service_dependencies(service_instance) return service_instance def _start_service_dependencies(self, service_instance): service_instance.errcause = "setting service dependencies" log.debug("spawn_process dependencies: %s", service_instance.dependencies) # TODO: Service dependency != process dependency for dependency in service_instance.dependencies: client = getattr(service_instance.clients, dependency) assert client, "Client for dependency not found: %s" % dependency # @TODO: should be in a start_client in RPCClient chain client.process = service_instance client.node = self.container.node # ensure that dep actually exists and is running # MM: commented out - during startup (init actually), we don't need to check for service dependencies # MM: TODO: split on_init from on_start; start consumer in on_start; check for full queues on restart # if service_instance.name != 'bootstrap' or (service_instance.name == 'bootstrap' and service_instance.CFG.level == dependency): # svc_de = self.container.resource_registry.find_resources(restype="Service", name=dependency, id_only=True) # 
if not svc_de: # raise ContainerConfigError("Dependency for service %s not running: %s" % (service_instance.name, dependency)) def _service_init(self, service_instance): # Init process service_instance.errcause = "initializing service" service_instance.init() def _service_start(self, service_instance): # Start process # TODO: Check for timeout service_instance.errcause = "starting service" service_instance.start() def _set_publisher_endpoints(self, service_instance, publisher_streams=None): service_instance.stream_publisher_registrar = StreamPublisherRegistrar( process=service_instance, container=self.container) publisher_streams = publisher_streams or {} for name, stream_id in publisher_streams.iteritems(): # problem is here pub = service_instance.stream_publisher_registrar.create_publisher( stream_id) setattr(service_instance, name, pub) def _register_process(self, service_instance, name): """ Performs all actions related to registering the new process in the system. Also performs process type specific registration, such as for services and agents """ # Add process instance to container's process dict if name in self.procs_by_name: log.warn("Process name already registered in container: %s" % name) self.procs_by_name[name] = service_instance self.procs[service_instance.id] = service_instance # Add Process to resource registry # Note: In general the Process resource should be created by the CEI PD, but not all processes are CEI # processes. How to deal with this? 
def _register_process(self, service_instance, name):
    """
    Performs all actions related to registering the new process in the system.
    Also performs process type specific registration, such as for services and agents

    Steps, in order: add the instance to the local registries (procs, procs_by_name),
    create a Process resource associated with the container (skipped for "immediate"
    processes), run the type specific registration for "service" and "agent"
    processes, and finally publish a ProcessLifecycleEvent with sub_type SPAWN.

    @param service_instance  the process instance; its _proc_type must already be set
    @param name              the name the process was spawned with
    """
    # Add process instance to container's process dict
    # A duplicate name only triggers a warning; the newer instance replaces the
    # older one in procs_by_name, while procs (keyed by process id) keeps both.
    if name in self.procs_by_name:
        log.warn("Process name already registered in container: %s" % name)
    self.procs_by_name[name] = service_instance
    self.procs[service_instance.id] = service_instance

    # Add Process to resource registry
    # Note: In general the Process resource should be created by the CEI PD, but not all processes are CEI
    # processes. How to deal with this?
    service_instance.errcause = "registering"

    if service_instance._proc_type != "immediate":
        proc_obj = Process(name=service_instance.id, label=name, proctype=service_instance._proc_type)
        proc_id, _ = self.container.resource_registry.create(proc_obj)
        service_instance._proc_res_id = proc_id

        # Associate process with container resource
        self.container.resource_registry.create_association(self.cc_id, "hasProcess", proc_id)
    else:
        # Immediate processes get no Process resource
        service_instance._proc_res_id = None

    # Process type specific registration
    # TODO: Factor out into type specific handler functions
    if service_instance._proc_type == "service":
        # Registration of SERVICE process: in resource registry
        service_list, _ = self.container.resource_registry.find_resources(restype="Service", name=service_instance.name)
        if service_list:
            # Reuse the first existing Service resource with this name
            service_instance._proc_svc_id = service_list[0]._id
        else:
            # We are starting the first process of a service instance
            # TODO: This should be created by the HA Service agent in the future
            svc_obj = Service(name=service_instance.name, exchange_name=service_instance._proc_listen_name)
            service_instance._proc_svc_id, _ = self.container.resource_registry.create(svc_obj)

            # Create association to service definition resource
            svcdef_list, _ = self.container.resource_registry.find_resources(restype="ServiceDefinition",
                                                                             name=service_instance.name)
            if svcdef_list:
                self.container.resource_registry.create_association(service_instance._proc_svc_id,
                                                                    "hasServiceDefinition", svcdef_list[0]._id)
            else:
                log.error("Cannot find ServiceDefinition resource for %s", service_instance.name)

        # NOTE(review): nesting reconstructed from whitespace-collapsed source; this
        # association is assumed to apply to reused and newly created Service
        # resources alike - confirm against the original layout.
        self.container.resource_registry.create_association(service_instance._proc_svc_id, "hasProcess", proc_id)

    elif service_instance._proc_type == "agent":
        # Registration of AGENT process: in Directory
        caps = service_instance.get_capabilities()
        self.container.directory.register("/Agents", service_instance.id,
                                          **dict(name=service_instance._proc_name,
                                                 container=service_instance.container.id,
                                                 resource_id=service_instance.resource_id,
                                                 agent_id=service_instance.agent_id,
                                                 def_id=service_instance.agent_def_id,
                                                 capabilities=caps))

    # Trigger a real-time event. At this time, everything persistent has to be completed and consistent.
    self.container.event_pub.publish_event(event_type="ProcessLifecycleEvent",
                                           origin=service_instance.id,
                                           origin_type="ContainerProcess",
                                           sub_type="SPAWN",
                                           container_id=self.container.id,
                                           process_type=service_instance._proc_type,
                                           process_name=service_instance._proc_name,
                                           state=ProcessStateEnum.SPAWN)
def terminate_process(self, process_id):
    """
    Terminates a process and all its resources. Termination is graceful with timeout.

    @param process_id  id of a process registered in this container
    @throws BadRequest if no process with this id is registered locally
    """
    log.debug("terminate_process: %s", process_id)

    proc_inst = self.procs.get(process_id)
    if not proc_inst:
        raise BadRequest("Cannot terminate. Process id='%s' unknown on container id='%s'" % (
            process_id, self.container.id))

    # Give the process notice to quit doing stuff.
    proc_inst.quit()

    # Terminate IonProcessThread (may not have one, i.e. simple process)
    worker = proc_inst._process
    if worker:
        worker.notify_stop()
        worker.stop()

    self._unregister_process(process_id, proc_inst)

    # Send out real-time notice that process was terminated. At this point, everything persistent
    # has to be consistent.
    event_args = dict(event_type="ProcessLifecycleEvent",
                      origin=proc_inst.id,
                      origin_type="ContainerProcess",
                      sub_type="TERMINATE",
                      container_id=self.container.id,
                      process_type=proc_inst._proc_type,
                      process_name=proc_inst._proc_name,
                      state=ProcessStateEnum.TERMINATE)
    self.container.event_pub.publish_event(**event_args)
self.container.event_pub.publish_event( event_type="ProcessLifecycleEvent", origin=service_instance.id, origin_type="ContainerProcess", sub_type="TERMINATE", container_id=self.container.id, process_type=service_instance._proc_type, process_name=service_instance._proc_name, state=ProcessStateEnum.TERMINATE) def _unregister_process(self, process_id, service_instance): # Remove process registration in resource registry if service_instance._proc_res_id: self.container.resource_registry.delete( service_instance._proc_res_id, del_associations=True) # Cleanup for specific process types if service_instance._proc_type == "service": # Check if this is the last process for this service and do auto delete service resources here svcproc_list, _ = self.container.resource_registry.find_objects( service_instance._proc_svc_id, "hasProcess", "Process", id_only=True) if not svcproc_list: self.container.resource_registry.delete( service_instance._proc_svc_id, del_associations=True) elif service_instance._proc_type == "agent": self.container.directory.unregister_safe("/Agents", service_instance.id) # Remove internal registration in container del self.procs[process_id] if service_instance._proc_name in self.procs_by_name: del self.procs_by_name[service_instance._proc_name] else: log.warn("Process name %s not in local registry", service_instance.name)