def _remove_daemon(self, name: str, host: str) -> str:
    """Tear down the named daemon on ``host`` and update the daemon cache.

    ``name`` is a ``<type>.<id>`` daemon name. Returns a human-readable
    summary string describing the removal.
    """
    d_type, d_id = name.split('.', 1)
    dd = orchestrator.DaemonDescription(
        daemon_type=d_type,
        daemon_id=d_id,
        hostname=host,
    )
    with set_exception_subject('service', dd.service_id(), overwrite=True):
        svc = self.mgr.cephadm_services[daemon_type_to_service(d_type)]
        # give the owning service a chance to prepare before removal
        svc.pre_remove(dd)
        self.log.info('Removing daemon %s from %s' % (name, host))
        out, err, code = self._run_cephadm(
            host, name, 'rm-daemon', ['--name', name, '--force'])
        if not code:
            # only drop the cache entry once cephadm reports success
            self.mgr.cache.rm_daemon(host, name)
        self.mgr.cache.invalidate_host_daemons(host)
        svc.post_remove(dd)
        return "Removed {} from host '{}'".format(name, host)
def post_remove(self, daemon: DaemonDescription) -> None:
    """Hook invoked after a daemon of this service type has been removed."""
    d_type = daemon.daemon_type
    assert d_type is not None
    # sanity check: this service instance must own daemons of this type
    assert daemon_type_to_service(d_type) == self.TYPE
    logger.debug(f'Post remove daemon {self.TYPE}.{daemon.daemon_id}')
def _check_agent(self, host: str) -> bool:
    """Check the cephadm agent on ``host`` and push config/actions to it.

    Returns True if the agent is considered down (per
    ``agent_helpers._agent_down``), False otherwise. As a side effect this
    may reconfigure the agent (over HTTP when possible, otherwise via ssh)
    and apply any scheduled daemon action for it.
    """
    down = False
    try:
        # the cephadm HTTP endpoint and its root cert must exist before we
        # can meaningfully talk to agents; bail out quietly until then
        assert self.mgr.cherrypy_thread
        assert self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
    except Exception:
        self.mgr.log.debug(
            f'Delaying checking agent on {host} until cephadm endpoint finished creating root cert')
        return down
    if self.mgr.agent_helpers._agent_down(host):
        down = True
    try:
        # there is expected to be (at most) one agent daemon per host;
        # [0] raises IndexError if none is cached, handled below
        agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0]
        assert agent.daemon_id is not None
        assert agent.hostname is not None
    except Exception as e:
        self.mgr.log.debug(
            f'Could not retrieve agent on host {host} from daemon cache: {e}')
        return down
    try:
        # compare current deps against the last config/deps pushed to this
        # agent to decide whether a reconfig is needed
        spec = self.mgr.spec_store.active_specs.get('agent', None)
        deps = self.mgr._calc_daemon_deps(spec, 'agent', agent.daemon_id)
        last_deps, last_config = self.mgr.agent_cache.get_agent_last_config_deps(host)
        if not last_config or last_deps != deps:
            # if root cert is the dep that changed, we must use ssh to reconfig
            # so it's necessary to check this one specifically
            root_cert_match = False
            try:
                root_cert = self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
                if last_deps and root_cert in last_deps:
                    root_cert_match = True
            except Exception:
                # best-effort: treat any failure as "root cert changed"
                pass
            daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
            # we need to know the agent port to try to reconfig w/ http
            # otherwise there is no choice but a full ssh reconfig
            if host in self.mgr.agent_cache.agent_ports and root_cert_match and not down:
                # fast path: ask the (reachable) agent to pull new config
                daemon_spec = self.mgr.cephadm_services[daemon_type_to_service(
                    daemon_spec.daemon_type)].prepare_create(daemon_spec)
                self.mgr.agent_helpers._request_agent_acks(
                    hosts={daemon_spec.host},
                    increment=True,
                    daemon_spec=daemon_spec,
                )
            else:
                # slow path: full reconfig over ssh
                self.mgr._daemon_action(daemon_spec, action='reconfig')
            return down
    except Exception as e:
        self.mgr.log.debug(
            f'Agent on host {host} not ready to have config and deps checked: {e}')
    # no reconfig was needed (or it failed); apply any scheduled action
    action = self.mgr.cache.get_scheduled_daemon_action(agent.hostname, agent.name())
    if action:
        try:
            daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
            self.mgr._daemon_action(daemon_spec, action=action)
            # only clear the scheduled action once it has been applied
            self.mgr.cache.rm_scheduled_daemon_action(agent.hostname, agent.name())
        except Exception as e:
            self.mgr.log.debug(
                f'Agent on host {host} not ready to {action}: {e}')
    return down
def _create_daemon(
        self,
        daemon_spec: CephadmDaemonSpec,
        reconfig: bool = False,
        osd_uuid_map: Optional[Dict[str, Any]] = None,
) -> str:
    """Deploy (or, with ``reconfig=True``, reconfigure) a daemon via cephadm.

    Builds the cephadm ``deploy`` invocation from ``daemon_spec``, runs it on
    the target host, primes/refreshes the daemon cache, and records an
    orchestrator event. Returns a human-readable status message (also
    returned early with a failure message for container/cephadm-exporter
    pre-flight failures). Raises OrchestratorError for an unknown OSD id.
    """
    with set_exception_subject('service', orchestrator.DaemonDescription(
            daemon_type=daemon_spec.daemon_type,
            daemon_id=daemon_spec.daemon_id,
            hostname=daemon_spec.host,
    ).service_id(), overwrite=True):
        image = ''
        start_time = datetime_now()
        ports: List[int] = daemon_spec.ports if daemon_spec.ports else []

        if daemon_spec.daemon_type == 'container':
            spec: Optional[CustomContainerSpec] = daemon_spec.spec
            if spec is None:
                # Exit here immediately because the required service
                # spec to create a daemon is not provided. This is only
                # provided when a service is applied via 'orch apply'
                # command.
                msg = "Failed to {} daemon {} on {}: Required " \
                      "service specification not provided".format(
                          'reconfigure' if reconfig else 'deploy',
                          daemon_spec.name(), daemon_spec.host)
                self.log.info(msg)
                return msg
            image = spec.image
            if spec.ports:
                ports.extend(spec.ports)

        if daemon_spec.daemon_type == 'cephadm-exporter':
            if not reconfig:
                # the exporter needs the cephadm binary present on the host
                assert daemon_spec.host
                deploy_ok = self._deploy_cephadm_binary(daemon_spec.host)
                if not deploy_ok:
                    msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}"
                    self.log.warning(msg)
                    return msg

        # haproxy/keepalived may override the container image from their spec
        if daemon_spec.daemon_type == 'haproxy':
            haspec = cast(HA_RGWSpec, daemon_spec.spec)
            if haspec.haproxy_container_image:
                image = haspec.haproxy_container_image

        if daemon_spec.daemon_type == 'keepalived':
            haspec = cast(HA_RGWSpec, daemon_spec.spec)
            if haspec.keepalived_container_image:
                image = haspec.keepalived_container_image

        # per-service config payload (fed to cephadm via stdin) plus the
        # dependency list used later for change detection
        cephadm_config, deps = self.mgr.cephadm_services[daemon_type_to_service(
            daemon_spec.daemon_type)].generate_config(daemon_spec)

        # TCP port to open in the host firewall
        if len(ports) > 0:
            daemon_spec.extra_args.extend([
                '--tcp-ports', ' '.join(map(str, ports))])

        # osd deployments needs an --osd-uuid arg
        if daemon_spec.daemon_type == 'osd':
            if not osd_uuid_map:
                osd_uuid_map = self.mgr.get_osd_uuid_map()
            osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
            if not osd_uuid:
                raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id)
            daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])

        if reconfig:
            daemon_spec.extra_args.append('--reconfig')
        if self.mgr.allow_ptrace:
            daemon_spec.extra_args.append('--allow-ptrace')

        # log into the configured registry first if this host needs it
        if self.mgr.cache.host_needs_registry_login(
                daemon_spec.host) and self.mgr.registry_url:
            self._registry_login(daemon_spec.host, self.mgr.registry_url,
                                 self.mgr.registry_username, self.mgr.registry_password)

        # config JSON is passed on stdin ('-')
        daemon_spec.extra_args.extend(['--config-json', '-'])

        self.log.info('%s daemon %s on %s' % (
            'Reconfiguring' if reconfig else 'Deploying',
            daemon_spec.name(), daemon_spec.host))

        out, err, code = self._run_cephadm(
            daemon_spec.host, daemon_spec.name(), 'deploy',
            [
                '--name', daemon_spec.name(),
            ] + daemon_spec.extra_args,
            stdin=json.dumps(cephadm_config),
            image=image)
        if not code and daemon_spec.host in self.mgr.cache.daemons:
            # prime cached service state with what we (should have)
            # just created
            sd = orchestrator.DaemonDescription()
            sd.daemon_type = daemon_spec.daemon_type
            sd.daemon_id = daemon_spec.daemon_id
            sd.hostname = daemon_spec.host
            sd.status = 1
            sd.status_desc = 'starting'
            self.mgr.cache.add_daemon(daemon_spec.host, sd)
            # these daemon types need post-create actions (see _check_daemons)
            if daemon_spec.daemon_type in [
                    'grafana', 'iscsi', 'prometheus', 'alertmanager'
            ]:
                self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
        self.mgr.cache.invalidate_host_daemons(daemon_spec.host)
        # record the deps/config used so _check_daemons can detect drift
        self.mgr.cache.update_daemon_config_deps(
            daemon_spec.host, daemon_spec.name(), deps, start_time)
        self.mgr.cache.save_host(daemon_spec.host)
        msg = "{} {} on host '{}'".format(
            'Reconfigured' if reconfig else 'Deployed',
            daemon_spec.name(), daemon_spec.host)
        if not code:
            self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg)
        else:
            what = 'reconfigure' if reconfig else 'deploy'
            self.mgr.events.for_daemon(
                daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
        # NOTE(review): msg reports success wording even when code != 0;
        # the failure is surfaced only via the ERROR event above — confirm
        # this is intended.
        return msg
def _check_daemons(self) -> None:
    """Reconcile all cached daemons against their specs.

    Removes orphans, marks active daemons, and schedules a
    reconfig/redeploy for any daemon whose dependencies, monmap, or extra
    ceph.conf are newer than its last-applied config. Finally runs
    post-create checks for daemon types that require them.
    """
    daemons = self.mgr.cache.get_daemons()
    # daemons needing a post-action check, grouped by daemon type
    daemons_post: Dict[str, List[orchestrator.DaemonDescription]] = defaultdict(list)
    for dd in daemons:
        # orphan?
        spec = self.mgr.spec_store.specs.get(dd.service_name(), None)
        if not spec and dd.daemon_type not in ['mon', 'mgr', 'osd']:
            # (mon and mgr specs should always exist; osds aren't matched
            # to a service spec)
            self.log.info('Removing orphan daemon %s...' % dd.name())
            self._remove_daemon(dd.name(), dd.hostname)
            # NOTE(review): no `continue` here — the loop falls through and
            # still computes deps/actions for the daemon just removed;
            # confirm this is intentional.

        # ignore unmanaged services
        if spec and spec.unmanaged:
            continue

        # These daemon types require additional configs after creation
        if dd.daemon_type in [
                'grafana', 'iscsi', 'prometheus', 'alertmanager', 'nfs'
        ]:
            daemons_post[dd.daemon_type].append(dd)

        # flag whichever daemon the service considers "active"
        if self.mgr.cephadm_services[daemon_type_to_service(
                dd.daemon_type)].get_active_daemon(
                    self.mgr.cache.get_daemons_by_service(
                        dd.service_name())).daemon_id == dd.daemon_id:
            dd.is_active = True
        else:
            dd.is_active = False

        # NOTE(review): _calc_daemon_deps is called elsewhere in this file
        # with (spec, type, id) — verify this two-argument call matches the
        # actual signature.
        deps = self.mgr._calc_daemon_deps(dd.daemon_type, dd.daemon_id)
        last_deps, last_config = self.mgr.cache.get_daemon_last_config_deps(
            dd.hostname, dd.name())
        if last_deps is None:
            last_deps = []
        action = self.mgr.cache.get_scheduled_daemon_action(
            dd.hostname, dd.name())
        # decide whether a reconfig is needed; first matching reason wins
        if not last_config:
            self.log.info(
                'Reconfiguring %s (unknown last config time)...' % (dd.name()))
            action = 'reconfig'
        elif last_deps != deps:
            self.log.debug('%s deps %s -> %s' % (dd.name(), last_deps, deps))
            self.log.info('Reconfiguring %s (dependencies changed)...'
                          % (dd.name()))
            action = 'reconfig'
        elif self.mgr.last_monmap and \
                self.mgr.last_monmap > last_config and \
                dd.daemon_type in CEPH_TYPES:
            self.log.info('Reconfiguring %s (monmap changed)...' % dd.name())
            action = 'reconfig'
        elif self.mgr.extra_ceph_conf_is_newer(last_config) and \
                dd.daemon_type in CEPH_TYPES:
            self.log.info('Reconfiguring %s (extra config changed)...'
                          % dd.name())
            action = 'reconfig'
        if action:
            # a scheduled redeploy outranks a derived reconfig
            if self.mgr.cache.get_scheduled_daemon_action(dd.hostname, dd.name()) == 'redeploy' \
                    and action == 'reconfig':
                action = 'redeploy'
            try:
                self.mgr._daemon_action(daemon_type=dd.daemon_type,
                                        daemon_id=dd.daemon_id,
                                        host=dd.hostname,
                                        action=action)
                self.mgr.cache.rm_scheduled_daemon_action(
                    dd.hostname, dd.name())
            except OrchestratorError as e:
                self.mgr.events.from_orch_error(e)
                # skip post-actions for this type if the action failed
                if dd.daemon_type in daemons_post:
                    del daemons_post[dd.daemon_type]
                # continue...
            except Exception as e:
                self.mgr.events.for_daemon_from_exception(dd.name(), e)
                if dd.daemon_type in daemons_post:
                    del daemons_post[dd.daemon_type]
                # continue...

    # do daemon post actions
    for daemon_type, daemon_descs in daemons_post.items():
        if daemon_type in self.mgr.requires_post_actions:
            # one-shot: clear the flag before running the check
            self.mgr.requires_post_actions.remove(daemon_type)
            self.mgr._get_cephadm_service(
                daemon_type_to_service(daemon_type)).daemon_check_post(
                    daemon_descs)