def _remove_daemon(self, name: str, host: str) -> str:
    """Remove the daemon ``name`` (format ``<type>.<id>``) from ``host``.

    Runs the owning service's ``pre_remove`` hook, invokes
    ``cephadm rm-daemon --force`` on the host, drops the daemon from the
    mgr cache on success, and finally runs the ``post_remove`` hook.

    :param name: daemon name, assumed to contain a '.' separator
    :param host: hostname the daemon runs on
    :return: human-readable status message
    """
    (daemon_type, daemon_id) = name.split('.', 1)

    daemon = orchestrator.DaemonDescription(
        daemon_type=daemon_type,
        daemon_id=daemon_id,
        hostname=host)

    with set_exception_subject('service', daemon.service_id(), overwrite=True):

        # give the service a chance to veto / prepare (e.g. drop keyrings)
        self.mgr.cephadm_services[daemon_type_to_service(daemon_type)].pre_remove(daemon)

        args = ['--name', name, '--force']

        self.log.info('Removing daemon %s from %s' % (name, host))
        out, err, code = self._run_cephadm(
            host, name, 'rm-daemon', args)
        if not code:
            # remove item from cache
            self.mgr.cache.rm_daemon(host, name)
        # force a fresh `cephadm ls` on the next refresh either way
        self.mgr.cache.invalidate_host_daemons(host)

        self.mgr.cephadm_services[daemon_type_to_service(daemon_type)].post_remove(daemon)

        return "Removed {} from host '{}'".format(name, host)
def _refresh_host_daemons(self, host: str) -> Optional[str]:
    """Scrape the daemon inventory of ``host`` and refresh the mgr cache.

    Runs ``cephadm ls`` on the host, converts every entry that belongs to
    this cluster into a ``DaemonDescription``, and replaces the cached
    daemon map for the host.

    :param host: hostname to scrape
    :return: ``None`` on success, otherwise a human-readable error message
    """
    try:
        out, err, code = self._run_cephadm(
            host, 'mon', 'ls', [], no_fsid=True)
        if code:
            return 'host %s cephadm ls returned %d: %s' % (
                host, code, err)
        ls = json.loads(''.join(out))
    except ValueError:
        # `cephadm ls` produced something that is not JSON; keep the raw
        # output in the log for debugging
        msg = 'host %s scrape failed: Cannot decode JSON' % host
        self.log.exception('%s: \'%s\'' % (msg, ''.join(out)))
        return msg
    except Exception as e:
        return 'host %s scrape failed: %s' % (host, e)
    dm = {}
    for d in ls:
        # only consider cephadm-managed daemons of this cluster whose
        # name has the '<type>.<id>' shape
        if not d['style'].startswith('cephadm'):
            continue
        if d['fsid'] != self.mgr._cluster_fsid:
            continue
        if '.' not in d['name']:
            continue
        sd = orchestrator.DaemonDescription()
        sd.last_refresh = datetime_now()
        for k in ['created', 'started', 'last_configured', 'last_deployed']:
            v = d.get(k, None)
            if v:
                setattr(sd, k, str_to_datetime(d[k]))
        sd.daemon_type = d['name'].split('.')[0]
        sd.daemon_id = '.'.join(d['name'].split('.')[1:])
        sd.hostname = host
        sd.container_id = d.get('container_id')
        if sd.container_id:
            # shorten the hash
            sd.container_id = sd.container_id[0:12]
        sd.container_image_name = d.get('container_image_name')
        sd.container_image_id = d.get('container_image_id')
        sd.version = d.get('version')
        if sd.daemon_type == 'osd':
            sd.osdspec_affinity = self.mgr.osd_service.get_osdspec_affinity(sd.daemon_id)
        if 'state' in d:
            sd.status_desc = d['state']
            # numeric status: 1 running, 0 stopped, -1 error/unknown
            sd.status = {
                'running': 1,
                'stopped': 0,
                'error': -1,
                'unknown': -1,
            }[d['state']]
        else:
            sd.status_desc = 'unknown'
            sd.status = None
        dm[sd.name()] = sd
    self.log.debug('Refreshed host %s daemons (%d)' % (host, len(dm)))
    self.mgr.cache.update_host_daemons(host, dm)
    self.mgr.cache.save_host(host)
    return None
def list_daemons(self, daemon_type=None, daemon_id=None, host=None, refresh=False):
    """
    There is no guarantee which daemons are returned by describe_service, except that
    it returns the mgr we're running in.
    """
    if daemon_type:
        assert daemon_type in ("mds", "osd", "mon", "rgw", "mgr", "iscsi"), \
            daemon_type + " unsupported"

    if self._daemons:
        # canned daemon list supplied by the test; optionally narrow by host
        if host:
            return [svc for svc in self._daemons if svc.hostname == host]
        return self._daemons

    # no canned data: scan the local process table instead
    wanted = (daemon_type, ) if daemon_type else ("mds", "osd", "mon", "rgw", "mgr")
    assert isinstance(wanted, tuple)
    ps_lines = [str(line) for line in check_output(['ps', 'aux']).splitlines()]

    daemons = []
    for line in ps_lines:
        if not any('ceph-' + t in line for t in wanted):
            continue
        m = re.search('ceph-[^ ]+', line)
        assert m
        sd = orchestrator.DaemonDescription()
        sd.hostname = 'localhost'
        sd.daemon_id = m.group()
        daemons.append(sd)
    return daemons
def _get_ceph_daemons(self):
    # type: () -> List[orchestrator.DaemonDescription]
    """ Return ceph daemons on the running host."""
    wanted_types = ("mds", "osd", "mon", "rgw", "mgr", "nfs", "iscsi")
    ps_lines = [str(line) for line in check_output(['ps', 'aux']).splitlines()]

    daemons = []
    for line in ps_lines:
        # keep only processes that look like 'ceph-<type> ...'
        if not any('ceph-{} '.format(t) in line for t in wanted_types):
            continue

        # daemon type comes from the executable name, e.g. 'ceph-mon'
        type_match = re.search('ceph-([^ ]+)', line)
        if not type_match:
            raise AssertionError('Fail to determine daemon type from {}'.format(line))

        # daemon ID may be passed as `-i <id>`, `--id=<id>` or `--id <id>`
        for pattern in (r'-i\s(\w+)', r'--id[\s=](\w+)'):
            id_match = re.search(pattern, line)
            if id_match:
                break
        else:
            raise AssertionError('Fail to determine daemon ID from {}'.format(line))

        daemons.append(orchestrator.DaemonDescription(
            daemon_type=type_match.group(1),
            daemon_id=id_match.group(1),
            hostname='localhost'))
    return daemons
def _refresh_host_daemons(self, host: str) -> Optional[str]:
    """Scrape the daemon inventory of ``host`` and refresh the mgr cache.

    Executes ``cephadm ls`` on the host via ``_run_cephadm_json``,
    converts every entry that belongs to this cluster into a
    ``DaemonDescription``, and replaces the cached daemon map for the host.

    :param host: hostname to scrape
    :return: ``None`` on success, otherwise a human-readable error message
    """
    try:
        ls = self._run_cephadm_json(host, 'mon', 'ls', [], no_fsid=True)
    except OrchestratorError as e:
        return str(e)
    dm = {}
    for d in ls:
        # only consider cephadm-managed daemons of this cluster whose
        # name has the '<type>.<id>' shape
        if not d['style'].startswith('cephadm'):
            continue
        if d['fsid'] != self.mgr._cluster_fsid:
            continue
        if '.' not in d['name']:
            continue
        sd = orchestrator.DaemonDescription()
        sd.last_refresh = datetime_now()
        for k in ['created', 'started', 'last_configured', 'last_deployed']:
            v = d.get(k, None)
            if v:
                setattr(sd, k, str_to_datetime(d[k]))
        sd.daemon_type = d['name'].split('.')[0]
        sd.daemon_id = '.'.join(d['name'].split('.')[1:])
        sd.hostname = host
        sd.container_id = d.get('container_id')
        if sd.container_id:
            # shorten the hash
            sd.container_id = sd.container_id[0:12]
        sd.container_image_name = d.get('container_image_name')
        sd.container_image_id = d.get('container_image_id')
        sd.container_image_digests = d.get('container_image_digests')
        sd.memory_usage = d.get('memory_usage')
        sd.memory_request = d.get('memory_request')
        sd.memory_limit = d.get('memory_limit')
        sd._service_name = d.get('service_name')
        sd.version = d.get('version')
        sd.ports = d.get('ports')
        sd.ip = d.get('ip')
        if sd.daemon_type == 'osd':
            sd.osdspec_affinity = self.mgr.osd_service.get_osdspec_affinity(sd.daemon_id)
        if 'state' in d:
            sd.status_desc = d['state']
            sd.status = {
                'running': DaemonDescriptionStatus.running,
                'stopped': DaemonDescriptionStatus.stopped,
                'error': DaemonDescriptionStatus.error,
                'unknown': DaemonDescriptionStatus.error,
            }[d['state']]
        else:
            sd.status_desc = 'unknown'
            sd.status = None
        dm[sd.name()] = sd
    self.log.debug('Refreshed host %s daemons (%d)' % (host, len(dm)))
    self.mgr.cache.update_host_daemons(host, dm)
    self.mgr.cache.save_host(host)
    return None
def generate_config(self, daemon_spec: CephadmDaemonSpec) -> Tuple[Dict[str, Any], List[str]]:
    """Build the cephadm config payload for an NFS (ganesha) daemon.

    Looks up the (single) matching NFSServiceSpec, creates/updates the
    daemon keyring and the rados config object, and merges the ganesha
    config with the generic ceph config+keyring blob.

    :return: (cephadm config dict, list of config dependencies)
    :raises OrchestratorError: if zero or multiple service specs match
    """
    assert self.TYPE == daemon_spec.daemon_type
    daemon_type = daemon_spec.daemon_type
    daemon_id = daemon_spec.daemon_id
    host = daemon_spec.host

    deps = []  # type: List[str]

    # find the matching NFSServiceSpec
    # TODO: find the spec and pass via _create_daemon instead ??
    dd = orchestrator.DaemonDescription()
    dd.daemon_type = daemon_type
    dd.daemon_id = daemon_id
    dd.hostname = host

    service_name = dd.service_name()
    specs = self.mgr.spec_store.find(service_name)

    if not specs:
        raise OrchestratorError('Cannot find service spec %s' % (service_name))
    elif len(specs) > 1:
        raise OrchestratorError('Found multiple service specs for %s' % (service_name))
    else:
        # cast to keep mypy happy
        spec = cast(NFSServiceSpec, specs[0])

    nfs = NFSGanesha(self.mgr, daemon_id, spec)

    # create the keyring
    entity = nfs.get_keyring_entity()
    keyring = nfs.get_or_create_keyring(entity=entity)

    # update the caps after get-or-create, the keyring might already exist!
    nfs.update_keyring_caps(entity=entity)

    # create the rados config object
    nfs.create_rados_config_obj()

    # generate the cephadm config
    cephadm_config = nfs.get_cephadm_config()
    cephadm_config.update(
        self.mgr._get_config_and_keyring(
            daemon_type, daemon_id,
            keyring=keyring,
            host=host
        ))

    return cephadm_config, deps
def list_daemons(self, daemon_type=None, daemon_id=None, host_name=None, refresh=False):
    """Map the rook-managed pod inventory onto DaemonDescription records."""
    phase_to_status = {
        'Pending': -1,
        'Running': 1,
        'Succeeded': 0,
        'Failed': -1,
        'Unknown': -1,
    }

    def _daemon_id_for(kind, labels):
        # each daemon type records its ID under a different pod label
        if kind == "osd":
            return "%s" % labels["ceph-osd-id"]
        if kind == "mds":
            # strip the '<filesystem>-' prefix rook puts in front of the ID
            pfx = "{0}-".format(labels['rook_file_system'])
            return labels['ceph_daemon_id'].replace(pfx, '', 1)
        if kind == "mon":
            return labels["mon"]
        if kind == "mgr":
            return labels["mgr"]
        if kind == "nfs":
            return labels['instance']
        if kind == "rgw":
            return labels['ceph_daemon_id']
        return None

    descriptions = []
    for pod in self.rook_cluster.describe_pods(daemon_type, daemon_id, host_name):
        desc = orchestrator.DaemonDescription()
        desc.hostname = pod['hostname']
        desc.container_id = pod['name']
        desc.daemon_type = pod['labels']['app'].replace('rook-ceph-', '')
        desc.status = phase_to_status[pod['phase']]
        desc.status_desc = pod['phase']

        did = _daemon_id_for(desc.daemon_type, pod['labels'])
        if did is None:
            # Unknown type -- skip it
            continue
        desc.daemon_id = did
        descriptions.append(desc)
    return descriptions
def _list_daemons(self,
                  service_name: Optional[str] = None,
                  daemon_type: Optional[str] = None,
                  daemon_id: Optional[str] = None,
                  host: Optional[str] = None,
                  refresh: bool = False) -> List[orchestrator.DaemonDescription]:
    """Translate rook-managed pods into DaemonDescription records.

    Pods without a recognised daemon-id label are skipped, as are pods
    whose derived service name does not match ``service_name`` (when
    given).  ``refresh`` is accepted for interface compatibility but not
    used here.
    """
    pods = self.rook_cluster.describe_pods(daemon_type, daemon_id, host)
    self.log.debug('pods %s' % pods)
    result = []
    for p in pods:
        sd = orchestrator.DaemonDescription()
        sd.hostname = p['hostname']
        # daemon type is encoded in the pod's 'app' label, e.g. 'rook-ceph-mon'
        sd.daemon_type = p['labels']['app'].replace('rook-ceph-', '')
        status = {
            'Pending': orchestrator.DaemonDescriptionStatus.error,
            'Running': orchestrator.DaemonDescriptionStatus.running,
            'Succeeded': orchestrator.DaemonDescriptionStatus.stopped,
            'Failed': orchestrator.DaemonDescriptionStatus.error,
            'Unknown': orchestrator.DaemonDescriptionStatus.error,
        }[p['phase']]
        sd.status = status
        sd.status_desc = p['phase']

        # the daemon ID lives under one of two labels depending on type
        if 'ceph_daemon_id' in p['labels']:
            sd.daemon_id = p['labels']['ceph_daemon_id']
        elif 'ceph-osd-id' in p['labels']:
            sd.daemon_id = p['labels']['ceph-osd-id']
        else:
            # Unknown type -- skip it
            continue

        # filter must run after daemon_type/daemon_id are set, since
        # service_name() is derived from them
        if service_name is not None and service_name != sd.service_name():
            continue
        sd.container_image_name = p['container_image_name']
        sd.container_image_id = p['container_image_id']

        sd.created = p['created']
        sd.last_configured = p['created']
        sd.last_deployed = p['created']
        sd.started = p['started']
        sd.last_refresh = p['refreshed']

        result.append(sd)

    return result
def _list_daemons(self, daemon_type=None, daemon_id=None, host=None, refresh=False):
    """Translate rook-managed pods into DaemonDescription records."""
    phase_to_status = {
        'Pending': -1,
        'Running': 1,
        'Succeeded': 0,
        'Failed': -1,
        'Unknown': -1,
    }
    pods = self.rook_cluster.describe_pods(daemon_type, daemon_id, host)
    self.log.debug('pods %s' % pods)
    descriptions = []
    for pod in pods:
        desc = orchestrator.DaemonDescription()
        desc.hostname = pod['hostname']
        desc.container_id = pod['name']
        labels = pod['labels']
        desc.daemon_type = labels['app'].replace('rook-ceph-', '')
        desc.status = phase_to_status[pod['phase']]
        desc.status_desc = pod['phase']

        # the daemon ID lives under one of two labels depending on type
        if 'ceph_daemon_id' in labels:
            desc.daemon_id = labels['ceph_daemon_id']
        elif 'ceph-osd-id' in labels:
            desc.daemon_id = labels['ceph-osd-id']
        else:
            # Unknown type -- skip it
            continue

        desc.container_image_name = pod['container_image_name']
        desc.created = pod['created']
        desc.last_configured = pod['created']
        desc.last_deployed = pod['created']
        desc.started = pod['started']
        desc.last_refresh = pod['refreshed']
        descriptions.append(desc)
    return descriptions
def _apply_service(self, spec: ServiceSpec) -> bool:
    """
    Schedule a service.  Deploy new daemons or remove old ones, depending
    on the target label and count specified in the placement.

    :return: True if any daemon was created or removed, False otherwise
        (or when the spec is unmanaged/preview-only/osd).
    """
    self.mgr.migration.verify_no_migration()
    daemon_type = spec.service_type
    service_name = spec.service_name()
    if spec.unmanaged:
        self.log.debug('Skipping unmanaged service %s' % service_name)
        return False
    if spec.preview_only:
        self.log.debug('Skipping preview_only service %s' % service_name)
        return False
    self.log.debug('Applying service %s spec' % service_name)

    config_func = self._config_fn(daemon_type)

    if daemon_type == 'osd':
        self.mgr.osd_service.create_from_spec(cast(DriveGroupSpec, spec))
        # TODO: return True would result in a busy loop
        # can't know if daemon count changed; create_from_spec doesn't
        # return a solid indication
        return False

    daemons = self.mgr.cache.get_daemons_by_service(service_name)

    public_network = None
    if daemon_type == 'mon':
        # mons may only be placed on hosts with an IP in the public network
        ret, out, err = self.mgr.check_mon_command({
            'prefix': 'config get',
            'who': 'mon',
            'key': 'public_network',
        })
        if '/' in out:
            public_network = out.strip()
            self.log.debug('mon public_network is %s' % public_network)

    def matches_network(host):
        # type: (str) -> bool
        if not public_network:
            return False
        # make sure we have 1 or more IPs for that network on that
        # host
        return len(self.mgr.cache.networks[host].get(public_network, [])) > 0

    ha = HostAssignment(
        spec=spec,
        hosts=self.mgr._hosts_with_daemon_inventory(),
        get_daemons_func=self.mgr.cache.get_daemons_by_service,
        filter_new_host=matches_network if daemon_type == 'mon' else None,
    )

    hosts: List[HostPlacementSpec] = ha.place()
    self.log.debug('Usable hosts: %s' % hosts)

    # r tracks "did anything change"; None means "nothing attempted yet"
    r = None

    # sanity check
    if daemon_type in ['mon', 'mgr'] and len(hosts) < 1:
        self.log.debug('cannot scale mon|mgr below 1 (hosts=%s)' % hosts)
        return False

    # add any?
    did_config = False

    add_daemon_hosts: Set[HostPlacementSpec] = ha.add_daemon_hosts(hosts)
    self.log.debug('Hosts that will receive new daemons: %s' % add_daemon_hosts)

    remove_daemon_hosts: Set[orchestrator.DaemonDescription] = ha.remove_daemon_hosts(hosts)
    # fixed typo: 'loose' -> 'lose'
    self.log.debug('Hosts that will lose daemons: %s' % remove_daemon_hosts)

    for host, network, name in add_daemon_hosts:
        daemon_id = self.mgr.get_unique_name(
            daemon_type, host, daemons,
            prefix=spec.service_id,
            forcename=name)

        # run the per-service config function once, before the first daemon
        if not did_config and config_func:
            if daemon_type == 'rgw':
                rgw_config_func = cast(Callable[[RGWSpec, str], None], config_func)
                rgw_config_func(cast(RGWSpec, spec), daemon_id)
            else:
                config_func(spec)
            did_config = True

        daemon_spec = self.mgr.cephadm_services[daemon_type].make_daemon_spec(
            host, daemon_id, network, spec)
        self.log.debug('Placing %s.%s on host %s' % (daemon_type, daemon_id, host))

        try:
            daemon_spec = self.mgr.cephadm_services[daemon_type].prepare_create(daemon_spec)
            self.mgr._create_daemon(daemon_spec)
            r = True
        except (RuntimeError, OrchestratorError) as e:
            # fixed: the two f-string fragments concatenated without a
            # space, yielding e.g. 'mon.aon host1'
            self.mgr.events.for_service(
                spec, 'ERROR',
                f"Failed while placing {daemon_type}.{daemon_id} "
                f"on {host}: {e}")
            # only return "no change" if no one else has already succeeded.
            # later successes will also change to True
            if r is None:
                r = False
            continue

        # add to daemon list so next name(s) will also be unique
        sd = orchestrator.DaemonDescription(
            hostname=host,
            daemon_type=daemon_type,
            daemon_id=daemon_id,
        )
        daemons.append(sd)

    # remove any?
    def _ok_to_stop(remove_daemon_hosts: Set[orchestrator.DaemonDescription]) -> bool:
        daemon_ids = [d.daemon_id for d in remove_daemon_hosts]
        r = self.mgr.cephadm_services[daemon_type].ok_to_stop(daemon_ids)
        return not r.retval

    while remove_daemon_hosts and not _ok_to_stop(remove_daemon_hosts):
        # let's find a subset that is ok-to-stop
        remove_daemon_hosts.pop()
    for d in remove_daemon_hosts:
        r = True
        # NOTE: we are passing the 'force' flag here, which means
        # we can delete a mon instances data.
        self.mgr._remove_daemon(d.name(), d.hostname)

    if r is None:
        r = False
    return r
def _create_daemon(self,
                   daemon_spec: CephadmDaemonSpec,
                   reconfig: bool = False,
                   osd_uuid_map: Optional[Dict[str, Any]] = None,
                   ) -> str:
    """Deploy (or reconfigure) a single daemon on its host via cephadm.

    Builds the daemon's config/keyring payload, assembles the cephadm
    'deploy' command line, runs it on the host, and primes the mgr cache
    with the expected daemon state on success.

    :param daemon_spec: what/where to deploy
    :param reconfig: re-write config of an existing daemon instead of
        deploying a new one
    :param osd_uuid_map: optional pre-fetched osd id -> uuid map; fetched
        on demand for osd daemons if not given
    :return: human-readable status message
    :raises OrchestratorError: e.g. for an osd that is not in the osdmap
    """
    with set_exception_subject('service', orchestrator.DaemonDescription(
            daemon_type=daemon_spec.daemon_type,
            daemon_id=daemon_spec.daemon_id,
            hostname=daemon_spec.host,
    ).service_id(), overwrite=True):

        image = ''
        start_time = datetime_now()
        ports: List[int] = daemon_spec.ports if daemon_spec.ports else []

        if daemon_spec.daemon_type == 'container':
            spec: Optional[CustomContainerSpec] = daemon_spec.spec
            if spec is None:
                # Exit here immediately because the required service
                # spec to create a daemon is not provided. This is only
                # provided when a service is applied via 'orch apply'
                # command.
                msg = "Failed to {} daemon {} on {}: Required " \
                      "service specification not provided".format(
                          'reconfigure' if reconfig else 'deploy',
                          daemon_spec.name(), daemon_spec.host)
                self.log.info(msg)
                return msg
            image = spec.image
            if spec.ports:
                ports.extend(spec.ports)

        if daemon_spec.daemon_type == 'cephadm-exporter':
            if not reconfig:
                assert daemon_spec.host
                deploy_ok = self._deploy_cephadm_binary(daemon_spec.host)
                if not deploy_ok:
                    msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}"
                    self.log.warning(msg)
                    return msg

        # ha-rgw daemons may override the container image
        if daemon_spec.daemon_type == 'haproxy':
            haspec = cast(HA_RGWSpec, daemon_spec.spec)
            if haspec.haproxy_container_image:
                image = haspec.haproxy_container_image

        if daemon_spec.daemon_type == 'keepalived':
            haspec = cast(HA_RGWSpec, daemon_spec.spec)
            if haspec.keepalived_container_image:
                image = haspec.keepalived_container_image

        cephadm_config, deps = self.mgr.cephadm_services[daemon_type_to_service(
            daemon_spec.daemon_type)].generate_config(daemon_spec)

        # TCP port to open in the host firewall
        if len(ports) > 0:
            daemon_spec.extra_args.extend([
                '--tcp-ports', ' '.join(map(str, ports))
            ])

        # osd deployments needs an --osd-uuid arg
        if daemon_spec.daemon_type == 'osd':
            if not osd_uuid_map:
                osd_uuid_map = self.mgr.get_osd_uuid_map()
            osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
            if not osd_uuid:
                raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id)
            daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])

        if reconfig:
            daemon_spec.extra_args.append('--reconfig')
        if self.mgr.allow_ptrace:
            daemon_spec.extra_args.append('--allow-ptrace')

        if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url:
            self._registry_login(daemon_spec.host, self.mgr.registry_url,
                                 self.mgr.registry_username, self.mgr.registry_password)

        # config payload is passed on stdin
        daemon_spec.extra_args.extend(['--config-json', '-'])

        self.log.info('%s daemon %s on %s' % (
            'Reconfiguring' if reconfig else 'Deploying',
            daemon_spec.name(), daemon_spec.host))

        out, err, code = self._run_cephadm(
            daemon_spec.host, daemon_spec.name(), 'deploy',
            [
                '--name', daemon_spec.name(),
            ] + daemon_spec.extra_args,
            stdin=json.dumps(cephadm_config),
            image=image)
        if not code and daemon_spec.host in self.mgr.cache.daemons:
            # prime cached service state with what we (should have)
            # just created
            sd = orchestrator.DaemonDescription()
            sd.daemon_type = daemon_spec.daemon_type
            sd.daemon_id = daemon_spec.daemon_id
            sd.hostname = daemon_spec.host
            sd.status = 1
            sd.status_desc = 'starting'
            self.mgr.cache.add_daemon(daemon_spec.host, sd)
            if daemon_spec.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager']:
                self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
        self.mgr.cache.invalidate_host_daemons(daemon_spec.host)
        self.mgr.cache.update_daemon_config_deps(
            daemon_spec.host, daemon_spec.name(), deps, start_time)
        self.mgr.cache.save_host(daemon_spec.host)
        msg = "{} {} on host '{}'".format(
            'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host)
        if not code:
            self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg)
        else:
            what = 'reconfigure' if reconfig else 'deploy'
            self.mgr.events.for_daemon(
                daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
        return msg
def _apply_service(self, spec: ServiceSpec) -> bool:
    """
    Schedule a service.  Deploy new daemons or remove old ones, depending
    on the target label and count specified in the placement.

    :return: True if any daemon was created or removed, False otherwise
        (or when the spec is unmanaged/preview-only/osd, or placement
        failed).
    """
    self.mgr.migration.verify_no_migration()
    service_type = spec.service_type
    service_name = spec.service_name()
    if spec.unmanaged:
        self.log.debug('Skipping unmanaged service %s' % service_name)
        return False
    if spec.preview_only:
        self.log.debug('Skipping preview_only service %s' % service_name)
        return False
    self.log.debug('Applying service %s spec' % service_name)

    if service_type == 'osd':
        self.mgr.osd_service.create_from_spec(cast(DriveGroupSpec, spec))
        # TODO: return True would result in a busy loop
        # can't know if daemon count changed; create_from_spec doesn't
        # return a solid indication
        return False

    daemons = self.mgr.cache.get_daemons_by_service(service_name)

    public_network = None
    if service_type == 'mon':
        # mons may only be placed on hosts with an IP in the public network
        out = str(self.mgr.get_foreign_ceph_option('mon', 'public_network'))
        if '/' in out:
            public_network = out.strip()
            self.log.debug('mon public_network is %s' % public_network)

    def matches_network(host):
        # type: (str) -> bool
        if not public_network:
            return False
        # make sure we have 1 or more IPs for that network on that
        # host
        return len(self.mgr.cache.networks[host].get(public_network, [])) > 0

    def virtual_ip_allowed(host):
        # type: (str) -> bool
        # Verify that it is possible to use Virtual IPs in the host
        try:
            if self.mgr.cache.facts[host]['kernel_parameters']['net.ipv4.ip_nonlocal_bind'] == '0':
                return False
        except KeyError:
            return False
        return True

    ha = HostAssignment(
        spec=spec,
        hosts=self.mgr._hosts_with_daemon_inventory(),
        get_daemons_func=self.mgr.cache.get_daemons_by_service,
        filter_new_host=matches_network if service_type == 'mon'
        else virtual_ip_allowed if service_type == 'ha-rgw' else None,
    )

    try:
        hosts: List[HostPlacementSpec] = ha.place()
        self.log.debug('Usable hosts: %s' % hosts)
    except OrchestratorError as e:
        self.log.error('Failed to apply %s spec %s: %s' % (
            spec.service_name(), spec, e))
        self.mgr.events.for_service(spec, 'ERROR', 'Failed to apply: ' + str(e))
        return False

    # r tracks "did anything change"; None means "nothing attempted yet"
    r = None

    # sanity check
    if service_type in ['mon', 'mgr'] and len(hosts) < 1:
        self.log.debug('cannot scale mon|mgr below 1 (hosts=%s)' % hosts)
        return False

    # add any?
    did_config = False

    add_daemon_hosts: Set[HostPlacementSpec] = ha.add_daemon_hosts(hosts)
    self.log.debug('Hosts that will receive new daemons: %s' % add_daemon_hosts)

    remove_daemon_hosts: Set[orchestrator.DaemonDescription] = ha.remove_daemon_hosts(hosts)
    # fixed typo: 'loose' -> 'lose'
    self.log.debug('Hosts that will lose daemons: %s' % remove_daemon_hosts)

    if service_type == 'ha-rgw':
        spec = self.update_ha_rgw_definitive_hosts(spec, hosts, add_daemon_hosts)

    for host, network, name in add_daemon_hosts:
        for daemon_type in service_to_daemon_types(service_type):
            daemon_id = self.mgr.get_unique_name(
                daemon_type, host, daemons,
                prefix=spec.service_id,
                forcename=name)

            # run the per-service config step once, before the first daemon
            if not did_config:
                self.mgr.cephadm_services[service_type].config(spec, daemon_id)
                did_config = True

            daemon_spec = self.mgr.cephadm_services[service_type].make_daemon_spec(
                host, daemon_id, network, spec, daemon_type=daemon_type)
            self.log.debug('Placing %s.%s on host %s' % (daemon_type, daemon_id, host))

            try:
                daemon_spec = self.mgr.cephadm_services[service_type].prepare_create(daemon_spec)
                self._create_daemon(daemon_spec)
                r = True
            except (RuntimeError, OrchestratorError) as e:
                # fixed: the two f-string fragments concatenated without
                # a space, yielding e.g. 'mon.aon host1'
                self.mgr.events.for_service(
                    spec, 'ERROR',
                    f"Failed while placing {daemon_type}.{daemon_id} "
                    f"on {host}: {e}")
                # only return "no change" if no one else has already succeeded.
                # later successes will also change to True
                if r is None:
                    r = False
                continue

            # add to daemon list so next name(s) will also be unique
            sd = orchestrator.DaemonDescription(
                hostname=host,
                daemon_type=daemon_type,
                daemon_id=daemon_id,
            )
            daemons.append(sd)

    # remove any?
    def _ok_to_stop(remove_daemon_hosts: Set[orchestrator.DaemonDescription]) -> bool:
        daemon_ids = [d.daemon_id for d in remove_daemon_hosts]
        assert None not in daemon_ids
        # setting force flag retains previous behavior, should revisit later.
        r = self.mgr.cephadm_services[service_type].ok_to_stop(
            cast(List[str], daemon_ids), force=True)
        return not r.retval

    while remove_daemon_hosts and not _ok_to_stop(remove_daemon_hosts):
        # let's find a subset that is ok-to-stop
        remove_daemon_hosts.pop()
    for d in remove_daemon_hosts:
        r = True
        # NOTE: we are passing the 'force' flag here, which means
        # we can delete a mon instances data.
        assert d.hostname is not None
        self._remove_daemon(d.name(), d.hostname)

    if r is None:
        r = False
    return r
def _create_daemon(self,
                   daemon_spec: CephadmDaemonDeploySpec,
                   reconfig: bool = False,
                   osd_uuid_map: Optional[Dict[str, Any]] = None,
                   ) -> str:
    """Deploy (or reconfigure) a single daemon on its host via cephadm.

    Assembles the cephadm 'deploy' command line (including ports,
    meta-json and the config payload on stdin), runs it on the host, and
    primes the mgr cache with the expected daemon state on success.  On
    OrchestratorError during a fresh deploy (not a reconfig), the
    service's post_remove hook is run to clean up (e.g. keyrings) before
    re-raising.

    :param daemon_spec: what/where to deploy
    :param reconfig: re-write config of an existing daemon instead of
        deploying a new one
    :param osd_uuid_map: optional pre-fetched osd id -> uuid map; fetched
        on demand for osd daemons if not given
    :return: human-readable status message
    :raises OrchestratorError: e.g. for an osd that is not in the osdmap
    """
    with set_exception_subject('service', orchestrator.DaemonDescription(
            daemon_type=daemon_spec.daemon_type,
            daemon_id=daemon_spec.daemon_id,
            hostname=daemon_spec.host,
    ).service_id(), overwrite=True):
        try:
            image = ''
            start_time = datetime_now()
            ports: List[int] = daemon_spec.ports if daemon_spec.ports else []

            if daemon_spec.daemon_type == 'container':
                spec = cast(CustomContainerSpec,
                            self.mgr.spec_store[daemon_spec.service_name].spec)
                image = spec.image
                if spec.ports:
                    ports.extend(spec.ports)

            if daemon_spec.daemon_type == 'cephadm-exporter':
                if not reconfig:
                    assert daemon_spec.host
                    self._deploy_cephadm_binary(daemon_spec.host)

            # ha-rgw daemons may override the container image
            if daemon_spec.daemon_type == 'haproxy':
                haspec = cast(HA_RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
                if haspec.haproxy_container_image:
                    image = haspec.haproxy_container_image

            if daemon_spec.daemon_type == 'keepalived':
                haspec = cast(HA_RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
                if haspec.keepalived_container_image:
                    image = haspec.keepalived_container_image

            # TCP port to open in the host firewall
            if len(ports) > 0:
                daemon_spec.extra_args.extend([
                    '--tcp-ports', ' '.join(map(str, ports))
                ])

            # osd deployments needs an --osd-uuid arg
            if daemon_spec.daemon_type == 'osd':
                if not osd_uuid_map:
                    osd_uuid_map = self.mgr.get_osd_uuid_map()
                osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
                if not osd_uuid:
                    raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id)
                daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])

            if reconfig:
                daemon_spec.extra_args.append('--reconfig')
            if self.mgr.allow_ptrace:
                daemon_spec.extra_args.append('--allow-ptrace')

            if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url:
                self._registry_login(daemon_spec.host, self.mgr.registry_url,
                                     self.mgr.registry_username, self.mgr.registry_password)

            self.log.info('%s daemon %s on %s' % (
                'Reconfiguring' if reconfig else 'Deploying',
                daemon_spec.name(), daemon_spec.host))

            out, err, code = self._run_cephadm(
                daemon_spec.host, daemon_spec.name(), 'deploy',
                [
                    '--name', daemon_spec.name(),
                    '--meta-json', json.dumps({
                        'service_name': daemon_spec.service_name,
                        'ports': daemon_spec.ports,
                        'ip': daemon_spec.ip,
                    }),
                    '--config-json', '-',
                ] + daemon_spec.extra_args,
                stdin=json.dumps(daemon_spec.final_config),
                image=image)

            # refresh daemon state?  (ceph daemon reconfig does not need it)
            if not reconfig or daemon_spec.daemon_type not in CEPH_TYPES:
                if not code and daemon_spec.host in self.mgr.cache.daemons:
                    # prime cached service state with what we (should have)
                    # just created
                    sd = daemon_spec.to_daemon_description(
                        DaemonDescriptionStatus.running, 'starting')
                    self.mgr.cache.add_daemon(daemon_spec.host, sd)
                    if daemon_spec.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager']:
                        self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
                self.mgr.cache.invalidate_host_daemons(daemon_spec.host)

            self.mgr.cache.update_daemon_config_deps(
                daemon_spec.host, daemon_spec.name(), daemon_spec.deps, start_time)
            self.mgr.cache.save_host(daemon_spec.host)
            msg = "{} {} on host '{}'".format(
                'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host)
            if not code:
                self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg)
            else:
                what = 'reconfigure' if reconfig else 'deploy'
                self.mgr.events.for_daemon(
                    daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
            return msg
        except OrchestratorError:
            if not reconfig:
                # we have to clean up the daemon. E.g. keyrings.
                # fixed misspelled local: servict_type -> service_type
                service_type = daemon_type_to_service(daemon_spec.daemon_type)
                dd = daemon_spec.to_daemon_description(DaemonDescriptionStatus.error, 'failed')
                self.mgr.cephadm_services[service_type].post_remove(dd)
            raise