def _check_target_version(self, version: str) -> Optional[str]: try: (major, minor, patch) = version.split('.') assert int(minor) >= 0 # patch might be a number or {number}-g{sha1} except ValueError: return 'version must be in the form X.Y.Z (e.g., 15.2.3)' if int(major) < 15 or (int(major) == 15 and int(minor) < 2): return 'cephadm only supports octopus (15.2.0) or later' # to far a jump? current_version = self.mgr.version.split('ceph version ')[1] current_major, current_minor, current_patch = current_version.split('-')[0].split('.') if int(current_major) < int(major) - 2: return f'ceph can only upgrade 1 or 2 major versions at a time; {current_version} -> {version} is too big a jump' if int(current_major) > int(major): return f'ceph cannot downgrade major versions (from {current_version} to {version})' if int(current_major) == int(major): if int(current_minor) > int(minor): return f'ceph cannot downgrade to a {"rc" if minor == "1" else "dev"} release' # check mon min monmap = self.mgr.get("mon_map") mon_min = monmap.get("min_mon_release", 0) if mon_min < int(major) - 2: return f'min_mon_release ({mon_min}) < target {major} - 2; first complete an upgrade to an earlier release' # check osd min osdmap = self.mgr.get("osd_map") osd_min_name = osdmap.get("require_osd_release", "argonaut") osd_min = ceph_release_to_major(osd_min_name) if osd_min < int(major) - 2: return f'require_osd_release ({osd_min_name} or {osd_min}) < target {major} - 2; first complete an upgrade to an earlier release' return None
def _do_upgrade(self):
    # type: () -> None
    """Run one pass of the staged cluster upgrade state machine.

    Re-entrant worker: each invocation inspects self.upgrade_state,
    performs the next piece of work (pull image, restart daemons in
    CEPH_UPGRADE_ORDER, push configs, finalize), and returns early.
    The caller is expected to invoke it again until the state is
    cleared at the end (presumably from the serve loop — not visible
    here; confirm against caller).
    """
    if not self.upgrade_state:
        # no upgrade in progress; nothing to do
        logger.debug('_do_upgrade no state, exiting')
        return

    target_image = self.target_image
    target_id = self.upgrade_state.target_id
    target_digests = self.upgrade_state.target_digests
    target_version = self.upgrade_state.target_version

    first = False
    if not target_id or not target_version or not target_digests:
        # need to learn the container hash
        logger.info('Upgrade: First pull of %s' % target_image)
        self.upgrade_info_str = 'Doing first pull of %s image' % (
            target_image)
        try:
            target_id, target_version, target_digests = CephadmServe(
                self.mgr)._get_container_image_info(target_image)
        except OrchestratorError as e:
            # pull failed outright; raise a health warning and stop
            self._fail_upgrade('UPGRADE_FAILED_PULL', {
                'severity': 'warning',
                'summary': 'Upgrade: failed to pull target image',
                'count': 1,
                'detail': [str(e)],
            })
            return
        if not target_version:
            # image pulled but no ceph version could be read from it
            self._fail_upgrade('UPGRADE_FAILED_PULL', {
                'severity': 'warning',
                'summary': 'Upgrade: failed to pull target image',
                'count': 1,
                'detail': ['unable to extract ceph version from container'],
            })
            return
        self.upgrade_state.target_id = target_id
        # extract the version portion of 'ceph version {version} ({sha1})'
        self.upgrade_state.target_version = target_version.split(' ')[2]
        self.upgrade_state.target_digests = target_digests
        # persist the learned id/version/digests before acting on them
        self._save_upgrade_state()
        target_image = self.target_image
        first = True

    if target_digests is None:
        target_digests = []
    if target_version.startswith('ceph version '):
        # tolerate/fix upgrade state from older version
        self.upgrade_state.target_version = target_version.split(' ')[2]
        target_version = self.upgrade_state.target_version
    target_major, target_minor, target_patch = target_version.split('.')
    target_major_name = self.mgr.lookup_release_name(int(target_major))

    if first:
        logger.info('Upgrade: Target is version %s (%s)' % (
            target_version, target_major_name))
        logger.info('Upgrade: Target container is %s, digests %s' % (
            target_image, target_digests))

    # refuse unsupported targets (bad format, too-old, too-big jump, ...)
    version_error = self._check_target_version(target_version)
    if version_error:
        self._fail_upgrade('UPGRADE_BAD_TARGET_VERSION', {
            'severity': 'error',
            'summary': f'Upgrade: cannot upgrade/downgrade to {target_version}',
            'count': 1,
            'detail': [version_error],
        })
        return

    image_settings = self.get_distinct_container_image_settings()

    daemons = [d for d in self.mgr.cache.get_daemons()
               if d.daemon_type in CEPH_UPGRADE_ORDER]
    done = 0  # daemons already on the target digest (drives progress bar)
    for daemon_type in CEPH_UPGRADE_ORDER:
        logger.debug('Upgrade: Checking %s daemons' % daemon_type)

        need_upgrade_self = False
        need_upgrade: List[Tuple[DaemonDescription, bool]] = []
        need_upgrade_deployer: List[Tuple[DaemonDescription, bool]] = []
        for d in daemons:
            if d.daemon_type != daemon_type:
                continue
            assert d.daemon_type is not None
            assert d.daemon_id is not None
            correct_digest = False
            # NOTE: the generator variable 'd' below shadows the daemon
            # 'd' only inside the genexp; the outer 'd' is unaffected.
            if (any(d in target_digests for d in (d.container_image_digests or []))
                    or d.daemon_type in MONITORING_STACK_TYPES):
                logger.debug('daemon %s.%s container digest correct' % (
                    daemon_type, d.daemon_id))
                correct_digest = True
                if any(d in target_digests for d in (d.deployed_by or [])):
                    logger.debug('daemon %s.%s deployed by correct version' % (
                        d.daemon_type, d.daemon_id))
                    done += 1
                    continue

            if self.mgr.daemon_is_self(d.daemon_type, d.daemon_id):
                # the active mgr cannot redeploy itself; fail over instead
                logger.info('Upgrade: Need to upgrade myself (mgr.%s)' %
                            self.mgr.get_mgr_id())
                need_upgrade_self = True
                continue

            if correct_digest:
                logger.debug('daemon %s.%s not deployed by correct version' % (
                    d.daemon_type, d.daemon_id))
                need_upgrade_deployer.append((d, True))
            else:
                logger.debug('daemon %s.%s not correct (%s, %s, %s)' % (
                    daemon_type, d.daemon_id,
                    d.container_image_name, d.container_image_digests, d.version))
                need_upgrade.append((d, False))

        if not need_upgrade_self:
            # only after the mgr itself is upgraded can we expect daemons to have
            # deployed_by == target_digests
            need_upgrade += need_upgrade_deployer

        # prepare filesystems for daemon upgrades?
        if (daemon_type == 'mds'
                and need_upgrade
                and not self._prepare_for_mds_upgrade(
                    target_major, [d_entry[0] for d_entry in need_upgrade])):
            # mds prep not complete yet; try again on the next pass
            return

        if need_upgrade:
            self.upgrade_info_str = 'Currently upgrading %s daemons' % (daemon_type)

        # decide which of the pending daemons are safe to restart now
        to_upgrade: List[Tuple[DaemonDescription, bool]] = []
        known_ok_to_stop: List[str] = []
        for d_entry in need_upgrade:
            d = d_entry[0]
            assert d.daemon_type is not None
            assert d.daemon_id is not None
            assert d.hostname is not None

            if not d.container_image_id:
                if d.container_image_name == target_image:
                    logger.debug(
                        'daemon %s has unknown container_image_id but has correct image name' % (d.name()))
                    continue

            if known_ok_to_stop:
                # we already have an ok-to-stop set; only batch daemons in it
                if d.name() in known_ok_to_stop:
                    logger.info(f'Upgrade: {d.name()} is also safe to restart')
                    to_upgrade.append(d_entry)
                continue

            if d.daemon_type in ['mon', 'osd', 'mds']:
                # NOTE: known_ok_to_stop is an output argument for
                # _wait_for_ok_to_stop
                if not self._wait_for_ok_to_stop(d, known_ok_to_stop):
                    return

            to_upgrade.append(d_entry)

            # if we don't have a list of others to consider, stop now
            if not known_ok_to_stop:
                break

        num = 1
        for d_entry in to_upgrade:
            d = d_entry[0]
            assert d.daemon_type is not None
            assert d.daemon_id is not None
            assert d.hostname is not None

            self._update_upgrade_progress(done / len(daemons))

            # make sure host has latest container image
            out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                d.hostname, '', 'inspect-image', [],
                image=target_image, no_fsid=True, error_ok=True)
            if code or not any(d in target_digests for d in json.loads(''.join(out)).get(
                    'repo_digests', [])):
                logger.info('Upgrade: Pulling %s on %s' % (target_image, d.hostname))
                self.upgrade_info_str = 'Pulling %s image on host %s' % (
                    target_image, d.hostname)
                out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                    d.hostname, '', 'pull', [],
                    image=target_image, no_fsid=True, error_ok=True)
                if code:
                    self._fail_upgrade('UPGRADE_FAILED_PULL', {
                        'severity': 'warning',
                        'summary': 'Upgrade: failed to pull target image',
                        'count': 1,
                        'detail': [
                            'failed to pull %s on host %s' % (target_image, d.hostname)
                        ],
                    })
                    return
                r = json.loads(''.join(out))
                if not any(d in target_digests for d in r.get('repo_digests', [])):
                    # the registry tag moved to new digests; restart the
                    # whole pass against the new digests
                    logger.info('Upgrade: image %s pull on %s got new digests %s (not %s), restarting' % (
                        target_image, d.hostname, r['repo_digests'], target_digests))
                    self.upgrade_info_str = 'Image %s pull on %s got new digests %s (not %s), restarting' % (
                        target_image, d.hostname, r['repo_digests'], target_digests)
                    self.upgrade_state.target_digests = r['repo_digests']
                    self._save_upgrade_state()
                    return

                self.upgrade_info_str = 'Currently upgrading %s daemons' % (daemon_type)

            if len(to_upgrade) > 1:
                logger.info('Upgrade: Updating %s.%s (%d/%d)' % (
                    d.daemon_type, d.daemon_id, num, len(to_upgrade)))
            else:
                logger.info('Upgrade: Updating %s.%s' % (
                    d.daemon_type, d.daemon_id))
            # d_entry[1] is True for the "wrong deployer" case (see above)
            action = 'Upgrading' if not d_entry[1] else 'Redeploying'
            try:
                daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(d)
                self.mgr._daemon_action(
                    daemon_spec,
                    'redeploy',
                    image=target_image if not d_entry[1] else None
                )
            except Exception as e:
                self._fail_upgrade('UPGRADE_REDEPLOY_DAEMON', {
                    'severity': 'warning',
                    'summary': f'{action} daemon {d.name()} on host {d.hostname} failed.',
                    'count': 1,
                    'detail': [f'Upgrade daemon: {d.name()}: {e}'],
                })
                return
            num += 1

        if to_upgrade:
            # work was issued this pass; come back later to re-evaluate
            return

        # complete mon upgrade?
        if daemon_type == 'mon':
            if not self.mgr.get("have_local_config_map"):
                logger.info('Upgrade: Restarting mgr now that mons are running pacific')
                need_upgrade_self = True

        if need_upgrade_self:
            try:
                self.mgr.mgr_service.fail_over()
            except OrchestratorError as e:
                self._fail_upgrade('UPGRADE_NO_STANDBY_MGR', {
                    'severity': 'warning',
                    'summary': f'Upgrade: {e}',
                    'count': 1,
                    'detail': [
                        'The upgrade process needs to upgrade the mgr, '
                        'but it needs at least one standby to proceed.',
                    ],
                })
                return

            return  # unreachable code, as fail_over never returns
        elif daemon_type == 'mgr':
            # mgr upgrade done; clear any stale no-standby health warning
            if 'UPGRADE_NO_STANDBY_MGR' in self.mgr.health_checks:
                del self.mgr.health_checks['UPGRADE_NO_STANDBY_MGR']
                self.mgr.set_health_checks(self.mgr.health_checks)

        # make sure 'ceph versions' agrees
        ret, out_ver, err = self.mgr.check_mon_command({
            'prefix': 'versions',
        })
        j = json.loads(out_ver)
        for version, count in j.get(daemon_type, {}).items():
            # version strings look like 'ceph version {ver} ({sha1})'
            short_version = version.split(' ')[2]
            if short_version != target_version:
                logger.warning(
                    'Upgrade: %d %s daemon(s) are %s != target %s' % (
                        count, daemon_type, short_version, target_version))

        # push down configs
        daemon_type_section = name_to_config_section(daemon_type)
        if image_settings.get(daemon_type_section) != target_image:
            logger.info('Upgrade: Setting container_image for all %s' %
                        daemon_type)
            self.mgr.set_container_image(daemon_type_section, target_image)
        to_clean = []
        for section in image_settings.keys():
            if section.startswith(name_to_config_section(daemon_type) + '.'):
                to_clean.append(section)
        if to_clean:
            logger.debug('Upgrade: Cleaning up container_image for %s' %
                         to_clean)
            for section in to_clean:
                ret, image, err = self.mgr.check_mon_command({
                    'prefix': 'config rm',
                    'name': 'container_image',
                    'who': section,
                })

        logger.debug('Upgrade: All %s daemons are up to date.' % daemon_type)

        # complete osd upgrade?
        if daemon_type == 'osd':
            osdmap = self.mgr.get("osd_map")
            osd_min_name = osdmap.get("require_osd_release", "argonaut")
            osd_min = ceph_release_to_major(osd_min_name)
            if osd_min < int(target_major):
                logger.info(
                    f'Upgrade: Setting require_osd_release to {target_major} {target_major_name}')
                ret, _, err = self.mgr.check_mon_command({
                    'prefix': 'osd require-osd-release',
                    'release': target_major_name,
                })

        # complete mds upgrade?
        if daemon_type == 'mds' and self.upgrade_state.fs_original_max_mds:
            # restore the max_mds values that were scaled down before the
            # mds upgrade (presumably by the upgrade-start path — confirm)
            for i in self.mgr.get("fs_map")['filesystems']:
                fs_id = i["id"]
                fs_name = i['mdsmap']['fs_name']
                new_max = self.upgrade_state.fs_original_max_mds.get(fs_id)
                if new_max:
                    self.mgr.log.info('Upgrade: Scaling up filesystem %s max_mds to %d' % (
                        fs_name, new_max))
                    ret, _, err = self.mgr.check_mon_command({
                        'prefix': 'fs set',
                        'fs_name': fs_name,
                        'var': 'max_mds',
                        'val': str(new_max),
                    })
            self.upgrade_state.fs_original_max_mds = {}
            self._save_upgrade_state()

    # clean up
    logger.info('Upgrade: Finalizing container_image settings')
    self.mgr.set_container_image('global', target_image)
    for daemon_type in CEPH_UPGRADE_ORDER:
        # per-daemon-type overrides are no longer needed once 'global' is set
        ret, image, err = self.mgr.check_mon_command({
            'prefix': 'config rm',
            'name': 'container_image',
            'who': name_to_config_section(daemon_type),
        })

    logger.info('Upgrade: Complete!')
    if self.upgrade_state.progress_id:
        self.mgr.remote('progress', 'complete',
                        self.upgrade_state.progress_id)
    # clearing the state marks the upgrade finished for future passes
    self.upgrade_state = None
    self._save_upgrade_state()
    return