async def get_pubkey() -> str:
    """Return the public key reported by the orchestrator.

    Raises:
        HTTPException: status 500, carrying the error text as detail, if
            creating the orchestrator or fetching the key raises.
    """
    try:
        pubkey = Orchestrator().get_public_key()
    except Exception as err:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(err),
        )
    return pubkey
def get_hosts() -> List[HostModel]:
    """List the orchestrator's hosts as ``HostModel`` entries.

    Each entry carries the host's ``hostname`` and its ``addr`` (exposed
    as ``address``), in the order returned by ``host_ls()``.
    """
    return [
        HostModel(hostname=entry.hostname, address=entry.addr)
        for entry in Orchestrator().host_ls()
    ]
def get_devices(
    request: Request, _=Depends(jwt_auth_scheme)
) -> Dict[str, HostsDevicesModel]:
    """Return the devices of every host, keyed by host name.

    The orchestrator is built from the ceph mgr held in the application's
    global state. Each orchestrator device is converted into a
    ``DeviceModel``; the reported size is coerced to ``int``.
    """
    orch = Orchestrator(request.app.state.gstate.ceph_mgr)
    per_host: List[OrchDevicesPerHostModel] = orch.devices_ls()

    result: Dict[str, HostsDevicesModel] = {}
    for entry in per_host:
        converted: List[DeviceModel] = [
            DeviceModel(
                available=dev.available,
                device_id=dev.device_id,
                model=dev.sys_api.model,
                vendor=dev.sys_api.vendor,
                human_readable_type=dev.human_readable_type,
                size=int(dev.sys_api.size),
                path=dev.path,
                rejected_reasons=dev.rejected_reasons,
            )
            for dev in entry.devices
        ]
        result[entry.name] = HostsDevicesModel(
            address=entry.addr,
            hostname=entry.name,
            devices=converted,
        )
    return result
async def all_devices_assimilated() -> bool:
    """Return the orchestrator's ``all_devices_assimilated()`` result.

    Raises:
        HTTPException: status 500, carrying the error text as detail, if
            the orchestrator cannot be created or the query raises.
    """
    try:
        assimilated = Orchestrator().all_devices_assimilated()
    except Exception as err:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(err),
        )
    return assimilated
def get_hosts(request: Request, _=Depends(jwt_auth_scheme)) -> List[HostModel]:
    """List the orchestrator's hosts as ``HostModel`` entries.

    The orchestrator is built from the ceph mgr held in the application's
    global state; every host's ``addr`` is exposed as ``address``.
    """
    ceph_mgr = request.app.state.gstate.ceph_mgr
    return [
        HostModel(hostname=entry.hostname, address=entry.addr)
        for entry in Orchestrator(ceph_mgr).host_ls()
    ]
async def _handle_ready_to_add(
    self, conn: IncomingConnection, msg: ReadyToAddMessageModel
) -> None:
    """Add a previously joining node to the orchestrator.

    The node is looked up by the connection's address. An unknown address
    gets an ERROR reply (HTTP 428) and nothing else happens. Otherwise the
    host is added to the orchestrator, the replicated ruleset is set and
    the pool default size is adjusted. Failures of the first two steps are
    only logged. The contents of ``msg`` are not inspected.
    """
    logger.debug(f"handle ready to add from {conn}")
    peer: str = conn.address

    if peer not in self._joining:
        logger.info(f"handle ready to add > unknown node {conn}")
        rejection = ErrorMessageModel(
            what="node not joining",
            code=status.HTTP_428_PRECONDITION_REQUIRED,
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=rejection)
        )
        return

    joining: JoiningNodeModel = self._joining[peer]
    logger.info(
        f"handle ready to add > hostname: {joining.hostname}, "
        f"address: {joining.address}"
    )

    added = Orchestrator(self.gstate.ceph_mgr).host_add(
        joining.hostname, joining.address
    )
    if not added:
        logger.error("handle ready > failed adding host to orch")

    # reset default crush ruleset, and adjust pools to use a multi-node
    # ruleset, spreading replicas across hosts rather than osds.
    if not self.gstate.ceph_mon.set_replicated_ruleset():
        logger.error("handle ready to add > unable to set replicated ruleset")

    await self._set_pool_default_size()
async def get_pubkey(request: Request, _=Depends(jwt_auth_scheme)) -> str:
    """Return the public key reported by the orchestrator.

    The orchestrator is built from the ceph mgr held in the application's
    global state.

    Raises:
        HTTPException: status 500, carrying the error text as detail, if
            any step raises.
    """
    try:
        ceph_mgr = request.app.state.gstate.ceph_mgr
        pubkey = Orchestrator(ceph_mgr).get_public_key()
    except Exception as err:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(err),
        )
    return pubkey
def get_hosts() -> HostsReplyModel:
    """Return the orchestrator's hosts wrapped in a ``HostsReplyModel``.

    The reply starts with an empty ``hosts`` list, which is then filled
    with one ``HostModel`` per host returned by ``host_ls()``.
    """
    known_hosts = Orchestrator().host_ls()
    reply: HostsReplyModel = HostsReplyModel(hosts=[])
    reply.hosts.extend(
        HostModel(hostname=entry.hostname, address=entry.addr)
        for entry in known_hosts
    )
    return reply
async def _handle_join(
    self, conn: IncomingConnection, msg: JoinMessageModel
) -> None:
    """Answer a JOIN request with a WELCOME message.

    A wrong token gets an ERROR reply (HTTP 401); a missing address or
    hostname gets an ERROR reply (HTTP 400). Otherwise the orchestrator's
    public key plus the local ``ceph.conf`` and admin keyring are sent in
    a WELCOME message, and the node is recorded in ``self._joining`` under
    the connection's address. If sending the welcome fails the error is
    logged and the node is not recorded.
    """
    logger.debug(f"handle join {msg}")
    assert self._state is not None

    if msg.token != self._token:
        logger.info(f"handle join > bad token from {conn}")
        bad_token = ErrorMessageModel(
            what="bad token", code=status.HTTP_401_UNAUTHORIZED
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=bad_token)
        )
        return

    if not (msg.address and msg.hostname):
        logger.info(f"handle join > missing address or host from {conn}")
        incomplete = ErrorMessageModel(
            what="missing address or hostname",
            code=status.HTTP_400_BAD_REQUEST,
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=incomplete)
        )
        return

    pubkey: str = Orchestrator(self.gstate.ceph_mgr).get_public_key()

    conf_file: Path = Path("/etc/ceph/ceph.conf")
    keyring_file: Path = Path("/etc/ceph/ceph.client.admin.keyring")
    assert conf_file.exists()
    assert keyring_file.exists()

    cephconf: str = conf_file.read_text("utf-8")
    keyring: str = keyring_file.read_text("utf-8")
    assert len(cephconf) > 0
    assert len(keyring) > 0

    logger.debug(f"handle join > pubkey: {pubkey}")
    welcome = WelcomeMessageModel(
        pubkey=pubkey, cephconf=cephconf, keyring=keyring
    )

    # NOTE(review): the debug messages below print the whole welcome
    # model, which includes the admin keyring -- confirm this is intended.
    try:
        logger.debug(f"handle join > send welcome: {welcome}")
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.WELCOME, data=welcome.dict())
        )
    except Exception as err:
        logger.error(f"handle join > error: {str(err)}")
        return
    else:
        logger.debug(f"handle join > welcome sent: {welcome}")
        self._joining[conn.address] = JoiningNodeModel(
            address=msg.address, hostname=msg.hostname
        )
async def assimilate_devices() -> bool:
    """Ask the orchestrator to assimilate all devices.

    Returns:
        ``True`` when the request was issued without raising; ``False``
        when any exception occurred (the error text is logged).
    """
    try:
        Orchestrator().assimilate_all_devices()
    except Exception as err:
        logger.error(str(err))
        return False
    else:
        return True
def test_device_ls(
    get_data_contents: Callable[[str, str], str],
    mocker: MockerFixture,
    gstate: GlobalState,
) -> None:
    """``devices_ls()`` parses the canned device listing.

    ``Orchestrator.call`` is replaced with a mock returning the decoded
    ``device_ls_not_available.json`` fixture; the first parsed host must
    be named ``asd``.
    """
    canned = json.loads(
        get_data_contents(DATA_DIR, "device_ls_not_available.json")
    )
    orch = Orchestrator(gstate.ceph_mgr)
    orch.call = mocker.MagicMock(return_value=canned)

    listing: List[OrchDevicesPerHostModel] = orch.devices_ls()
    assert listing[0].name == "asd"
def create(self, name: str) -> None:
    """Create a filesystem volume and schedule MDS instances for it.

    Issues ``fs volume create`` through the mgr, then asks the
    orchestrator to apply MDS for the same name.

    Raises:
        CephFSError: if the mgr command raises ``CephCommandError``.
    """
    try:
        # this is expected to be a silent command
        self.mgr.call({"prefix": "fs volume create", "name": name})
    except CephCommandError as err:
        raise CephFSError(err) from err

    # schedule orchestrator to update the number of mds instances
    Orchestrator().apply_mds(name)
async def finish_bootstrap_cb(
    success: bool, error: Optional[str]
) -> None:
    """Finish the deployment once the bootstrap step has completed.

    On bootstrap failure the deployment state is marked with
    ``CANT_BOOTSTRAP``. ``post_bootstrap_cb`` is then invoked, after which
    the function waits up to 30 seconds for the host to show up in the
    orchestrator and assimilates devices. ``finisher`` is called with the
    outcome, and the KV store connection is ensured at the end.

    Args:
        success: whether the bootstrap step succeeded.
        error: error description, if any; replaced by a default text when
            empty on failure.
    """
    if not success:
        logger.error(f"bootstrap finish error: {error}")
        assert self._state.bootstrapping
        if not error:
            error = "unable to bootstrap"
        self._state.mark_error(
            code=DeploymentErrorEnum.CANT_BOOTSTRAP, msg=error
        )

    await post_bootstrap_cb(success, error)

    try:
        orch = Orchestrator(self._gstate.ceph_mgr)
        logger.debug("deployment > wait for host to be added")
        await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
    except asyncio.TimeoutError:
        # asyncio.wait_for raises asyncio.TimeoutError, which is only an
        # alias of the builtin TimeoutError from Python 3.11 onwards.
        logger.error("deployment > timeout wait for host to be added")
        errmsg = "node not bootstrapped until timeout expired"
        self._state.mark_error(
            code=DeploymentErrorEnum.CANT_BOOTSTRAP, msg=errmsg
        )
        await finisher(False, errmsg)
        # The failure has been reported; do not go on to assimilate
        # devices and report a second outcome for the same deployment.
        return

    try:
        await _assimilate_devices()
    except DeploymentError as e:
        logger.error("unable to assimilate devices")
        logger.exception(e)
        self._state.mark_error(
            code=DeploymentErrorEnum.CANT_ASSIMILATE, msg=e.message
        )
        await finisher(False, e.message)
    else:
        self._progress = ProgressEnum.DONE
        await finisher(True, None)

    # By now, the KV store connection thread will have well and
    # truly found the cluster and connected to it. Still, for the
    # sake of completeness, let's give it a kick here to make it
    # explicit.
    await self._gstate.store.ensure_connection()
async def _handle_ready_to_add(
    self, conn: IncomingConnection, msg: ReadyToAddMessageModel
) -> None:
    """Add a previously joining node to the orchestrator.

    The node is looked up by the connection's address. An unknown address
    gets an ERROR reply (HTTP 428). Otherwise the host is added to the
    orchestrator; a failure to add is only logged. The contents of
    ``msg`` are not inspected.
    """
    logger.debug(f"handle ready to add from {conn}")
    peer: str = conn.address

    if peer not in self._joining:
        logger.info(f"handle ready to add > unknown node {conn}")
        rejection = ErrorMessageModel(
            what="node not joining",
            code=status.HTTP_428_PRECONDITION_REQUIRED,
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=rejection)
        )
        return

    joining: JoiningNodeModel = self._joining[peer]
    logger.info(
        f"handle ready to add > hostname: {joining.hostname}, "
        f"address: {joining.address}"
    )

    added = Orchestrator().host_add(joining.hostname, joining.address)
    if not added:
        logger.error("handle ready > failed adding host to orch")
async def probe(self) -> None:
    """Refresh the cached OSD-to-device mapping and utilization.

    Combines the orchestrator's device listing with the monitor's
    ``osd df`` output. For every logical volume that carries an OSD id on
    a non-available device, a ``DeviceModel`` is recorded; utilization is
    attached for each OSD that also appears in ``osd df``. The results
    replace ``self._osds_per_host`` and ``self._osd_entries``. When either
    source is empty the cached values are left untouched.
    """
    logger.debug("probe devices")
    orch: Orchestrator = Orchestrator(self.ceph_mgr)
    mon: Mon = self.ceph_mon

    device_lst: List[OrchDevicesPerHostModel] = orch.devices_ls()
    osd_df: CephOSDDFModel = mon.osd_df()
    if not device_lst or not osd_df.nodes:
        logger.debug("probe > no devices to probe")
        return

    osds_per_host: Dict[str, List[int]] = {}
    osd_entries: Dict[int, DeviceModel] = {}

    for hostdevs in device_lst:
        host: str = hostdevs.name
        devs: List[VolumeDeviceModel] = hostdevs.devices
        osds: List[int] = []

        for dev in devs:
            if dev.available or not dev.lvs:
                continue

            for lv in dev.lvs:
                # Compare against None explicitly: OSD ids start at 0,
                # and a truthiness test would discard osd.0.
                if lv.osd_id is None:
                    # not a ceph lv
                    continue
                osd_entries[lv.osd_id] = DeviceModel(
                    host=host,
                    osd_id=lv.osd_id,
                    path=dev.path,
                    rotational=dev.sys_api.rotational,
                    vendor=dev.sys_api.vendor,
                    model=dev.sys_api.model,
                )
                osds.append(lv.osd_id)

        osds_per_host[host] = osds

    for osd in osd_df.nodes:
        if osd.id not in osd_entries:
            continue
        osd_entries[osd.id].utilization = DeviceUtilizationModel(
            total_kb=osd.kb,
            avail_kb=osd.kb_avail,
            used_kb=osd.kb_used,
            utilization=osd.utilization,
        )

    self._osds_per_host = osds_per_host
    self._osd_entries = osd_entries
async def _handle_join(
    self, conn: IncomingConnection, msg: JoinMessageModel
) -> None:
    """Answer a JOIN request with a WELCOME message.

    A wrong token gets an ERROR reply (HTTP 401); a missing address or
    hostname gets an ERROR reply (HTTP 400). Otherwise the orchestrator's
    public key is sent in a WELCOME message and the node is recorded in
    ``self._joining`` under the connection's address. If sending the
    welcome fails the error is logged and the node is not recorded.
    """
    logger.debug(f"handle join {msg}")
    assert self._state is not None

    if msg.token != self._token:
        logger.info(f"handle join > bad token from {conn}")
        bad_token = ErrorMessageModel(
            what="bad token", code=status.HTTP_401_UNAUTHORIZED
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=bad_token)
        )
        return

    if not (msg.address and msg.hostname):
        logger.info(f"handle join > missing address or host from {conn}")
        incomplete = ErrorMessageModel(
            what="missing address or hostname",
            code=status.HTTP_400_BAD_REQUEST,
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=incomplete)
        )
        return

    pubkey: str = Orchestrator().get_public_key()
    logger.debug(f"handle join > pubkey: {pubkey}")
    welcome = WelcomeMessageModel(pubkey=pubkey)

    try:
        logger.debug(f"handle join > send welcome: {welcome}")
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.WELCOME, data=welcome.dict())
        )
    except Exception as err:
        logger.error(f"handle join > error: {str(err)}")
        return
    else:
        logger.debug(f"handle join > welcome sent: {welcome}")
        self._joining[conn.address] = JoiningNodeModel(
            address=msg.address, hostname=msg.hostname
        )
async def _assimilate_devices(self, hostname: str, devices: List[str]) -> None:
    """Assimilate ``devices`` on ``hostname`` and wait until it is done.

    Polls the orchestrator once per second until it reports the devices
    as assimilated; there is no upper bound on the wait.

    Args:
        hostname: host whose devices are to be assimilated.
        devices: device paths to assimilate.

    Raises:
        DeploymentError: if the host is not known to the orchestrator, or
            if any orchestrator call raises (the original exception is
            chained as the cause).
    """
    try:
        orch = Orchestrator(self._gstate.ceph_mgr)
        if not orch.host_exists(hostname):
            raise DeploymentError("Host not part of cluster.")
        orch.assimilate_devices(hostname, devices)

        # wait a few seconds so the orchestrator settles down
        while not orch.devices_assimilated(hostname, devices):
            await asyncio.sleep(1.0)
    except DeploymentError:
        # Already the right type; re-wrapping would only obscure it.
        raise
    except Exception as e:
        raise DeploymentError(str(e)) from e
def test_devices_assimilated(
    get_data_contents: Callable[[str, str], str],
    mocker: MockerFixture,
    gstate: GlobalState,
) -> None:
    """``devices_assimilated()`` follows the availability of the devices.

    ``devices_ls`` is mocked twice from the same parsed fixture: first
    unmodified, then with the second device of the first host flagged as
    available.
    """

    def device_ls_gen():
        # Yields the same parsed listing twice; the second time after
        # marking one device as available.
        parsed = parse_obj_as(
            List[OrchDevicesPerHostModel],
            json.loads(
                get_data_contents(DATA_DIR, "device_ls_not_available.json")
            ),
        )
        yield parsed
        parsed[0].devices[1].available = True
        yield parsed

    from gravel.controllers.orch.orchestrator import Orchestrator

    orch = Orchestrator(gstate.ceph_mgr)
    listings = device_ls_gen()

    untouched = next(listings)
    orch.devices_ls = mocker.MagicMock(return_value=untouched)
    assert orch.devices_assimilated("asd", ["/dev/vdb", "/dev/vdc"])

    one_available = next(listings)
    orch.devices_ls = mocker.MagicMock(return_value=one_available)
    assert not orch.devices_assimilated("asd", ["/dev/vdc"])
def get_target_size():
    """Return 2 when the orchestrator lists fewer than three hosts, else 3."""
    known_hosts: List[OrchHostListModel] = Orchestrator(
        self.gstate.ceph_mgr
    ).host_ls()
    if len(known_hosts) < 3:
        return 2
    return 3
async def join(
    self,
    leader_address: str,
    token: str,
    uuid: UUID,
    hostname: str,
    address: str,
    disks: DeploymentDisksConfig,
) -> bool:
    """Join this node to an existing cluster through its leader.

    Sends a JOIN message to the leader over a websocket. On a WELCOME
    reply the system disk is created and enabled, the hostname is set, and
    the received public key, ``ceph.conf`` and admin keyring are written
    out. The NTP address is then read from the KV store, READY_TO_ADD is
    sent, and the function waits up to 30 seconds for the host to be added
    before assimilating the storage devices.

    Returns:
        ``True`` once the node is marked ready; ``False`` when the leader
        answered with an ERROR message (the state is marked ``CANT_JOIN``).

    Raises:
        NodeBootstrappingError, NodeHasBeenDeployedError,
        NodeAlreadyJoiningError, NodeHasJoinedError: if the node is
            already in the corresponding stage.
        NodeCantJoinError: if preparing the system disk fails, the host is
            not added in time, or device assimilation fails.
    """
    # NOTE(review): the join token is written to the debug log -- confirm
    # this is acceptable.
    logger.debug(f"join > with leader {leader_address}, token: {token}")
    assert self._state
    assert hostname
    assert address

    if self._state.bootstrapping:
        raise NodeBootstrappingError()
    elif self._state.deployed:
        raise NodeHasBeenDeployedError()
    elif self._state.joining:
        raise NodeAlreadyJoiningError()
    elif self._state.ready:
        raise NodeHasJoinedError()
    assert self._state.nostage

    uri: str = f"ws://{leader_address}/api/nodes/ws"
    conn = await self._connmgr.connect(uri)
    logger.debug(f"join > conn: {conn}")

    joinmsg = JoinMessageModel(
        uuid=uuid, hostname=hostname, address=address, token=token
    )
    msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
    await conn.send(msg)

    reply: MessageModel = await conn.receive()
    logger.debug(f"join > recv: {reply}")
    if reply.type == MessageTypeEnum.ERROR:
        errmsg = ErrorMessageModel.parse_obj(reply.data)
        logger.error(f"join > error: {errmsg.what}")
        await conn.close()
        self._state.mark_error(
            code=DeploymentErrorEnum.CANT_JOIN, msg=errmsg.what
        )
        return False

    assert reply.type == MessageTypeEnum.WELCOME
    welcome = WelcomeMessageModel.parse_obj(reply.data)
    assert welcome.pubkey
    assert welcome.cephconf
    assert welcome.keyring

    # create system disk after we are certain we are joining.
    # ensure all state writes happen only after the disk has been created.
    systemdisk = SystemDisk(self._gstate)
    try:
        await systemdisk.create(disks.system)
        await systemdisk.enable()
    except GravelError as e:
        raise NodeCantJoinError(e.message) from e

    self._state.mark_join()

    await self._set_hostname(hostname)

    authorized_keys: Path = Path("/root/.ssh/authorized_keys")
    if not authorized_keys.parent.exists():
        authorized_keys.parent.mkdir(0o700)
    with authorized_keys.open("a") as fd:
        fd.writelines([welcome.pubkey])
        logger.debug(f"join > wrote pubkey to {authorized_keys}")

    cephconf_path: Path = Path("/etc/ceph/ceph.conf")
    keyring_path: Path = Path("/etc/ceph/ceph.client.admin.keyring")
    if not cephconf_path.parent.exists():
        cephconf_path.parent.mkdir(0o755)
    cephconf_path.write_text(welcome.cephconf)
    keyring_path.write_text(welcome.keyring)
    keyring_path.chmod(0o600)
    cephconf_path.chmod(0o644)

    # We've got ceph.conf and the admin keyring now, kick the kvstore
    # to get a connection.
    await self._gstate.store.ensure_connection()

    # get NTP address
    ntp_addr = await self._gstate.store.get("/nodes/ntp_addr")
    assert ntp_addr
    await self._set_ntp_addr(ntp_addr)

    readymsg = ReadyToAddMessageModel()
    await conn.send(
        MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg)
    )
    await conn.close()

    logger.debug("join > wait for host to be added")
    orch = Orchestrator(self._gstate.ceph_mgr)
    try:
        await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
    except asyncio.TimeoutError:
        # asyncio.wait_for raises asyncio.TimeoutError, which is only an
        # alias of the builtin TimeoutError from Python 3.11 onwards.
        logger.error("join > timeout waiting for host to be added")
        raise NodeCantJoinError("host was not added to the cluster")
    logger.debug("join > host added, continue")

    try:
        await self._assimilate_devices(hostname, disks.storage)
    except DeploymentError as e:
        raise NodeCantJoinError(e.message) from e

    self._state.mark_ready()
    return True
async def join(
    self,
    leader_address: str,
    token: str,
    uuid: UUID,
    hostname: str,
    address: str,
    disks: DeploymentDisksConfig,
) -> bool:
    """Join this node to an existing cluster through its leader.

    Sends a JOIN message to the leader over a websocket. On a WELCOME
    reply the node is marked as joining and prepared via
    ``_prepare_node`` with the received public key, keyring and
    ``ceph.conf``. READY_TO_ADD is then sent, and the function waits up to
    30 seconds for the host to be added before assimilating the storage
    devices.

    Returns:
        ``True`` once the node is marked ready; ``False`` when the leader
        answered with an ERROR message (the state is marked ``CANT_JOIN``).

    Raises:
        NodeBootstrappingError, NodeHasBeenDeployedError,
        NodeAlreadyJoiningError, NodeHasJoinedError: if the node is
            already in the corresponding stage.
        NodeCantJoinError: if the host is not added in time or device
            assimilation fails.
    """
    # NOTE(review): the join token is written to the debug log -- confirm
    # this is acceptable.
    logger.debug(f"join > with leader {leader_address}, token: {token}")
    assert self._state
    assert hostname
    assert address

    if self._state.bootstrapping:
        raise NodeBootstrappingError()
    elif self._state.deployed:
        raise NodeHasBeenDeployedError()
    elif self._state.joining:
        raise NodeAlreadyJoiningError()
    elif self._state.ready:
        raise NodeHasJoinedError()
    assert self._state.nostage

    uri: str = f"ws://{leader_address}/api/nodes/ws"
    conn = await self._connmgr.connect(uri)
    logger.debug(f"join > conn: {conn}")

    joinmsg = JoinMessageModel(
        uuid=uuid, hostname=hostname, address=address, token=token
    )
    msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
    await conn.send(msg)

    reply: MessageModel = await conn.receive()
    logger.debug(f"join > recv: {reply}")
    if reply.type == MessageTypeEnum.ERROR:
        errmsg = ErrorMessageModel.parse_obj(reply.data)
        logger.error(f"join > error: {errmsg.what}")
        await conn.close()
        self._state.mark_error(
            code=DeploymentErrorEnum.CANT_JOIN, msg=errmsg.what
        )
        return False

    assert reply.type == MessageTypeEnum.WELCOME
    welcome = WelcomeMessageModel.parse_obj(reply.data)
    assert welcome.pubkey
    assert welcome.cephconf
    assert welcome.keyring

    self._state.mark_join()

    await self._prepare_node(
        disks.system,
        hostname,
        ntpaddr=None,
        pubkey=welcome.pubkey,
        keyring=welcome.keyring,
        cephconf=welcome.cephconf,
        containerconf=None,
        is_join=True,
        progress_cb=None,
    )

    readymsg = ReadyToAddMessageModel()
    await conn.send(
        MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg)
    )
    await conn.close()

    logger.debug("join > wait for host to be added")
    orch = Orchestrator(self._gstate.ceph_mgr)
    try:
        await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
    except asyncio.TimeoutError:
        # asyncio.wait_for raises asyncio.TimeoutError, which is only an
        # alias of the builtin TimeoutError from Python 3.11 onwards.
        logger.error("join > timeout waiting for host to be added")
        raise NodeCantJoinError("Host was not added to the cluster.")
    logger.debug("join > host added, continue")

    try:
        await self._assimilate_devices(hostname, disks.storage)
    except DeploymentError as e:
        raise NodeCantJoinError(e.message) from e

    self._state.mark_ready()
    return True