Exemplo n.º 1
0
async def get_pubkey() -> str:
    """Return the public key reported by the orchestrator.

    Raises:
        HTTPException: with status 500 and the underlying error text as
            detail when creating the orchestrator or fetching the key fails.
    """
    try:
        orch = Orchestrator()
        return orch.get_public_key()
    except Exception as e:
        # Chain the original exception so the root cause stays attached to
        # the HTTP error in tracebacks and logs.
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                            detail=str(e)) from e
Exemplo n.º 2
0
def get_hosts() -> List[HostModel]:
    """List the hosts known to the orchestrator.

    Returns:
        One ``HostModel`` per entry returned by ``Orchestrator.host_ls()``,
        carrying that entry's hostname and address.
    """
    known_hosts = Orchestrator().host_ls()
    return [
        HostModel(hostname=entry.hostname, address=entry.addr)
        for entry in known_hosts
    ]
Exemplo n.º 3
0
def get_devices(
    request: Request,
    _=Depends(jwt_auth_scheme)) -> Dict[str, HostsDevicesModel]:
    """Return the devices the orchestrator reports, grouped by host name.

    Args:
        request: incoming request; the ceph mgr handle is read from
            ``request.app.state.gstate``.
        _: authentication dependency; its value is not used here.

    Returns:
        Mapping of host name to a ``HostsDevicesModel`` holding the host's
        address, name and its devices.
    """
    orchestrator = Orchestrator(request.app.state.gstate.ceph_mgr)
    per_host: List[OrchDevicesPerHostModel] = orchestrator.devices_ls()

    result: Dict[str, HostsDevicesModel] = {}
    for entry in per_host:
        # translate each orchestrator device into the API-facing model;
        # the reported size is coerced to an int.
        entry_devices: List[DeviceModel] = [
            DeviceModel(
                available=item.available,
                device_id=item.device_id,
                model=item.sys_api.model,
                vendor=item.sys_api.vendor,
                human_readable_type=item.human_readable_type,
                size=int(item.sys_api.size),
                path=item.path,
                rejected_reasons=item.rejected_reasons,
            )
            for item in entry.devices
        ]
        result[entry.name] = HostsDevicesModel(
            address=entry.addr,
            hostname=entry.name,
            devices=entry_devices,
        )

    return result
Exemplo n.º 4
0
async def all_devices_assimilated() -> bool:
    """Return the orchestrator's answer to ``all_devices_assimilated()``.

    Raises:
        HTTPException: with status 500 and the underlying error text as
            detail when the orchestrator cannot be created or queried.
    """
    try:
        orch = Orchestrator()
        return orch.all_devices_assimilated()
    except Exception as e:
        # Chain the original exception so the root cause is not lost.
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                            detail=str(e)) from e
Exemplo n.º 5
0
def get_hosts(request: Request, _=Depends(jwt_auth_scheme)) -> List[HostModel]:
    """List the hosts known to the orchestrator.

    Args:
        request: incoming request; the ceph mgr handle is read from
            ``request.app.state.gstate``.
        _: authentication dependency; its value is not used here.

    Returns:
        One ``HostModel`` (hostname and address) per orchestrator host.
    """
    mgr = request.app.state.gstate.ceph_mgr
    listing = Orchestrator(mgr).host_ls()
    return [
        HostModel(hostname=item.hostname, address=item.addr)
        for item in listing
    ]
Exemplo n.º 6
0
    async def _handle_ready_to_add(self, conn: IncomingConnection,
                                   msg: ReadyToAddMessageModel) -> None:
        """Add a previously welcomed node to the orchestrator.

        If the connection's address is not in ``self._joining`` an error
        message is sent back and nothing else happens. Otherwise the node is
        added as an orchestrator host, the replicated ruleset is set through
        the mon, and the pool default size is adjusted. Failures of the
        first two steps are only logged.

        Args:
            conn: connection the message arrived on.
            msg: the ready-to-add message; its contents are not read here.
        """
        logger.debug(f"handle ready to add from {conn}")
        peer_addr: str = conn.address

        if peer_addr not in self._joining:
            logger.info(f"handle ready to add > unknown node {conn}")
            rejection = MessageModel(
                type=MessageTypeEnum.ERROR,
                data=ErrorMessageModel(
                    what="node not joining",
                    code=status.HTTP_428_PRECONDITION_REQUIRED,
                ),
            )
            await conn.send_msg(rejection)
            return

        joining_node: JoiningNodeModel = self._joining[peer_addr]
        logger.info("handle ready to add > "
                    f"hostname: {joining_node.hostname}, "
                    f"address: {joining_node.address}")

        orchestrator = Orchestrator(self.gstate.ceph_mgr)
        added = orchestrator.host_add(joining_node.hostname,
                                      joining_node.address)
        if not added:
            logger.error("handle ready > failed adding host to orch")

        # reset default crush ruleset, and adjust pools to use a multi-node
        # ruleset, spreading replicas across hosts rather than osds.
        ruleset_ok = self.gstate.ceph_mon.set_replicated_ruleset()
        if not ruleset_ok:
            logger.error(
                "handle ready to add > unable to set replicated ruleset")

        await self._set_pool_default_size()
Exemplo n.º 7
0
async def get_pubkey(request: Request, _=Depends(jwt_auth_scheme)) -> str:
    """Return the public key reported by the orchestrator.

    Args:
        request: incoming request; the ceph mgr handle is read from
            ``request.app.state.gstate``.
        _: authentication dependency; its value is not used here.

    Raises:
        HTTPException: with status 500 and the underlying error text as
            detail when creating the orchestrator or fetching the key fails.
    """
    try:
        orch = Orchestrator(request.app.state.gstate.ceph_mgr)
        return orch.get_public_key()
    except Exception as e:
        # Chain the original exception so the root cause is not lost.
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                            detail=str(e)) from e
Exemplo n.º 8
0
def get_hosts() -> HostsReplyModel:
    """Return the orchestrator's hosts wrapped in a ``HostsReplyModel``.

    Returns:
        A reply whose ``hosts`` list holds one ``HostModel`` (hostname and
        address) per host returned by ``Orchestrator.host_ls()``.
    """
    listing = Orchestrator().host_ls()
    reply: HostsReplyModel = HostsReplyModel(hosts=[])
    reply.hosts.extend(
        HostModel(hostname=item.hostname, address=item.addr)
        for item in listing
    )
    return reply
Exemplo n.º 9
0
    async def _handle_join(self, conn: IncomingConnection,
                           msg: JoinMessageModel) -> None:
        """Handle a join request received on ``conn``.

        Replies with an error message when the token does not match
        ``self._token`` or when the address/hostname is missing. Otherwise
        sends a welcome message carrying the orchestrator public key, the
        contents of ``/etc/ceph/ceph.conf`` and the admin keyring, and
        records the node in ``self._joining`` keyed by the connection
        address.

        Args:
            conn: connection the request arrived on; replies go through it.
            msg: the join request (token, address, hostname).
        """
        # NOTE(review): this logs the whole join message, which includes the
        # token the peer presented -- confirm that is acceptable.
        logger.debug(f"handle join {msg}")
        assert self._state is not None

        if msg.token != self._token:
            logger.info(f"handle join > bad token from {conn}")
            await conn.send_msg(
                MessageModel(
                    type=MessageTypeEnum.ERROR,
                    data=ErrorMessageModel(what="bad token",
                                           code=status.HTTP_401_UNAUTHORIZED),
                ))
            return

        if not msg.address or not msg.hostname:
            logger.info(f"handle join > missing address or host from {conn}")
            await conn.send_msg(
                MessageModel(
                    type=MessageTypeEnum.ERROR,
                    data=ErrorMessageModel(
                        what="missing address or hostname",
                        code=status.HTTP_400_BAD_REQUEST,
                    ),
                ))
            return

        orch = Orchestrator(self.gstate.ceph_mgr)
        pubkey: str = orch.get_public_key()

        cephconf_path: Path = Path("/etc/ceph/ceph.conf")
        keyring_path: Path = Path("/etc/ceph/ceph.client.admin.keyring")
        assert cephconf_path.exists()
        assert keyring_path.exists()

        cephconf: str = cephconf_path.read_text("utf-8")
        keyring: str = keyring_path.read_text("utf-8")
        assert len(cephconf) > 0
        assert len(keyring) > 0

        logger.debug(f"handle join > pubkey: {pubkey}")

        welcome = WelcomeMessageModel(pubkey=pubkey,
                                      cephconf=cephconf,
                                      keyring=keyring)
        try:
            # The welcome message carries the admin keyring; never write its
            # contents to the log. Identify the peer instead.
            logger.debug(f"handle join > send welcome to {conn}")
            await conn.send_msg(
                MessageModel(type=MessageTypeEnum.WELCOME,
                             data=welcome.dict()))
        except Exception as e:
            logger.error(f"handle join > error: {str(e)}")
            return

        logger.debug(f"handle join > welcome sent to {conn}")
        self._joining[conn.address] = JoiningNodeModel(address=msg.address,
                                                       hostname=msg.hostname)
Exemplo n.º 10
0
async def assimilate_devices() -> bool:
    """Ask the orchestrator to assimilate all devices.

    Returns:
        ``True`` when the request completed without raising; ``False`` when
        any exception occurred (the error text is logged).
    """
    try:
        Orchestrator().assimilate_all_devices()
    except Exception as exc:
        logger.error(str(exc))
        return False
    else:
        return True
Exemplo n.º 11
0
def test_device_ls(
    get_data_contents: Callable[[str, str], str],
    mocker: MockerFixture,
    gstate: GlobalState,
) -> None:
    """Check ``devices_ls()`` against the canned device listing.

    ``Orchestrator.call`` is replaced by a mock returning the parsed
    ``device_ls_not_available.json`` fixture; the first host of the result
    must be named ``asd``.
    """
    orchestrator = Orchestrator(gstate.ceph_mgr)

    fixture_text = get_data_contents(DATA_DIR, "device_ls_not_available.json")
    canned_reply = json.loads(fixture_text)
    orchestrator.call = mocker.MagicMock(return_value=canned_reply)

    listing: List[OrchDevicesPerHostModel] = orchestrator.devices_ls()
    first_host = listing[0]
    assert first_host.name == "asd"
Exemplo n.º 12
0
    def create(self, name: str) -> None:
        """Create a filesystem volume and schedule its MDS deployment.

        Issues ``fs volume create`` through the mgr, then asks the
        orchestrator to apply MDS for the same name.

        Args:
            name: name passed to the volume-create command and ``apply_mds``.

        Raises:
            CephFSError: when the mgr command fails with ``CephCommandError``.
        """
        request = {"prefix": "fs volume create", "name": name}
        try:
            # this is expected to be a silent command
            self.mgr.call(request)
        except CephCommandError as err:
            raise CephFSError(err) from err

        # schedule orchestrator to update the number of mds instances
        Orchestrator().apply_mds(name)
Exemplo n.º 13
0
        async def finish_bootstrap_cb(
            success: bool, error: Optional[str]
        ) -> None:
            """Finish deployment once the bootstrap step has reported back.

            Records a bootstrap failure in the state, forwards the result to
            ``post_bootstrap_cb``, waits up to 30 seconds for the host to be
            added to the orchestrator, assimilates devices and reports the
            final outcome through ``finisher``.

            Args:
                success: whether the bootstrap step succeeded.
                error: error text from the bootstrap step, if any.
            """
            if not success:
                logger.error(f"bootstrap finish error: {error}")
                assert self._state.bootstrapping
                if not error:
                    error = "unable to bootstrap"
                self._state.mark_error(
                    code=DeploymentErrorEnum.CANT_BOOTSTRAP, msg=error
                )
            await post_bootstrap_cb(success, error)
            # NOTE(review): a failed bootstrap still falls through to the
            # host wait below -- confirm whether post_bootstrap_cb is meant
            # to be the end of the failure path.

            try:
                orch = Orchestrator(self._gstate.ceph_mgr)
                logger.debug("deployment > wait for host to be added")
                await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
            except asyncio.TimeoutError:
                # asyncio.wait_for raises asyncio.TimeoutError, which is only
                # the builtin TimeoutError from Python 3.11 on; catching the
                # asyncio name works on every version.
                logger.error("deployment > timeout wait for host to be added")
                errmsg = "node not bootstrapped until timeout expired"
                self._state.mark_error(
                    code=DeploymentErrorEnum.CANT_BOOTSTRAP, msg=errmsg
                )
                await finisher(False, errmsg)
            else:
                # Only assimilate devices once the host is known; this also
                # keeps `finisher` from being invoked a second time after a
                # timeout has already been reported.
                try:
                    await _assimilate_devices()
                except DeploymentError as e:
                    logger.error("unable to assimilate devices")
                    logger.exception(e)
                    self._state.mark_error(
                        code=DeploymentErrorEnum.CANT_ASSIMILATE,
                        msg=e.message,
                    )
                    await finisher(False, e.message)
                else:
                    self._progress = ProgressEnum.DONE
                    await finisher(True, None)

            # By now, the KV store connection thread will have well and
            # truly found the cluster and connected to it.  Still, for the
            # sake of completeness, let's give it a kick here to make it
            # explicit.
            await self._gstate.store.ensure_connection()
Exemplo n.º 14
0
    async def _handle_ready_to_add(self, conn: IncomingConnection,
                                   msg: ReadyToAddMessageModel) -> None:
        """Add a previously welcomed node as an orchestrator host.

        When the connection's address is not in ``self._joining`` an error
        message is sent back; otherwise the node's hostname and address are
        handed to ``Orchestrator.host_add``. A failed add is only logged.

        Args:
            conn: connection the message arrived on.
            msg: the ready-to-add message; its contents are not read here.
        """
        logger.debug(f"handle ready to add from {conn}")
        peer_addr: str = conn.address

        if peer_addr not in self._joining:
            logger.info(f"handle ready to add > unknown node {conn}")
            failure = ErrorMessageModel(
                what="node not joining",
                code=status.HTTP_428_PRECONDITION_REQUIRED)
            await conn.send_msg(
                MessageModel(type=MessageTypeEnum.ERROR, data=failure))
            return

        candidate: JoiningNodeModel = self._joining[peer_addr]
        logger.info("handle ready to add > "
                    f"hostname: {candidate.hostname}, "
                    f"address: {candidate.address}")

        added = Orchestrator().host_add(candidate.hostname, candidate.address)
        if not added:
            logger.error("handle ready > failed adding host to orch")
Exemplo n.º 15
0
    async def probe(self) -> None:
        """Refresh the cached per-host OSD lists and per-OSD device entries.

        Combines the orchestrator's device listing with the mon's ``osd df``
        output. For every unavailable device that has logical volumes, each
        volume with an OSD id becomes a ``DeviceModel`` entry; utilization is
        then filled in for the OSDs that appear in ``osd df``. The results
        replace ``self._osds_per_host`` and ``self._osd_entries``. When
        either source is empty the cached values are left untouched.
        """

        logger.debug("probe devices")

        orch: Orchestrator = Orchestrator(self.ceph_mgr)
        mon: Mon = self.ceph_mon
        device_lst: List[OrchDevicesPerHostModel] = orch.devices_ls()
        osd_df: CephOSDDFModel = mon.osd_df()

        if not device_lst or not osd_df.nodes:
            logger.debug("probe > no devices to probe")
            return

        osds_per_host: Dict[str, List[int]] = {}
        osd_entries: Dict[int, DeviceModel] = {}
        for hostdevs in device_lst:
            host: str = hostdevs.name
            devs: List[VolumeDeviceModel] = hostdevs.devices

            osds: List[int] = []
            for dev in devs:
                if dev.available or not dev.lvs:
                    continue

                for lv in dev.lvs:
                    # Compare against None explicitly: OSD ids start at 0,
                    # and a plain truthiness check would drop osd.0.
                    if lv.osd_id is None:
                        # not a ceph lv
                        continue

                    osd_entries[lv.osd_id] = DeviceModel(
                        host=host,
                        osd_id=lv.osd_id,
                        path=dev.path,
                        rotational=dev.sys_api.rotational,
                        vendor=dev.sys_api.vendor,
                        model=dev.sys_api.model,
                    )
                    osds.append(lv.osd_id)

            osds_per_host[host] = osds

        for osd in osd_df.nodes:
            if osd.id not in osd_entries:
                continue

            osd_entries[osd.id].utilization = DeviceUtilizationModel(
                total_kb=osd.kb,
                avail_kb=osd.kb_avail,
                used_kb=osd.kb_used,
                utilization=osd.utilization,
            )

        # publish both maps only after they are fully built
        self._osds_per_host = osds_per_host
        self._osd_entries = osd_entries
Exemplo n.º 16
0
    async def _handle_join(self, conn: IncomingConnection,
                           msg: JoinMessageModel) -> None:
        """Handle a join request received on ``conn``.

        Replies with an error message when the token does not match
        ``self._token`` or when the address/hostname is missing. Otherwise
        sends a welcome message carrying the orchestrator public key and
        records the node in ``self._joining`` keyed by the connection
        address.

        Args:
            conn: connection the request arrived on; replies go through it.
            msg: the join request (token, address, hostname).
        """

        async def reject(what: str, code: int) -> None:
            # send an ERROR reply with the given description and status code
            await conn.send_msg(
                MessageModel(type=MessageTypeEnum.ERROR,
                             data=ErrorMessageModel(what=what, code=code)))

        logger.debug(f"handle join {msg}")
        assert self._state is not None

        if msg.token != self._token:
            logger.info(f"handle join > bad token from {conn}")
            await reject("bad token", status.HTTP_401_UNAUTHORIZED)
            return

        if not (msg.address and msg.hostname):
            logger.info(f"handle join > missing address or host from {conn}")
            await reject("missing address or hostname",
                         status.HTTP_400_BAD_REQUEST)
            return

        pubkey: str = Orchestrator().get_public_key()
        logger.debug(f"handle join > pubkey: {pubkey}")

        welcome = WelcomeMessageModel(pubkey=pubkey)
        try:
            logger.debug(f"handle join > send welcome: {welcome}")
            reply = MessageModel(type=MessageTypeEnum.WELCOME,
                                 data=welcome.dict())
            await conn.send_msg(reply)
        except Exception as exc:
            logger.error(f"handle join > error: {str(exc)}")
            return

        logger.debug(f"handle join > welcome sent: {welcome}")
        self._joining[conn.address] = JoiningNodeModel(
            address=msg.address, hostname=msg.hostname)
Exemplo n.º 17
0
    async def _assimilate_devices(self, hostname: str,
                                  devices: List[str]) -> None:
        """Assimilate ``devices`` on ``hostname`` and wait until it is done.

        Polls the orchestrator once per second until it reports the devices
        as assimilated; there is no upper bound on the wait.

        Args:
            hostname: host whose devices are to be assimilated; it must
                already be known to the orchestrator.
            devices: device identifiers handed to the orchestrator.

        Raises:
            DeploymentError: when the host is not part of the cluster, or
                when any orchestrator call fails (the original exception is
                attached as the cause).
        """
        try:
            orch = Orchestrator(self._gstate.ceph_mgr)
            if not orch.host_exists(hostname):
                raise DeploymentError("Host not part of cluster.")
            orch.assimilate_devices(hostname, devices)

            # poll until the orchestrator reports the devices as assimilated
            while not orch.devices_assimilated(hostname, devices):
                await asyncio.sleep(1.0)

        except DeploymentError:
            # already the right type; re-raise unchanged instead of wrapping
            # it in a second DeploymentError built from str(e).
            raise
        except Exception as e:
            raise DeploymentError(str(e)) from e
Exemplo n.º 18
0
def test_devices_assimilated(
    get_data_contents: Callable[[str, str], str],
    mocker: MockerFixture,
    gstate: GlobalState,
) -> None:
    """Check ``devices_assimilated()`` against two device listings.

    With the unmodified ``device_ls_not_available.json`` fixture the check
    for ``/dev/vdb`` and ``/dev/vdc`` on host ``asd`` must pass. After the
    second device of the first host is flagged as available, the check for
    ``/dev/vdc`` must fail.
    """
    def device_ls_gen():
        # yields the parsed fixture, then the same object again after its
        # second device has been marked available
        fixture_text = get_data_contents(DATA_DIR,
                                         "device_ls_not_available.json")
        listing = parse_obj_as(List[OrchDevicesPerHostModel],
                               json.loads(fixture_text))
        yield listing
        listing[0].devices[1].available = True
        yield listing

    from gravel.controllers.orch.orchestrator import Orchestrator

    orchestrator = Orchestrator(gstate.ceph_mgr)
    listings = device_ls_gen()

    untouched = next(listings)
    orchestrator.devices_ls = mocker.MagicMock(return_value=untouched)
    assert orchestrator.devices_assimilated("asd", ["/dev/vdb", "/dev/vdc"])

    modified = next(listings)
    orchestrator.devices_ls = mocker.MagicMock(return_value=modified)
    assert not orchestrator.devices_assimilated("asd", ["/dev/vdc"])
Exemplo n.º 19
0
 def get_target_size():
     """Return 2 when the orchestrator lists fewer than three hosts, else 3."""
     known_hosts: List[OrchHostListModel] = Orchestrator(
         self.gstate.ceph_mgr).host_ls()
     if len(known_hosts) < 3:
         return 2
     return 3
Exemplo n.º 20
0
    async def join(
        self,
        leader_address: str,
        token: str,
        uuid: UUID,
        hostname: str,
        address: str,
        disks: DeploymentDisksConfig,
    ) -> bool:
        """Join this node to the cluster led by ``leader_address``.

        Sends a join request over a websocket, and on a welcome reply
        creates the system disk, sets the hostname, appends the leader's
        public key to root's ``authorized_keys``, writes ``ceph.conf`` and
        the admin keyring, sets the NTP address read from the KV store,
        tells the leader the node is ready, waits up to 30 seconds for the
        host to be added to the orchestrator, and assimilates the storage
        disks.

        Args:
            leader_address: host[:port] of the leader's websocket endpoint.
            token: token presented to the leader.
            uuid: this node's UUID, sent in the join message.
            hostname: hostname to set and to register with the cluster.
            address: this node's address, sent in the join message.
            disks: system disk and storage disks to use.

        Returns:
            ``True`` once the node is marked ready; ``False`` when the
            leader answered with an error message (the state is marked with
            ``CANT_JOIN``).

        Raises:
            NodeBootstrappingError, NodeHasBeenDeployedError,
            NodeAlreadyJoiningError, NodeHasJoinedError: when the node is
                already in the corresponding stage.
            NodeCantJoinError: when creating the system disk fails, the
                host is not added within the timeout, or device
                assimilation fails.
        """
        # NOTE(review): the token is written to the debug log -- confirm
        # that is acceptable.
        logger.debug(f"join > with leader {leader_address}, token: {token}")

        assert self._state
        assert hostname
        assert address

        if self._state.bootstrapping:
            raise NodeBootstrappingError()
        elif self._state.deployed:
            raise NodeHasBeenDeployedError()
        elif self._state.joining:
            raise NodeAlreadyJoiningError()
        elif self._state.ready:
            raise NodeHasJoinedError()
        assert self._state.nostage

        uri: str = f"ws://{leader_address}/api/nodes/ws"
        conn = await self._connmgr.connect(uri)
        logger.debug(f"join > conn: {conn}")

        joinmsg = JoinMessageModel(
            uuid=uuid, hostname=hostname, address=address, token=token
        )
        msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
        await conn.send(msg)

        reply: MessageModel = await conn.receive()
        logger.debug(f"join > recv: {reply}")
        if reply.type == MessageTypeEnum.ERROR:
            errmsg = ErrorMessageModel.parse_obj(reply.data)
            logger.error(f"join > error: {errmsg.what}")
            await conn.close()
            self._state.mark_error(
                code=DeploymentErrorEnum.CANT_JOIN, msg=errmsg.what
            )
            return False

        assert reply.type == MessageTypeEnum.WELCOME
        welcome = WelcomeMessageModel.parse_obj(reply.data)
        assert welcome.pubkey
        assert welcome.cephconf
        assert welcome.keyring

        # create system disk after we are certain we are joining.
        # ensure all state writes happen only after the disk has been created.
        systemdisk = SystemDisk(self._gstate)
        try:
            await systemdisk.create(disks.system)
            await systemdisk.enable()
        except GravelError as e:
            raise NodeCantJoinError(e.message) from e

        self._state.mark_join()
        await self._set_hostname(hostname)

        authorized_keys: Path = Path("/root/.ssh/authorized_keys")
        if not authorized_keys.parent.exists():
            authorized_keys.parent.mkdir(0o700)
        # NOTE(review): the key is appended as-is; this presumably relies on
        # the pubkey ending with a newline -- confirm.
        with authorized_keys.open("a") as fd:
            fd.writelines([welcome.pubkey])
            logger.debug(f"join > wrote pubkey to {authorized_keys}")

        cephconf_path: Path = Path("/etc/ceph/ceph.conf")
        keyring_path: Path = Path("/etc/ceph/ceph.client.admin.keyring")
        if not cephconf_path.parent.exists():
            cephconf_path.parent.mkdir(0o755)
        cephconf_path.write_text(welcome.cephconf)
        keyring_path.write_text(welcome.keyring)
        keyring_path.chmod(0o600)
        cephconf_path.chmod(0o644)

        # We've got ceph.conf and the admin keyring now, kick the kvstore
        # to get a connection.
        await self._gstate.store.ensure_connection()

        # get NTP address
        ntp_addr = await self._gstate.store.get("/nodes/ntp_addr")
        assert ntp_addr
        await self._set_ntp_addr(ntp_addr)

        readymsg = ReadyToAddMessageModel()
        await conn.send(
            MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg)
        )
        await conn.close()

        logger.debug("join > wait for host to be added")
        orch = Orchestrator(self._gstate.ceph_mgr)
        try:
            await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
        except asyncio.TimeoutError:
            # asyncio.wait_for raises asyncio.TimeoutError, which is only the
            # builtin TimeoutError from Python 3.11 on; catching the asyncio
            # name works on every version.
            logger.error("join > timeout waiting for host to be added")
            raise NodeCantJoinError("host was not added to the cluster")
        logger.debug("join > host added, continue")

        try:
            await self._assimilate_devices(hostname, disks.storage)
        except DeploymentError as e:
            raise NodeCantJoinError(e.message) from e

        self._state.mark_ready()
        return True
Exemplo n.º 21
0
    async def join(
        self,
        leader_address: str,
        token: str,
        uuid: UUID,
        hostname: str,
        address: str,
        disks: DeploymentDisksConfig,
    ) -> bool:
        """Join this node to the cluster led by ``leader_address``.

        Sends a join request over a websocket, and on a welcome reply marks
        the node as joining, prepares it through ``_prepare_node`` with the
        received public key, keyring and ceph.conf, tells the leader the
        node is ready, waits up to 30 seconds for the host to be added to
        the orchestrator, and assimilates the storage disks.

        Args:
            leader_address: host[:port] of the leader's websocket endpoint.
            token: token presented to the leader.
            uuid: this node's UUID, sent in the join message.
            hostname: hostname to register with the cluster.
            address: this node's address, sent in the join message.
            disks: system disk and storage disks to use.

        Returns:
            ``True`` once the node is marked ready; ``False`` when the
            leader answered with an error message (the state is marked with
            ``CANT_JOIN``).

        Raises:
            NodeBootstrappingError, NodeHasBeenDeployedError,
            NodeAlreadyJoiningError, NodeHasJoinedError: when the node is
                already in the corresponding stage.
            NodeCantJoinError: when the host is not added within the
                timeout or device assimilation fails.
        """
        # NOTE(review): the token is written to the debug log -- confirm
        # that is acceptable.
        logger.debug(f"join > with leader {leader_address}, token: {token}")

        assert self._state
        assert hostname
        assert address

        if self._state.bootstrapping:
            raise NodeBootstrappingError()
        elif self._state.deployed:
            raise NodeHasBeenDeployedError()
        elif self._state.joining:
            raise NodeAlreadyJoiningError()
        elif self._state.ready:
            raise NodeHasJoinedError()
        assert self._state.nostage

        uri: str = f"ws://{leader_address}/api/nodes/ws"
        conn = await self._connmgr.connect(uri)
        logger.debug(f"join > conn: {conn}")

        joinmsg = JoinMessageModel(uuid=uuid,
                                   hostname=hostname,
                                   address=address,
                                   token=token)
        msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
        await conn.send(msg)

        reply: MessageModel = await conn.receive()
        logger.debug(f"join > recv: {reply}")
        if reply.type == MessageTypeEnum.ERROR:
            errmsg = ErrorMessageModel.parse_obj(reply.data)
            logger.error(f"join > error: {errmsg.what}")
            await conn.close()
            self._state.mark_error(code=DeploymentErrorEnum.CANT_JOIN,
                                   msg=errmsg.what)
            return False

        assert reply.type == MessageTypeEnum.WELCOME
        welcome = WelcomeMessageModel.parse_obj(reply.data)
        assert welcome.pubkey
        assert welcome.cephconf
        assert welcome.keyring

        self._state.mark_join()
        await self._prepare_node(
            disks.system,
            hostname,
            ntpaddr=None,
            pubkey=welcome.pubkey,
            keyring=welcome.keyring,
            cephconf=welcome.cephconf,
            containerconf=None,
            is_join=True,
            progress_cb=None,
        )

        readymsg = ReadyToAddMessageModel()
        await conn.send(
            MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg))
        await conn.close()

        logger.debug("join > wait for host to be added")
        orch = Orchestrator(self._gstate.ceph_mgr)
        try:
            await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
        except asyncio.TimeoutError:
            # asyncio.wait_for raises asyncio.TimeoutError, which is only the
            # builtin TimeoutError from Python 3.11 on; catching the asyncio
            # name works on every version.
            logger.error("join > timeout waiting for host to be added")
            raise NodeCantJoinError("Host was not added to the cluster.")
        logger.debug("join > host added, continue")

        try:
            await self._assimilate_devices(hostname, disks.storage)
        except DeploymentError as e:
            raise NodeCantJoinError(e.message) from e

        self._state.mark_ready()
        return True