Example #1
class ConfigServer:
    def __init__(self, app_ctx: Mapping[str, Any], etcd_addr: HostPortPair,
                 etcd_user: Optional[str], etcd_password: Optional[str],
                 namespace: str) -> None:
        # WARNING: importing etcd3/grpc must be done after forks.
        from ai.backend.common.etcd import AsyncEtcd
        self.context = app_ctx
        credentials = None
        if etcd_user:
            credentials = {
                'user': etcd_user,
                'password': etcd_password,
            }
        scope_prefix_map = {
            ConfigScopes.GLOBAL: '',
            # TODO: provide a way to specify other scope prefixes
        }
        self.etcd = AsyncEtcd(etcd_addr,
                              namespace,
                              scope_prefix_map,
                              credentials=credentials)

    async def close(self) -> None:
        await self.etcd.close()

    async def get(self, key: str, allow_null: bool = True) -> Optional[str]:
        value = await self.etcd.get(key)
        if value is None:
            value = config_defaults.get(key, None)
        if not allow_null and value is None:
            raise ServerMisconfiguredError(
                'A required etcd config is missing.', key)
        return value

    async def register_myself(self) -> None:
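        # Advertise this manager instance as "up" under the nodes/manager prefix in etcd.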
        instance_id = await get_instance_id()
        manager_info = {
            f'nodes/manager/{instance_id}': 'up',
        }
        await self.etcd.put_dict(manager_info)

    async def deregister_myself(self) -> None:
        instance_id = await get_instance_id()
        await self.etcd.delete_prefix(f'nodes/manager/{instance_id}')

    async def update_aliases_from_file(self, file: Path) -> None:
        log.info('Updating image aliases from "{0}"', file)
        try:
            with open(file, 'r', encoding='utf-8') as fp:
                data = yaml.safe_load(fp)
        except IOError:
            log.error('Cannot open "{0}".', file)
            return
        for item in data['aliases']:
            alias = item[0]
            target = item[1]
            await self.etcd.put(f'images/_aliases/{etcd_quote(alias)}', target)
            print(f'{alias} -> {target}')
        log.info('Done.')

    async def _scan_reverse_aliases(self) -> Mapping[str, List[str]]:
        aliases = await self.etcd.get_prefix('images/_aliases')
        result: DefaultDict[str, List[str]] = defaultdict(list)
        for key, value in aliases.items():
            result[value].append(etcd_unquote(key))
        return dict(result)

    async def _parse_image(self, image_ref, item, reverse_aliases):
        installed = (await self.context['redis_image'].scard(
            image_ref.canonical)) > 0
        installed_agents = await self.context['redis_image'].smembers(
            image_ref.canonical)

        res_limits = []
        for slot_key, slot_range in item['resource'].items():
            min_value = slot_range.get('min')
            if min_value is None:
                min_value = Decimal(0)
            max_value = slot_range.get('max')
            if max_value is None:
                max_value = Decimal('Infinity')
            res_limits.append({
                'key': slot_key,
                'min': min_value,
                'max': max_value,
            })

        accels = item.get('accelerators')
        if accels is None:
            accels = []
        else:
            accels = accels.split(',')

        return {
            'name': image_ref.name,
            'humanized_name': image_ref.name,  # TODO: implement
            'tag': image_ref.tag,
            'registry': image_ref.registry,
            'digest': item[''],
            'labels': item.get('labels', {}),
            'aliases': reverse_aliases.get(image_ref.canonical, []),
            'size_bytes': item.get('size_bytes', 0),
            'resource_limits': res_limits,
            'supported_accelerators': accels,
            'installed': installed,
            'installed_agents': installed_agents,
        }

    async def _check_image(self, reference: str) -> ImageRef:
        known_registries = await get_known_registries(self.etcd)
        ref = ImageRef(reference, known_registries)
        digest = await self.etcd.get(ref.tag_path)
        if digest is None:
            raise UnknownImageReference(reference)
        return ref

    async def inspect_image(
            self, reference: Union[str, ImageRef]) -> Mapping[str, Any]:
        if isinstance(reference, str):
            ref = await ImageRef.resolve_alias(reference, self.etcd)
        else:
            ref = reference
        reverse_aliases = await self._scan_reverse_aliases()
        image_info = await self.etcd.get_prefix(ref.tag_path)
        if not image_info:
            raise UnknownImageReference(reference)
        return await self._parse_image(ref, image_info, reverse_aliases)

    async def forget_image(self, reference: Union[str, ImageRef]) -> None:
        if isinstance(reference, str):
            ref = await ImageRef.resolve_alias(reference, self.etcd)
        else:
            ref = reference
        await self.etcd.delete_prefix(ref.tag_path)

    async def list_images(self) -> Sequence[Mapping[str, Any]]:
        known_registries = await get_known_registries(self.etcd)
        reverse_aliases = await self._scan_reverse_aliases()
        data = await self.etcd.get_prefix('images')
        coros = []
        for registry, images in data.items():
            if registry == '_aliases':
                continue
            for image, tags in images.items():
                if image == '':
                    continue
                if tags == '1':
                    continue
                for tag, image_info in tags.items():
                    if tag == '':
                        continue
                    raw_ref = f'{etcd_unquote(registry)}/{etcd_unquote(image)}:{tag}'
                    ref = ImageRef(raw_ref, known_registries)
                    coros.append(
                        self._parse_image(ref, image_info, reverse_aliases))
        result = await asyncio.gather(*coros)
        return result

    async def set_image_resource_limit(self, reference: str, slot_type: str,
                                       value_range: Tuple[Optional[Decimal],
                                                          Optional[Decimal]]):
        ref = await self._check_image(reference)
        if value_range[0] is not None:
            await self.etcd.put(f'{ref.tag_path}/resource/{slot_type}/min',
                                str(value_range[0]))
        if value_range[1] is not None:
            await self.etcd.put(f'{ref.tag_path}/resource/{slot_type}/max',
                                str(value_range[1]))

    async def _rescan_images_single_registry(
        self,
        registry_name: str,
        registry_info: Mapping[str, str],
        reporter: Optional[ProgressReporter] = None,
    ) -> None:
        all_updates = {}
        base_hdrs = {
            'Accept': 'application/vnd.docker.distribution.manifest.v2+json',
        }
        registry_url = yarl.URL(registry_info[''])
        registry_type = registry_info.get('type', 'docker')
        registry_project = registry_info.get('project')
        credentials = {}
        username = registry_info.get('username')
        if username is not None:
            credentials['username'] = username
        password = registry_info.get('password')
        if password is not None:
            credentials['password'] = password

        non_kernel_words = (
            'common-',
            'commons-',
            'base-',
            'krunner',
            'builder',
            'backendai',
            'geofront',
        )

        async def _scan_image(sess, image):
            rqst_args = await registry_login(sess, registry_url, credentials,
                                             f'repository:{image}:pull')
            rqst_args['headers'].update(**base_hdrs)
            tags = []
            tag_list_url = (registry_url / f'v2/{image}/tags/list').with_query(
                {'n': '10'}, )
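            # The registry v2 API paginates tag listings; keep following the
            # "next" links from the Link response headers until they run out.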
            while tag_list_url is not None:
                async with sess.get(tag_list_url, **rqst_args) as resp:
                    data = json.loads(await resp.read())
                    if 'tags' in data:
                        # sometimes there are dangling image names in the hub.
                        tags.extend(data['tags'])
                    tag_list_url = None
                    next_page_link = resp.links.get('next')
                    if next_page_link:
                        next_page_url = next_page_link['url']
                        tag_list_url = (registry_url.with_path(
                            next_page_url.path).with_query(
                                next_page_url.query))
            scheduler = await aiojobs.create_scheduler(limit=4)
            try:
                if reporter:
                    reporter.total_progress += len(tags)
                jobs = await asyncio.gather(*[
                    scheduler.spawn(_scan_tag(sess, rqst_args, image, tag))
                    for tag in tags
                ])
                await asyncio.gather(*[job.wait() for job in jobs])
            finally:
                await scheduler.close()

        async def _scan_tag(sess, rqst_args, image, tag):
            config_digest = None
            labels = {}
            skip_reason = None

            try:
                async with sess.get(
                        registry_url / f'v2/{image}/manifests/{tag}',
                        **rqst_args) as resp:
                    if resp.status == 404:
                        # ignore missing tags
                        # (may occur after deleting an image from the docker hub)
                        skip_reason = "missing/deleted"
                        return
                    resp.raise_for_status()
                    data = await resp.json()
                    config_digest = data['config']['digest']
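                    # Approximate the image size as the sum of layer sizes plus the config blob size.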
                    size_bytes = (sum(layer['size']
                                      for layer in data['layers']) +
                                  data['config']['size'])

                async with sess.get(
                        registry_url / f'v2/{image}/blobs/{config_digest}',
                        **rqst_args) as resp:
                    # content-type may not be json...
                    resp.raise_for_status()
                    data = json.loads(await resp.read())
                    if 'container_config' in data:
                        raw_labels = data['container_config']['Labels']
                        if raw_labels:
                            labels.update(raw_labels)
                    else:
                        raw_labels = data['config']['Labels']
                        if raw_labels:
                            labels.update(raw_labels)
                if 'ai.backend.kernelspec' not in labels:
                    # Skip non-Backend.AI kernel images
                    skip_reason = "missing kernelspec"
                    return
                if not (MIN_KERNELSPEC <= int(labels['ai.backend.kernelspec'])
                        <= MAX_KERNELSPEC):
                    # Skip unsupported kernelspec images
                    skip_reason = "unsupported kernelspec"
                    return

                updates = {}
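                # etcd key layout: images/<registry>/<image> is set to '1' as a presence marker,
                # while images/<registry>/<image>/<tag> stores the manifest's config digest.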
                updates[f'images/{etcd_quote(registry_name)}/'
                        f'{etcd_quote(image)}'] = '1'
                tag_prefix = f'images/{etcd_quote(registry_name)}/' \
                             f'{etcd_quote(image)}/{tag}'
                updates[tag_prefix] = config_digest
                updates[f'{tag_prefix}/size_bytes'] = size_bytes
                for k, v in labels.items():
                    updates[f'{tag_prefix}/labels/{k}'] = v

                accels = labels.get('ai.backend.accelerators')
                if accels:
                    updates[f'{tag_prefix}/accels'] = accels

                res_prefix = 'ai.backend.resource.min.'
                for k, v in filter(lambda pair: pair[0].startswith(res_prefix),
                                   labels.items()):
                    res_key = k[len(res_prefix):]
                    updates[f'{tag_prefix}/resource/{res_key}/min'] = v
                all_updates.update(updates)
            finally:
                if skip_reason:
                    log.warning('Skipped image - {}:{} ({})', image, tag,
                                skip_reason)
                    progress_msg = f"Skipped {image}:{tag} ({skip_reason})"
                else:
                    log.info('Updated image - {0}:{1}', image, tag)
                    progress_msg = f"Updated {image}:{tag}"
                if reporter:
                    await reporter.update(1, message=progress_msg)

        ssl_ctx = None  # default
        app_config = self.context.get('config')
        if app_config is not None and not app_config['docker-registry'][
                'ssl-verify']:
            ssl_ctx = False
        connector = aiohttp.TCPConnector(ssl=ssl_ctx)
        async with aiohttp.ClientSession(connector=connector) as sess:
            images: List[str] = []
            if registry_url.host is not None and registry_url.host.endswith(
                    '.docker.io'):
                # We need some special treatment for the Docker Hub.
                params = {'page_size': '30'}
                username = await self.etcd.get(
                    f'config/docker/registry/{etcd_quote(registry_name)}/username'
                )
                hub_url = yarl.URL('https://hub.docker.com')
                repo_list_url: Optional[yarl.URL]
                repo_list_url = hub_url / f'v2/repositories/{username}/'
                while repo_list_url is not None:
                    async with sess.get(repo_list_url, params=params) as resp:
                        if resp.status == 200:
                            data = await resp.json()
                            images.extend(
                                f"{username}/{item['name']}"
                                for item in data['results']
                                # a little optimization to ignore legacies
                                if not item['name'].startswith('kernel-'))
                        else:
                            log.error(
                                'Failed to fetch repository list from {0} '
                                '(status={1})', repo_list_url, resp.status)
                            break
                    repo_list_url = None
                    next_page_link = data.get('next', None)
                    if next_page_link:
                        next_page_url = yarl.URL(next_page_link)
                        repo_list_url = (hub_url.with_path(
                            next_page_url.path).with_query(
                                next_page_url.query))
            elif registry_type == 'docker':
                # In other cases, try the catalog search.
                rqst_args = await registry_login(sess, registry_url,
                                                 credentials,
                                                 'registry:catalog:*')
                catalog_url: Optional[yarl.URL]
                catalog_url = (registry_url / 'v2/_catalog').with_query(
                    {'n': '30'})
                while catalog_url is not None:
                    async with sess.get(catalog_url, **rqst_args) as resp:
                        if resp.status == 200:
                            data = json.loads(await resp.read())
                            images.extend(data['repositories'])
                            log.debug('found {} repositories', len(images))
                        else:
                            log.warning(
                                'Docker registry {0} does not allow/support '
                                'catalog search. (status={1})', registry_url,
                                resp.status)
                            break
                        catalog_url = None
                        next_page_link = resp.links.get('next')
                        if next_page_link:
                            next_page_url = next_page_link['url']
                            catalog_url = (registry_url.with_path(
                                next_page_url.path).with_query(
                                    next_page_url.query))
            elif registry_type == 'harbor':
                if credentials:
                    rqst_args = {
                        'auth':
                        aiohttp.BasicAuth(credentials['username'],
                                          credentials['password'])
                    }
                else:
                    rqst_args = {}
                project_list_url: Optional[yarl.URL]
                project_list_url = (registry_url / 'api/projects').with_query(
                    {'page_size': '30'})
                project_id = None
                while project_list_url is not None:
                    async with sess.get(project_list_url, **rqst_args) as resp:
                        projects = await resp.json()
                        for item in projects:
                            if item['name'] == registry_project:
                                project_id = item['project_id']
                                break
                        project_list_url = None
                        next_page_link = resp.links.get('next')
                        if next_page_link:
                            next_page_url = next_page_link['url']
                            project_list_url = (registry_url.with_path(
                                next_page_url.path).with_query(
                                    next_page_url.query))
                if project_id is None:
                    log.warning('The configured project "{0}" was not found.',
                                registry_project)
                    return
                repo_list_url = (registry_url / 'api/repositories').with_query(
                    {
                        'project_id': project_id,
                        'page_size': '30'
                    })
                while repo_list_url is not None:
                    async with sess.get(repo_list_url, **rqst_args) as resp:
                        items = await resp.json()
                        repos = [item['name'] for item in items]
                        images.extend(repos)
                        repo_list_url = None
                        next_page_link = resp.links.get('next')
                        if next_page_link:
                            next_page_url = next_page_link['url']
                            repo_list_url = (registry_url.with_path(
                                next_page_url.path).with_query(
                                    next_page_url.query))
            else:
                log.error('Unsupported registry type: {0}', registry_type)
                return

            scheduler = await aiojobs.create_scheduler(limit=4)
            try:
                spawn_tasks = [
                    scheduler.spawn(_scan_image(sess, image))
                    for image in images if not any(
                        (w in image)
                        for w in non_kernel_words)  # skip non-kernel images
                ]
                if spawn_tasks:
                    fetch_jobs = await asyncio.gather(*spawn_tasks)
                    await asyncio.gather(*[job.wait() for job in fetch_jobs])
            finally:
                await scheduler.close()

        if not all_updates:
            log.info('No images found in registry {0}', registry_url)
            return
        for kvlist in chunked(sorted(all_updates.items()), 16):
            await self.etcd.put_dict(dict(kvlist))

    async def rescan_images(self,
                            registry: Optional[str] = None,
                            *,
                            reporter: Optional[ProgressReporter] = None) -> None:
        if registry is None:
            registries = []
            data = await self.etcd.get_prefix('config/docker/registry')
            for key, val in data.items():
                if key:
                    registries.append(etcd_unquote(key))
        else:
            registries = [registry]
        coros = []
        for registry in registries:
            log.info('Scanning kernel images from the registry "{0}"',
                     registry)
            registry_info = await self.etcd.get_prefix(
                f'config/docker/registry/{etcd_quote(registry)}')
            if not registry_info:
                log.error('Unknown registry: "{0}"', registry)
                continue
            coros.append(
                self._rescan_images_single_registry(registry, registry_info,
                                                    reporter))
        await asyncio.gather(*coros)
        # TODO: delete images removed from registry?

    async def alias(self, alias: str, target: str) -> None:
        await self.etcd.put(f'images/_aliases/{etcd_quote(alias)}', target)

    async def dealias(self, alias: str) -> None:
        await self.etcd.delete(f'images/_aliases/{etcd_quote(alias)}')

    async def update_volumes_from_file(self, file: Path) -> None:
        log.info('Updating network volumes from "{0}"', file)
        try:
            with open(file, 'r', encoding='utf-8') as fp:
                data = yaml.safe_load(fp)
        except IOError:
            log.error('Cannot open "{0}".', file)
            return
        for item in data['volumes']:
            name = item['name']
            updates = {
                f'volumes/{name}/mount/{k}': v
                for k, v in item['mount'].items()
            }
            await self.etcd.put_dict(updates)
        log.info('done')

    async def update_resource_slots(
        self,
        slot_key_and_units: Mapping[SlotName, SlotTypes],
    ) -> None:
        updates = {}
        known_slots = await self.get_resource_slots()
        for k, v in slot_key_and_units.items():
            if k not in known_slots or v != known_slots[k]:
                updates[f'config/resource_slots/{k}'] = v.value
        if updates:
            await self.etcd.put_dict(updates)

    async def update_manager_status(self, status) -> None:
        await self.etcd.put('manager/status', status.value)
        self.get_manager_status.cache_clear()

    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def _get_resource_slots(self):
        raw_data = await self.etcd.get_prefix_dict('config/resource_slots')
        return {SlotName(k): SlotTypes(v) for k, v in raw_data.items()}

    async def get_resource_slots(self) -> Mapping[SlotName, SlotTypes]:
        '''
        Returns the system-wide known resource slots and their units.
        '''
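        # Cache the merged slot map in a context variable so repeated lookups
        # within the same context reuse it.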
        try:
            ret = current_resource_slots.get()
        except LookupError:
            configured_slots = await self._get_resource_slots()
            ret = {**INTRINSIC_SLOTS, **configured_slots}
            current_resource_slots.set(ret)
        return ret

    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def _get_vfolder_types(self):
        return await self.etcd.get_prefix_dict('volumes/_types')

    async def get_vfolder_types(self) -> Sequence[str]:
        '''
        Returns the currently configured vfolder types: "user", "group", or both.
        If none is configured, the "user" type is implicitly assumed.
        '''
        try:
            ret = current_vfolder_types.get()
        except LookupError:
            vf_types = await self._get_vfolder_types()
            if not vf_types:
                vf_types = {'user': ''}
            ret = list(vf_types.keys())
            current_vfolder_types.set(ret)
        return ret

    @aiotools.lru_cache(maxsize=1, expire_after=5.0)
    async def get_manager_nodes_info(self):
        return await self.etcd.get_prefix_dict('nodes/manager')

    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def get_manager_status(self):
        status = await self.etcd.get('manager/status')
        if status is None:
            return None
        return ManagerStatus(status)

    async def watch_manager_status(self):
        async with aiotools.aclosing(
                self.etcd.watch('manager/status')) as agen:
            async for ev in agen:
                yield ev

    # TODO: refactor using contextvars in Python 3.7 so that the result is cached
    #       on a per-request basis.
    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def get_allowed_origins(self):
        return await self.get('config/api/allow-origins')

    # TODO: refactor using contextvars in Python 3.7 so that the result is cached
    #       on a per-request basis.
    @aiotools.lru_cache(expire_after=60.0)
    async def get_image_slot_ranges(self, image_ref: ImageRef):
        '''
        Returns the minimum and maximum ResourceSlot values.
        All slot values are converted and normalized to Decimal.
        '''
        data = await self.etcd.get_prefix_dict(image_ref.tag_path)
        slot_units = await self.get_resource_slots()
        min_slot = ResourceSlot()
        max_slot = ResourceSlot()

        for slot_key, slot_range in data['resource'].items():
            slot_unit = slot_units.get(slot_key)
            if slot_unit is None:
                # ignore unknown slots
                continue
            min_value = slot_range.get('min')
            if min_value is None:
                min_value = Decimal(0)
            max_value = slot_range.get('max')
            if max_value is None:
                max_value = Decimal('Infinity')
            if slot_unit == 'bytes':
                if not isinstance(min_value, Decimal):
                    min_value = BinarySize.from_str(min_value)
                if not isinstance(max_value, Decimal):
                    max_value = BinarySize.from_str(max_value)
            else:
                if not isinstance(min_value, Decimal):
                    min_value = Decimal(min_value)
                if not isinstance(max_value, Decimal):
                    max_value = Decimal(max_value)
            min_slot[slot_key] = min_value
            max_slot[slot_key] = max_value

        # fill missing
        for slot_key in slot_units.keys():
            if slot_key not in min_slot:
                min_slot[slot_key] = Decimal(0)
            if slot_key not in max_slot:
                max_slot[slot_key] = Decimal('Infinity')

        return min_slot, max_slot
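
A minimal usage sketch for the ConfigServer above, assuming an asyncio entry point, a placeholder app_ctx, and that HostPortPair accepts a host and port; the etcd address and namespace are illustrative values only, and the module's existing imports are reused:

async def _example_config_server_usage() -> None:
    # Hypothetical wiring; adjust app_ctx and the etcd address for a real deployment.
    config_server = ConfigServer(
        app_ctx={'config': None},
        etcd_addr=HostPortPair('127.0.0.1', 2379),  # assumed constructor signature
        etcd_user=None,
        etcd_password=None,
        namespace='local',
    )
    try:
        await config_server.register_myself()
        value = await config_server.get('config/api/allow-origins')
        print('allow-origins:', value)
    finally:
        await config_server.deregister_myself()
        await config_server.close()

# asyncio.run(_example_config_server_usage())
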
Example #2
class SharedConfig(AbstractConfig):
    def __init__(self, app_ctx: Mapping[str, Any], etcd_addr: HostPortPair,
                 etcd_user: Optional[str], etcd_password: Optional[str],
                 namespace: str) -> None:
        # WARNING: importing etcd3/grpc must be done after forks.
        super().__init__()
        self.context = app_ctx
        credentials = None
        if etcd_user:
            credentials = {
                'user': etcd_user,
                'password': etcd_password,
            }
        scope_prefix_map = {
            ConfigScopes.GLOBAL: '',
            # TODO: provide a way to specify other scope prefixes
        }
        self.etcd = AsyncEtcd(etcd_addr,
                              namespace,
                              scope_prefix_map,
                              credentials=credentials)

    async def close(self) -> None:
        await self.etcd.close()

    async def reload(self) -> None:
        raw_cfg = await self.etcd.get_prefix('config')
        try:
            cfg = shared_config_iv.check(raw_cfg)
        except config.ConfigurationError as e:
            print('Validation of shared etcd configuration has failed:',
                  file=sys.stderr)
            print(pformat(e.invalid_data), file=sys.stderr)
            raise click.Abort()
        else:
            self.data = cfg

    def __hash__(self) -> int:
        # When used as a dict key, we don't care about our contents;
        # just treat it like an opaque object.
        return hash(id(self))

    async def get_raw(self,
                      key: str,
                      allow_null: bool = True) -> Optional[str]:
        value = await self.etcd.get(key)
        if value is None:
            value = shared_config_defaults.get(key, None)
        if not allow_null and value is None:
            raise ServerMisconfiguredError(
                'A required etcd config is missing.', key)
        return value

    async def register_myself(self) -> None:
        instance_id = await get_instance_id()
        manager_info = {
            f'nodes/manager/{instance_id}': 'up',
        }
        await self.etcd.put_dict(manager_info)

    async def deregister_myself(self) -> None:
        instance_id = await get_instance_id()
        await self.etcd.delete_prefix(f'nodes/manager/{instance_id}')

    async def update_aliases_from_file(self, file: Path) -> None:
        log.info('Updating image aliases from "{0}"', file)
        try:
            with open(file, 'r', encoding='utf-8') as fp:
                data = yaml.safe_load(fp)
        except IOError:
            log.error('Cannot open "{0}".', file)
            return
        for item in data['aliases']:
            alias = item[0]
            target = item[1]
            await self.etcd.put(f'images/_aliases/{etcd_quote(alias)}', target)
            print(f'{alias} -> {target}')
        log.info('Done.')

    async def _scan_reverse_aliases(self) -> Mapping[str, List[str]]:
        aliases = await self.etcd.get_prefix('images/_aliases')
        result: DefaultDict[str, List[str]] = defaultdict(list)
        for key, value in aliases.items():
            result[value].append(etcd_unquote(key))
        return dict(result)

    async def _parse_image(self, image_ref, item, reverse_aliases):
        installed = (await self.context['redis_image'].scard(
            image_ref.canonical)) > 0
        installed_agents = await self.context['redis_image'].smembers(
            image_ref.canonical)

        res_limits = []
        for slot_key, slot_range in item['resource'].items():
            min_value = slot_range.get('min')
            if min_value is None:
                min_value = Decimal(0)
            max_value = slot_range.get('max')
            if max_value is None:
                max_value = Decimal('Infinity')
            res_limits.append({
                'key': slot_key,
                'min': min_value,
                'max': max_value,
            })

        accels = item.get('accelerators')
        if accels is None:
            accels = []
        else:
            accels = accels.split(',')

        return {
            'name': image_ref.name,
            'humanized_name': image_ref.name,  # TODO: implement
            'tag': image_ref.tag,
            'registry': image_ref.registry,
            'digest': item[''],
            'labels': item.get('labels', {}),
            'aliases': reverse_aliases.get(image_ref.canonical, []),
            'size_bytes': item.get('size_bytes', 0),
            'resource_limits': res_limits,
            'supported_accelerators': accels,
            'installed': installed,
            'installed_agents': installed_agents,
        }

    async def _check_image(self, reference: str) -> ImageRef:
        known_registries = await get_known_registries(self.etcd)
        ref = ImageRef(reference, known_registries)
        digest = await self.etcd.get(ref.tag_path)
        if digest is None:
            raise UnknownImageReference(reference)
        return ref

    async def inspect_image(
            self, reference: Union[str, ImageRef]) -> Mapping[str, Any]:
        if isinstance(reference, str):
            ref = await ImageRef.resolve_alias(reference, self.etcd)
        else:
            ref = reference
        reverse_aliases = await self._scan_reverse_aliases()
        image_info = await self.etcd.get_prefix(ref.tag_path)
        if not image_info:
            raise UnknownImageReference(reference)
        return await self._parse_image(ref, image_info, reverse_aliases)

    async def forget_image(self, reference: Union[str, ImageRef]) -> None:
        if isinstance(reference, str):
            ref = await ImageRef.resolve_alias(reference, self.etcd)
        else:
            ref = reference
        await self.etcd.delete_prefix(ref.tag_path)

    async def list_images(self) -> Sequence[Mapping[str, Any]]:
        known_registries = await get_known_registries(self.etcd)
        reverse_aliases = await self._scan_reverse_aliases()
        data = await self.etcd.get_prefix('images')
        coros = []
        for registry, images in data.items():
            if registry == '_aliases':
                continue
            for image, tags in images.items():
                if image == '':
                    continue
                if tags == '1':
                    continue
                for tag, image_info in tags.items():
                    if tag == '':
                        continue
                    raw_ref = f'{etcd_unquote(registry)}/{etcd_unquote(image)}:{tag}'
                    ref = ImageRef(raw_ref, known_registries)
                    coros.append(
                        self._parse_image(ref, image_info, reverse_aliases))
        result = await asyncio.gather(*coros)
        return result

    async def set_image_resource_limit(self, reference: str, slot_type: str,
                                       value_range: Tuple[Optional[Decimal],
                                                          Optional[Decimal]]):
        ref = await self._check_image(reference)
        if value_range[0] is not None:
            await self.etcd.put(f'{ref.tag_path}/resource/{slot_type}/min',
                                str(value_range[0]))
        if value_range[1] is not None:
            await self.etcd.put(f'{ref.tag_path}/resource/{slot_type}/max',
                                str(value_range[1]))

    async def rescan_images(
        self,
        registry: Optional[str] = None,
        *,
        reporter: Optional[ProgressReporter] = None,
    ) -> None:
        registry_config_iv = t.Mapping(t.String, container_registry_iv)
        latest_registry_config = registry_config_iv.check(
            await self.etcd.get_prefix('config/docker/registry'))
        self['docker']['registry'] = latest_registry_config
        # TODO: delete images from registries removed from the previous config?
        if registry is None:
            # scan all configured registries
            registries = self['docker']['registry']
        else:
            try:
                registries = {registry: self['docker']['registry'][registry]}
            except KeyError:
                raise RuntimeError("Unknown registry.", registry)
        async with aiotools.TaskGroup() as tg:
            for registry_name, registry_info in registries.items():
                log.info('Scanning kernel images from the registry "{0}"',
                         registry_name)
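                # Delegate the actual scan to a registry-type-specific scanner class.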
                scanner_cls = get_container_registry(registry_info)
                scanner = scanner_cls(self.etcd, registry_name, registry_info)
                tg.create_task(scanner.rescan_single_registry(reporter))
        # TODO: delete images removed from registry?

    async def alias(self, alias: str, target: str) -> None:
        await self.etcd.put(f'images/_aliases/{etcd_quote(alias)}', target)

    async def dealias(self, alias: str) -> None:
        await self.etcd.delete(f'images/_aliases/{etcd_quote(alias)}')

    async def update_resource_slots(
        self,
        slot_key_and_units: Mapping[SlotName, SlotTypes],
    ) -> None:
        updates = {}
        known_slots = await self.get_resource_slots()
        for k, v in slot_key_and_units.items():
            if k not in known_slots or v != known_slots[k]:
                updates[f'config/resource_slots/{k}'] = v.value
        if updates:
            await self.etcd.put_dict(updates)

    async def update_manager_status(self, status) -> None:
        await self.etcd.put('manager/status', status.value)
        self.get_manager_status.cache_clear()

    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def _get_resource_slots(self):
        raw_data = await self.etcd.get_prefix_dict('config/resource_slots')
        return {SlotName(k): SlotTypes(v) for k, v in raw_data.items()}

    async def get_resource_slots(self) -> Mapping[SlotName, SlotTypes]:
        """
        Returns the system-wide known resource slots and their units.
        """
        try:
            ret = current_resource_slots.get()
        except LookupError:
            configured_slots = await self._get_resource_slots()
            ret = {**INTRINSIC_SLOTS, **configured_slots}
            current_resource_slots.set(ret)
        return ret

    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def _get_vfolder_types(self):
        return await self.etcd.get_prefix('volumes/_types')

    async def get_vfolder_types(self) -> Sequence[str]:
        """
        Returns the currently configured vfolder types: "user", "group", or both.
        If none is configured, the "user" type is implicitly assumed.
        """
        try:
            ret = current_vfolder_types.get()
        except LookupError:
            vf_types = await self._get_vfolder_types()
            if not vf_types:
                vf_types = {'user': ''}
            ret = list(vf_types.keys())
            current_vfolder_types.set(ret)
        return ret

    @aiotools.lru_cache(maxsize=1, expire_after=5.0)
    async def get_manager_nodes_info(self):
        return await self.etcd.get_prefix_dict('nodes/manager')

    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def get_manager_status(self):
        status = await self.etcd.get('manager/status')
        if status is None:
            return None
        return ManagerStatus(status)

    async def watch_manager_status(self):
        async with aiotools.aclosing(
                self.etcd.watch('manager/status')) as agen:
            async for ev in agen:
                yield ev

    # TODO: refactor using contextvars in Python 3.7 so that the result is cached
    #       on a per-request basis.
    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def get_allowed_origins(self):
        return await self.get('config/api/allow-origins')

    # TODO: refactor using contextvars in Python 3.7 so that the result is cached
    #       on a per-request basis.
    @aiotools.lru_cache(expire_after=60.0)
    async def get_image_slot_ranges(self, image_ref: ImageRef):
        """
        Returns the minimum and maximum ResourceSlot values.
        All slot values are converted and normalized to Decimal.
        """
        data = await self.etcd.get_prefix_dict(image_ref.tag_path)
        slot_units = await self.get_resource_slots()
        min_slot = ResourceSlot()
        max_slot = ResourceSlot()

        for slot_key, slot_range in data['resource'].items():
            slot_unit = slot_units.get(slot_key)
            if slot_unit is None:
                # ignore unknown slots
                continue
            min_value = slot_range.get('min')
            if min_value is None:
                min_value = Decimal(0)
            max_value = slot_range.get('max')
            if max_value is None:
                max_value = Decimal('Infinity')
            if slot_unit == 'bytes':
                if not isinstance(min_value, Decimal):
                    min_value = BinarySize.from_str(min_value)
                if not isinstance(max_value, Decimal):
                    max_value = BinarySize.from_str(max_value)
            else:
                if not isinstance(min_value, Decimal):
                    min_value = Decimal(min_value)
                if not isinstance(max_value, Decimal):
                    max_value = Decimal(max_value)
            min_slot[slot_key] = min_value
            max_slot[slot_key] = max_value

        # fill missing
        for slot_key in slot_units.keys():
            if slot_key not in min_slot:
                min_slot[slot_key] = Decimal(0)
            if slot_key not in max_slot:
                max_slot[slot_key] = Decimal('Infinity')

        return min_slot, max_slot

    def get_redis_url(self, db: int = 0) -> yarl.URL:
        """
        Returns a complete URL composed from the given Redis config.
        """
        url = (yarl.URL('redis://host').with_host(
            str(self.data['redis']['addr'][0])).with_port(
                self.data['redis']['addr'][1]).with_password(
                    self.data['redis']['password']) / str(db))
        return url
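
A similar hypothetical sketch for SharedConfig, assuming reload() succeeds against a populated etcd config tree (including the redis section that get_redis_url() reads) and reusing the illustrative HostPortPair values from the previous sketch:

async def _example_shared_config_usage() -> None:
    shared_config = SharedConfig(
        app_ctx={},
        etcd_addr=HostPortPair('127.0.0.1', 2379),  # assumed constructor signature
        etcd_user=None,
        etcd_password=None,
        namespace='local',
    )
    try:
        await shared_config.reload()  # validates config/* against shared_config_iv
        print('redis URL:', shared_config.get_redis_url(db=0))
        slots = await shared_config.get_resource_slots()
        print('known slots:', list(slots))
    finally:
        await shared_config.close()
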
Example #3
class ConfigServer:
    def __init__(self, app_ctx, etcd_addr, etcd_user, etcd_password,
                 namespace):
        # WARNING: importing etcd3/grpc must be done after forks.
        from ai.backend.common.etcd import AsyncEtcd
        self.context = app_ctx
        credentials = None
        if etcd_user:
            credentials = {
                'user': etcd_user,
                'password': etcd_password,
            }
        scope_prefix_map = {
            ConfigScopes.GLOBAL: '',
            # TODO: provide a way to specify other scope prefixes
        }
        self.etcd = AsyncEtcd(etcd_addr,
                              namespace,
                              scope_prefix_map,
                              credentials=credentials)

    async def get(self, key, allow_null=True):
        value = await self.etcd.get(key)
        if value is None:
            value = config_defaults.get(key, None)
        if not allow_null and value is None:
            raise ServerMisconfiguredError(
                'A required etcd config is missing.', key)
        return value

    async def register_myself(self, app_config):
        instance_id = await get_instance_id()
        event_addr = app_config['manager']['event-listen-addr']
        log.info('manager is listening agent events at {}', event_addr)
        event_addr = '{0.host}:{0.port}'.format(event_addr)
        manager_info = {
            'nodes/manager': instance_id,
            'nodes/manager/event_addr': event_addr,
        }
        await self.etcd.put_dict(manager_info)

    async def deregister_myself(self):
        await self.etcd.delete_prefix('nodes/manager')

    async def update_aliases_from_file(self, file: Path):
        log.info('Updating image aliases from "{0}"', file)
        try:
            with open(file, 'r', encoding='utf-8') as fp:
                data = yaml.safe_load(fp)
        except IOError:
            log.error('Cannot open "{0}".', file)
            return
        for item in data['aliases']:
            alias = item[0]
            target = item[1]
            await self.etcd.put(f'images/_aliases/{etcd_quote(alias)}', target)
            print(f'{alias} -> {target}')
        log.info('Done.')

    async def _scan_reverse_aliases(self):
        aliases = await self.etcd.get_prefix('images/_aliases')
        result = defaultdict(list)
        for key, value in aliases.items():
            result[value].append(etcd_unquote(key))
        return result

    async def _parse_image(self, image_ref, item, reverse_aliases):
        installed = (await self.context['redis_image'].scard(
            image_ref.canonical)) > 0

        res_limits = []
        for slot_key, slot_range in item['resource'].items():
            min_value = slot_range.get('min')
            if min_value is None:
                min_value = Decimal(0)
            max_value = slot_range.get('max')
            if max_value is None:
                max_value = Decimal('Infinity')
            res_limits.append({
                'key': slot_key,
                'min': min_value,
                'max': max_value,
            })

        accels = item.get('accelerators')
        if accels is None:
            accels = []
        else:
            accels = accels.split(',')

        return {
            'name': image_ref.name,
            'humanized_name': image_ref.name,  # TODO: implement
            'tag': image_ref.tag,
            'registry': image_ref.registry,
            'digest': item[''],
            'labels': item.get('labels', {}),
            'aliases': reverse_aliases.get(image_ref.canonical, []),
            'size_bytes': item.get('size_bytes', 0),
            'resource_limits': res_limits,
            'supported_accelerators': accels,
            'installed': installed,
        }

    async def _check_image(self, reference: str) -> ImageRef:
        known_registries = await get_known_registries(self.etcd)
        ref = ImageRef(reference, known_registries)
        digest = await self.etcd.get(ref.tag_path)
        if digest is None:
            raise UnknownImageReference(reference)
        return ref

    async def inspect_image(self, reference: Union[str, ImageRef]):
        if isinstance(reference, str):
            ref = await ImageRef.resolve_alias(reference, self.etcd)
        else:
            ref = reference
        reverse_aliases = await self._scan_reverse_aliases()
        image_info = await self.etcd.get_prefix(ref.tag_path)
        if not image_info:
            raise UnknownImageReference(reference)
        return await self._parse_image(ref, image_info, reverse_aliases)

    async def list_images(self):
        known_registries = await get_known_registries(self.etcd)
        reverse_aliases = await self._scan_reverse_aliases()
        data = await self.etcd.get_prefix('images')
        coros = []
        for registry, images in data.items():
            if registry == '_aliases':
                continue
            for image, tags in images.items():
                if image == '':
                    continue
                for tag, image_info in tags.items():
                    if tag == '':
                        continue
                    raw_ref = f'{etcd_unquote(registry)}/{etcd_unquote(image)}:{tag}'
                    ref = ImageRef(raw_ref, known_registries)
                    coros.append(
                        self._parse_image(ref, image_info, reverse_aliases))
        return await asyncio.gather(*coros)

    async def set_image_resource_limit(self, reference: str, slot_type: str,
                                       value_range: Tuple[Optional[Decimal],
                                                          Optional[Decimal]]):
        ref = await self._check_image(reference)
        if value_range[0] is not None:
            await self.etcd.put(f'{ref.tag_path}/resource/{slot_type}/min',
                                str(value_range[0]))
        if value_range[1] is not None:
            await self.etcd.put(f'{ref.tag_path}/resource/{slot_type}/max',
                                str(value_range[1]))

    async def _rescan_images(self, registry_name, registry_info):
        all_updates = {}
        base_hdrs = {
            'Accept': 'application/vnd.docker.distribution.manifest.v2+json',
        }
        registry_url = yarl.URL(registry_info[''])
        registry_type = registry_info.get('type', 'docker')
        registry_project = registry_info.get('project')
        credentials = {}
        username = registry_info.get('username')
        if username is not None:
            credentials['username'] = username
        password = registry_info.get('password')
        if password is not None:
            credentials['password'] = password

        async def _scan_image(sess, image):
            rqst_args = await registry_login(sess, registry_url, credentials,
                                             f'repository:{image}:pull')
            rqst_args['headers'].update(**base_hdrs)
            tags = []
            async with sess.get(registry_url / f'v2/{image}/tags/list',
                                **rqst_args) as resp:
                data = json.loads(await resp.read())
                if 'tags' in data:
                    # sometimes there are dangling image names in the hub.
                    tags.extend(data['tags'])
            scheduler = await aiojobs.create_scheduler(limit=8)
            try:
                jobs = await asyncio.gather(*[
                    scheduler.spawn(_scan_tag(sess, rqst_args, image, tag))
                    for tag in tags
                ])
                await asyncio.gather(*[job.wait() for job in jobs])
            finally:
                await scheduler.close()

        async def _scan_tag(sess, rqst_args, image, tag):
            config_digest = None
            labels = {}
            async with sess.get(registry_url / f'v2/{image}/manifests/{tag}',
                                **rqst_args) as resp:
                resp.raise_for_status()
                data = await resp.json()
                config_digest = data['config']['digest']
                size_bytes = (sum(layer['size'] for layer in data['layers']) +
                              data['config']['size'])
            async with sess.get(
                    registry_url / f'v2/{image}/blobs/{config_digest}',
                    **rqst_args) as resp:
                # content-type may not be json...
                resp.raise_for_status()
                data = json.loads(await resp.read())
                raw_labels = data['container_config']['Labels']
                if raw_labels:
                    labels.update(raw_labels)

            log.debug('checking image repository {}:{}', image, tag)
            if not labels.get('ai.backend.kernelspec'):
                # Skip non-Backend.AI kernel images
                return

            log.info('Updating metadata for {0}:{1}', image, tag)
            updates = {}
            updates[f'images/{etcd_quote(registry_name)}/'
                    f'{etcd_quote(image)}'] = '1'
            tag_prefix = f'images/{etcd_quote(registry_name)}/' \
                         f'{etcd_quote(image)}/{tag}'
            updates[tag_prefix] = config_digest
            updates[f'{tag_prefix}/size_bytes'] = size_bytes
            for k, v in labels.items():
                updates[f'{tag_prefix}/labels/{k}'] = v

            accels = labels.get('ai.backend.accelerators')
            if accels:
                updates[f'{tag_prefix}/accels'] = accels

            res_prefix = 'ai.backend.resource.min.'
            for k, v in filter(lambda pair: pair[0].startswith(res_prefix),
                               labels.items()):
                res_key = k[len(res_prefix):]
                updates[f'{tag_prefix}/resource/{res_key}/min'] = v
            all_updates.update(updates)

        ssl_ctx = None  # default
        app_config = self.context.get('config')
        if app_config is not None and not app_config['docker-registry'][
                'ssl-verify']:
            ssl_ctx = False
        connector = aiohttp.TCPConnector(ssl=ssl_ctx)
        async with aiohttp.ClientSession(connector=connector) as sess:
            images = []
            if registry_url.host.endswith('.docker.io'):
                # We need some special treatment for the Docker Hub.
                params = {'page_size': '100'}
                username = await self.etcd.get(
                    f'config/docker/registry/{etcd_quote(registry_name)}/username'
                )
                hub_url = yarl.URL('https://hub.docker.com')
                async with sess.get(hub_url / f'v2/repositories/{username}/',
                                    params=params) as resp:
                    if resp.status == 200:
                        data = await resp.json()
                        images.extend(
                            f"{username}/{item['name']}"
                            for item in data['results']
                            # a little optimization to ignore legacies
                            if not item['name'].startswith('kernel-'))
                    else:
                        log.error(
                            'Failed to fetch repository list from {0} '
                            '(status={1})', hub_url, resp.status)
            elif registry_type == 'docker':
                # In other cases, try the catalog search.
                rqst_args = await registry_login(sess, registry_url,
                                                 credentials,
                                                 'registry:catalog:*')
                async with sess.get(registry_url / 'v2/_catalog',
                                    **rqst_args) as resp:
                    if resp.status == 200:
                        data = json.loads(await resp.read())
                        images.extend(data['repositories'])
                        log.debug('found {} repositories', len(images))
                    else:
                        log.warning(
                            'Docker registry {0} does not allow/support '
                            'catalog search. (status={1})', registry_url,
                            resp.status)
            elif registry_type == 'harbor':
                if credentials:
                    rqst_args = {
                        'auth':
                        aiohttp.BasicAuth(credentials['username'],
                                          credentials['password'])
                    }
                else:
                    rqst_args = {}
                async with sess.get(registry_url / 'api/projects',
                                    params={'page_size': '100'},
                                    **rqst_args) as resp:
                    projects = await resp.json()
                    project_id = None
                    for item in projects:
                        if item['name'] == registry_project:
                            project_id = item['project_id']
                            break
                    else:
                        log.warning('The configured project "{0}" was not found.',
                                    registry_project)
                        return
                async with sess.get(registry_url / 'api/repositories',
                                    params={
                                        'project_id': project_id,
                                        'page_size': '100'
                                    },
                                    **rqst_args) as resp:
                    items = await resp.json()
                    repos = [item['name'] for item in items]
                    images.extend(repos)
            else:
                log.error('Unsupported registry type: {0}', registry_type)
                return

            scheduler = await aiojobs.create_scheduler(limit=8)
            try:
                jobs = await asyncio.gather(*[
                    scheduler.spawn(_scan_image(sess, image))
                    for image in images
                ])
                await asyncio.gather(*[job.wait() for job in jobs])
            finally:
                await scheduler.close()

        if not all_updates:
            log.info('No images found in registry {0}', registry_url)
            return
        for kvlist in chunked(sorted(all_updates.items()), 16):
            await self.etcd.put_dict(dict(kvlist))

    async def rescan_images(self, registry: Optional[str] = None):
        if registry is None:
            registries = []
            data = await self.etcd.get_prefix('config/docker/registry')
            for key, val in data.items():
                if key:
                    registries.append(etcd_unquote(key))
        else:
            registries = [registry]
        coros = []
        for registry in registries:
            log.info('Scanning kernel images from the registry "{0}"',
                     registry)
            registry_info = await self.etcd.get_prefix(
                f'config/docker/registry/{etcd_quote(registry)}')
            if not registry_info:
                log.error('Unknown registry: "{0}"', registry)
                continue
            coros.append(self._rescan_images(registry, registry_info))
        await asyncio.gather(*coros)
        # TODO: delete images removed from registry?

    async def alias(self, alias: str, target: str):
        await self.etcd.put(f'images/_aliases/{etcd_quote(alias)}', target)

    async def dealias(self, alias: str):
        await self.etcd.delete(f'images/_aliases/{etcd_quote(alias)}')

    async def update_volumes_from_file(self, file: Path):
        log.info('Updating network volumes from "{0}"', file)
        try:
            with open(file, 'r', encoding='utf-8') as fp:
                data = yaml.safe_load(fp)
        except IOError:
            log.error('Cannot open "{0}".', file)
            return
        for item in data['volumes']:
            name = item['name']
            updates = {
                f'volumes/{name}/mount/{k}': v
                for k, v in item['mount'].items()
            }
            await self.etcd.put_dict(updates)
        log.info('Done.')

    async def update_resource_slots(self,
                                    slot_key_and_units,
                                    *,
                                    clear_existing: bool = True):
        updates = {}
        if clear_existing:
            await self.etcd.delete_prefix('config/resource_slots/')
        for k, v in slot_key_and_units.items():
            if k in ('cpu', 'mem'):
                continue
            # currently we support only two units
            # (where count may be fractional)
            assert v in ('bytes', 'count')
            updates[f'config/resource_slots/{k}'] = v
        await self.etcd.put_dict(updates)
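
    # Example call (hypothetical slot names; `config_server` stands for an
    # initialized ConfigServer): the intrinsic 'cpu'/'mem' keys are skipped
    # above, so only the extra keys end up under config/resource_slots/.
    #
    #   await config_server.update_resource_slots({
    #       'cpu': 'count',
    #       'mem': 'bytes',
    #       'cuda.device': 'count',   # counts may be fractional
    #       'cuda.mem': 'bytes',
    #   })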

    async def update_manager_status(self, status):
        await self.etcd.put('manager/status', status.value)
        self.get_manager_status.cache_clear()

    # TODO: refactor using contextvars in Python 3.7 so that the result is cached
    #       on a per-request basis.
    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def get_resource_slots(self):
        '''
        Returns the system-wide known resource slots and their units.
        '''
        intrinsic_slots = {'cpu': 'count', 'mem': 'bytes'}
        configured_slots = await self.etcd.get_prefix_dict(
            'config/resource_slots')
        return {**intrinsic_slots, **configured_slots}
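
    # Hedged usage sketch (hypothetical helper, not part of the original class):
    # rejects requested slot keys that are not among the intrinsic or configured
    # slots returned by get_resource_slots() above.
    async def _validate_slot_keys_example(self, requested_keys):
        known_slots = await self.get_resource_slots()
        unknown = set(requested_keys) - set(known_slots.keys())
        if unknown:
            raise ValueError(f'Unknown resource slot keys: {sorted(unknown)}')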

    @aiotools.lru_cache(maxsize=1, expire_after=60.0)
    async def get_vfolder_types(self):
        '''
        Returns the list of currently configured vfolder types, which may
        include "user" and/or "group". If none is configured, the "user" type
        is implicitly assumed.
        '''
        vf_types = await self.etcd.get_prefix_dict('volumes/_types')
        if not vf_types:
            vf_types = {'user': ''}
        return list(vf_types.keys())

    @aiotools.lru_cache(maxsize=1, expire_after=5.0)
    async def get_manager_nodes_info(self):
        return await self.etcd.get_prefix_dict('nodes/manager')

    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def get_manager_status(self):
        status = await self.etcd.get('manager/status')
        return ManagerStatus(status)

    async def watch_manager_status(self):
        async for ev in self.etcd.watch('manager/status'):
            yield ev

    # TODO: refactor using contextvars in Python 3.7 so that the result is cached
    #       on a per-request basis.
    @aiotools.lru_cache(maxsize=1, expire_after=2.0)
    async def get_allowed_origins(self):
        return await self.get('config/api/allow-origins')

    # TODO: refactor using contextvars in Python 3.7 so that the result is cached
    #       on a per-request basis.
    @aiotools.lru_cache(expire_after=60.0)
    async def get_image_slot_ranges(self, image_ref: ImageRef):
        '''
        Returns the minimum and maximum ResourceSlot values required by the
        given image. All slot values are converted and normalized to Decimal.
        '''
        data = await self.etcd.get_prefix_dict(image_ref.tag_path)
        slot_units = await self.get_resource_slots()
        min_slot = ResourceSlot()
        max_slot = ResourceSlot()

        for slot_key, slot_range in data['resource'].items():
            slot_unit = slot_units.get(slot_key)
            if slot_unit is None:
                # ignore unknown slots
                continue
            min_value = slot_range.get('min')
            if min_value is None:
                min_value = Decimal(0)
            max_value = slot_range.get('max')
            if max_value is None:
                max_value = Decimal('Infinity')
            if slot_unit == 'bytes':
                if not isinstance(min_value, Decimal):
                    min_value = BinarySize.from_str(min_value)
                if not isinstance(max_value, Decimal):
                    max_value = BinarySize.from_str(max_value)
            else:
                if not isinstance(min_value, Decimal):
                    min_value = Decimal(min_value)
                if not isinstance(max_value, Decimal):
                    max_value = Decimal(max_value)
            min_slot[slot_key] = min_value
            max_slot[slot_key] = max_value

        # fill missing
        for slot_key in slot_units.keys():
            if slot_key not in min_slot:
                min_slot[slot_key] = Decimal(0)
            if slot_key not in max_slot:
                max_slot[slot_key] = Decimal('Infinity')

        return min_slot, max_slot
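
A hedged usage sketch for the (min, max) pair returned by get_image_slot_ranges above. It assumes `config_server` is an initialized ConfigServer, `requested` is a mapping of slot keys to Decimal values, and that ResourceSlot supports mapping-style reads like the writes shown in the method; the helper name and the error type are illustrative, not part of the original module.

async def check_requested_slots(config_server, image_ref, requested):
    # Reject values outside the per-image [min, max] range; keys missing from
    # the image metadata were already filled with 0 / Infinity above.
    min_slot, max_slot = await config_server.get_image_slot_ranges(image_ref)
    for slot_key, req_value in requested.items():
        if not (min_slot[slot_key] <= req_value <= max_slot[slot_key]):
            raise ValueError(
                f'{slot_key}={req_value} is outside the allowed range '
                f'[{min_slot[slot_key]}, {max_slot[slot_key]}]')
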
Example #4
class ConfigServer:
    def __init__(self, etcd_addr, namespace):
        # WARNING: importing etcd3/grpc must be done after forks.
        from ai.backend.common.etcd import AsyncEtcd
        self.etcd = AsyncEtcd(etcd_addr, namespace)

    async def register_myself(self, app_config):
        instance_id = await get_instance_id()
        if app_config.advertised_manager_host:
            instance_ip = app_config.advertised_manager_host
            log.info('manually set advertised manager host: {0}', instance_ip)
        else:
            # fall back 1: read private IP from cloud instance metadata
            # fall back 2: read hostname and resolve it
            # fall back 3: "127.0.0.1"
            instance_ip = await get_instance_ip()
        event_addr = f'{instance_ip}:{app_config.events_port}'
        await self.etcd.put_multi([
            'nodes/manager', 'nodes/redis', 'nodes/manager/event_addr',
            'nodes/docker_registry'
        ], [
            instance_id, app_config.redis_addr, event_addr,
            app_config.docker_registry
        ])
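
    # Hedged sketch of the fallback chain described above (not the actual
    # ai.backend.common get_instance_ip implementation; the cloud metadata
    # lookup of fallback 1 is elided because its endpoint is provider-specific).
    @staticmethod
    async def _detect_instance_ip_example():
        import socket
        try:
            # fall back 2: resolve the local hostname
            return socket.gethostbyname(socket.gethostname())
        except OSError:
            # fall back 3: last resort
            return '127.0.0.1'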

    async def deregister_myself(self):
        await self.etcd.delete_prefix('nodes/manager')

    async def update_kernel_images_from_file(self, file: Path):
        log.info('Loading kernel image data from "{0}"', file)
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except IOError:
            log.error('Cannot open "{0}".', file)
            return
        for image in data['images']:
            name = image['name']
            print(f"Updating {name}")

            inserted_aliases = []
            for tag, hash in image['tags']:
                assert hash
                if hash.startswith(':'):  # tag-level alias
                    inserted_aliases.append((f'images/_aliases/{name}:{tag}',
                                             f'{name}:{hash[1:]}'))
            if inserted_aliases:
                await self.etcd.put_multi(*zip(*inserted_aliases))

            cpu_share = image['slots']['cpu']
            cpu_share = 'null' if cpu_share is None else f'{cpu_share:.2f}'
            mem_share = image['slots']['mem']
            mem_share = 'null' if mem_share is None else f'{mem_share:.2f}'
            gpu_share = image['slots']['gpu']
            gpu_share = 'null' if gpu_share is None else f'{gpu_share:.2f}'
            await self.etcd.put_multi([
                f'images/{name}', f'images/{name}/cpu', f'images/{name}/mem',
                f'images/{name}/gpu'
            ], ['1', cpu_share, mem_share, gpu_share])

            inserted_tags = [(f'images/{name}/tags/{tag}', hash)
                             for tag, hash in image['tags']]
            await self.etcd.put_multi(*zip(*inserted_tags))
        log.info('Done.')
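
    # Hedged illustration of a YAML document that update_kernel_images_from_file
    # would accept (the keys match what the parser above reads; the image name,
    # tags, and digest are made up):
    #
    #   images:
    #     - name: example/python
    #       slots: {cpu: 1.00, mem: 1.00, gpu: null}
    #       tags:
    #         - ["3.6", "sha256:abcd1234"]   # concrete digest entry
    #         - ["latest", ":3.6"]           # tag-level alias pointing to "3.6"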

    async def update_aliases_from_file(self, file: Path):
        log.info('Updating image aliases from "{0}"', file)
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except IOError:
            log.error('Cannot open "{0}".', file)
            return
        for item in data['aliases']:
            alias = item[0]
            target = item[1]
            await self.etcd.put(f'images/_aliases/{alias}', target)
            print(f'{alias} -> {target}')
        log.info('Done.')

    async def update_kernel_images_from_registry(self, registry_addr):
        log.info('Scanning kernel image versions from "{0}"', registry_addr)
        # TODO: a method to scan docker hub and update kernel image versions
        # TODO: a cli command to execute the above method
        raise NotImplementedError

    async def update_volumes_from_file(self, file: Path):
        log.info('Updating network volumes from "{0}"', file)
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except IOError:
            log.error('Cannot open "{0}".', file)
            return
        for item in data['volumes']:
            name = item['name']
            ks = []
            vs = []
            for k, v in item['mount'].items():
                ks.append(f'volumes/{name}/mount/{k}')
                vs.append(v)
            await self.etcd.put_multi(ks, vs)
        log.info('Done.')

    async def manager_status_update(self):
        async for ev in self.etcd.watch('manager/status'):
            yield ev

    async def update_manager_status(self, status):
        await self.etcd.put('manager/status', status.value)

    @aiotools.lru_cache(maxsize=1)
    async def get_manager_status(self):
        status = await self.etcd.get('manager/status')
        return ManagerStatus(status)

    @aiotools.lru_cache(maxsize=1, expire_after=60.0)
    async def get_allowed_origins(self):
        origins = await self.etcd.get('config/api/allow-origins')
        if origins is None:
            origins = '*'
        return origins

    @aiotools.lru_cache(maxsize=1)
    async def get_docker_registry(self):
        docker_registry = await self.etcd.get('nodes/docker_registry')
        return docker_registry

    @aiotools.lru_cache(maxsize=1, expire_after=60.0)
    async def get_overbook_factors(self):
        '''
        Retrieves the overbook factors, which are used to scale the resource
        slot values reported by the agent so as to increase server utilization.

        TIP: If your users run mostly compute-intensive sessions,
        lower these values towards 1.0.
        '''

        cpu = await self.etcd.get('config/overbook/cpu')
        cpu = 6.0 if cpu is None else float(cpu)
        mem = await self.etcd.get('config/overbook/mem')
        mem = 2.0 if mem is None else float(mem)
        gpu = await self.etcd.get('config/overbook/gpu')
        gpu = 1.0 if gpu is None else float(gpu)
        return {
            'mem': mem,
            'cpu': cpu,
            'gpu': gpu,
        }
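
    # Hedged sketch (hypothetical helper, not part of the original class): one
    # way the factors above could be applied to agent-reported capacities,
    # assuming the report is a plain mapping of 'cpu'/'mem'/'gpu' to numbers.
    async def _apply_overbook_factors_example(self, reported_slots):
        factors = await self.get_overbook_factors()
        return {
            key: Decimal(str(value)) * Decimal(str(factors.get(key, 1.0)))
            for key, value in reported_slots.items()
        }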

    @aiotools.lru_cache(expire_after=60.0)
    async def get_image_required_slots(self, name, tag):
        installed = await self.etcd.get(f'images/{name}')
        if installed is None:
            raise RuntimeError('Image metadata is not available!')
        cpu = await self.etcd.get(f'images/{name}/cpu')
        cpu = None if cpu == 'null' else Decimal(cpu)
        mem = await self.etcd.get(f'images/{name}/mem')
        mem = None if mem == 'null' else Decimal(mem)
        if '-gpu' in tag or '-cuda' in tag:
            gpu = await self.etcd.get(f'images/{name}/gpu')
            gpu = Decimal(0) if gpu == 'null' else Decimal(gpu)
        else:
            gpu = Decimal(0)
        return ResourceSlot(mem=mem, cpu=cpu, gpu=gpu)

    @aiotools.lru_cache(expire_after=60.0)
    async def resolve_image_name(self, name_or_alias):
        async def resolve_alias(alias_key):
            alias_target = None
            while True:
                prev_alias_key = alias_key
                alias_key = await self.etcd.get(f'images/_aliases/{alias_key}')
                if alias_key is None:
                    alias_target = prev_alias_key
                    break
            return alias_target

        alias_target = await resolve_alias(name_or_alias)
        if alias_target == name_or_alias and name_or_alias.rfind(':') == -1:
            alias_target = await resolve_alias(f'{name_or_alias}:latest')
        assert alias_target is not None
        name, _, tag = alias_target.partition(':')
        hash = await self.etcd.get(f'images/{name}/tags/{tag}')
        if hash is None:
            raise ImageNotFound(f'{name_or_alias}: Unregistered image '
                                'or unknown alias.')
        return name, tag
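
A hedged sketch of the alias-chain behaviour that resolve_image_name relies on, using a plain dict in place of the images/_aliases/* etcd keys; the alias names and targets below are illustrative only. Note that resolve_image_name additionally retries with ':latest' appended when a plain name has no alias of its own.

aliases = {
    'python:latest': 'python:3.6',          # tag-level alias
    'python:3.6': 'python:3.6-ubuntu',      # chained alias
}

def resolve_alias_chain(name):
    # Follow alias links until no further alias exists, mirroring resolve_alias().
    while name in aliases:
        name = aliases[name]
    return name

assert resolve_alias_chain('python:latest') == 'python:3.6-ubuntu'
assert resolve_alias_chain('python:3.6-ubuntu') == 'python:3.6-ubuntu'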