async def restore_from_container(
    self,
    container: Container,
    alloc_map: AbstractAllocMap,
) -> None:
    if not self.enabled:
        return
    resource_spec = await get_resource_spec_from_container(container.backend_obj)
    if resource_spec is None:
        return
    if hasattr(alloc_map, 'apply_allocation'):
        alloc_map.apply_allocation({
            SlotName('cuda.device'): resource_spec.allocations.get(
                DeviceName('cuda'), {}
            ).get(
                SlotName('cuda.device'), {}
            ),
        })
    else:
        alloc_map.allocations[SlotName('cuda.device')].update(
            resource_spec.allocations.get(
                DeviceName('cuda'), {}
            ).get(
                SlotName('cuda.device'), {}
            )
        )
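
For reference, resource_spec.allocations is a nested mapping keyed by device name, then slot name, then device ID (see read_from_string below). A hypothetical value for a container holding two GPUs would look like this (the device IDs are assumptions for illustration):

# Hypothetical shape of resource_spec.allocations for two GPUs:
# {
#     DeviceName('cuda'): {
#         SlotName('cuda.device'): {
#             DeviceId('0'): Decimal(1),
#             DeviceId('1'): Decimal(1),
#         },
#     },
# }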
Example #2
async def restore_from_container(
    self,
    container: Container,
    alloc_map: AbstractAllocMap,
) -> None:
    assert isinstance(alloc_map, DiscretePropertyAllocMap)
    # Docker does not return the original cpuset.... :(
    # We need to read our own records.
    resource_spec = await get_resource_spec_from_container(
        container.backend_obj)
    if resource_spec is None:
        return
    alloc_map.apply_allocation({
        SlotName('cpu'):
        resource_spec.allocations[DeviceName('cpu')][SlotName('cpu')],
    })
Example #3
@classmethod
def read_from_string(cls, text: str) -> 'KernelResourceSpec':
    kvpairs = {}
    for line in text.split('\n'):
        if '=' not in line:
            continue
        key, val = line.strip().split('=', maxsplit=1)
        kvpairs[key] = val
    allocations = cast(
        MutableMapping[
            DeviceName,
            MutableMapping[SlotName, Mapping[DeviceId, Decimal]],
        ],
        defaultdict(lambda: defaultdict(Decimal)),
    )
    for key, val in kvpairs.items():
        if key.endswith('_SHARES'):
            slot_name = SlotName(key[:-7].lower())
            device_name = DeviceName(slot_name.split('.')[0])
            per_device_alloc: MutableMapping[DeviceId, Decimal] = {}
            for entry in val.split(','):
                raw_dev_id, _, raw_alloc = entry.partition(':')
                if not raw_dev_id or not raw_alloc:
                    continue
                dev_id = DeviceId(raw_dev_id)
                try:
                    if known_slot_types.get(slot_name, 'count') == 'bytes':
                        alloc = Decimal(BinarySize.from_str(raw_alloc))
                    else:
                        alloc = Decimal(raw_alloc)
                except KeyError as e:
                    log.warning(
                        'A previously launched container has '
                        'unknown slot type: {}. Ignoring it.', e.args[0])
                    continue
                per_device_alloc[dev_id] = alloc
            allocations[device_name][slot_name] = per_device_alloc
    mounts = [Mount.from_str(m) for m in kvpairs['MOUNTS'].split(',') if m]
    return cls(
        container_id=kvpairs.get('CID', 'unknown'),
        scratch_disk_size=BinarySize.finite_from_str(kvpairs['SCRATCH_SIZE']),
        allocations=dict(allocations),
        slots=ResourceSlot(json.loads(kvpairs['SLOTS'])),
        mounts=mounts,
    )
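
For context, read_from_string() parses the per-container resource.txt records written by the agent. Below is a hypothetical round-trip with made-up values; the exact key spellings for dotted slot names (e.g. CUDA.DEVICE_SHARES) are inferred from the parser above, not taken from the real writer:

# A made-up resource.txt payload; keys follow the parsing rules above.
sample = '\n'.join([
    'CID=abc123',
    'SCRATCH_SIZE=1g',
    'MOUNTS=',
    'SLOTS={"cpu": "2", "mem": "4294967296"}',
    'CPU_SHARES=0:1,1:1',           # -> allocations['cpu']['cpu']
    'MEM_SHARES=root:4294967296',   # -> allocations['mem']['mem']
    'CUDA.DEVICE_SHARES=0:1',       # -> allocations['cuda']['cuda.device']
])
spec = KernelResourceSpec.read_from_string(sample)
assert spec.allocations[DeviceName('cpu')][SlotName('cpu')] == {
    DeviceId('0'): Decimal(1),
    DeviceId('1'): Decimal(1),
}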
Example #4
class CUDAPlugin(AbstractComputePlugin):

    config_watch_enabled = False

    key = DeviceName('cuda')
    slot_types: Sequence[Tuple[SlotName, SlotTypes]] = (
        (SlotName('cuda.device'), SlotTypes('count')),
    )

    nvdocker_version: Tuple[int, ...] = (0, 0, 0)
    docker_version: Tuple[int, ...] = (0, 0, 0)

    device_mask: Sequence[DeviceId] = []
    enabled: bool = True

    async def init(self, context: Any = None) -> None:
        rx_triple_version = re.compile(r'(\d+\.\d+\.\d+)')
        # Check nvidia-docker and docker versions
        try:
            proc = await asyncio.create_subprocess_exec(
                'nvidia-docker', 'version', '-f', '{{json .}}',
                stdout=asyncio.subprocess.PIPE,
            )
            stdout, _ = await proc.communicate()
            lines = stdout.decode().splitlines()
        except FileNotFoundError:
            log.warning('nvidia-docker is not installed.')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        m = rx_triple_version.search(lines[0])
        if m:
            self.nvdocker_version = tuple(map(int, m.group(1).split('.')))
        else:
            log.error('could not detect nvidia-docker version!')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        docker_version_data = json.loads(lines[1])
        m = rx_triple_version.search(docker_version_data['Server']['Version'])
        if m:
            self.docker_version = tuple(map(int, m.group(1).split('.')))
        else:
            log.error('could not detect docker version!')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return

        raw_device_mask = self.plugin_config.get('device_mask')
        if raw_device_mask is not None:
            self.device_mask = [
                DeviceId(dev_id) for dev_id in raw_device_mask.split(',')
            ]
        try:
            detected_devices = await self.list_devices()
            log.info('detected devices:\n' + pformat(detected_devices))
            log.info('nvidia-docker version: {}', self.nvdocker_version)
            log.info('CUDA acceleration is enabled.')
        except ImportError:
            log.warning('could not load the CUDA runtime library.')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
        except RuntimeError as e:
            log.warning('CUDA init error: {}', e)
            log.info('CUDA acceleration is disabled.')
            self.enabled = False

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
        self,
        new_plugin_config: Mapping[str, Any],
    ) -> None:
        pass

    async def list_devices(self) -> Collection[CUDADevice]:
        if not self.enabled:
            return []
        all_devices = []
        num_devices = libcudart.get_device_count()
        for dev_id in map(lambda idx: DeviceId(str(idx)), range(num_devices)):
            if dev_id in self.device_mask:
                continue
            raw_info = libcudart.get_device_props(int(dev_id))
            sysfs_node_path = "/sys/bus/pci/devices/" \
                              f"{raw_info['pciBusID_str'].lower()}/numa_node"
            node: Optional[int]
            try:
                node = int(Path(sysfs_node_path).read_text().strip())
            except OSError:
                node = None
            raw_dev_uuid = raw_info.get('uuid', None)
            if raw_dev_uuid is not None:
                dev_uuid = str(uuid.UUID(bytes=raw_dev_uuid))
            else:
                dev_uuid = '00000000-0000-0000-0000-000000000000'
            dev_info = CUDADevice(
                device_id=dev_id,
                hw_location=raw_info['pciBusID_str'],
                numa_node=node,
                memory_size=raw_info['totalGlobalMem'],
                processing_units=raw_info['multiProcessorCount'],
                model_name=raw_info['name'],
                uuid=dev_uuid,
            )
            all_devices.append(dev_info)
        return all_devices

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        devices = await self.list_devices()
        return {
            SlotName('cuda.device'): Decimal(len(devices)),
        }

    def get_version(self) -> str:
        return __version__

    async def extra_info(self) -> Mapping[str, Any]:
        if self.enabled:
            try:
                return {
                    'cuda_support': True,
                    'nvidia_version': libnvml.get_driver_version(),
                    'cuda_version': '{0[0]}.{0[1]}'.format(libcudart.get_version()),
                }
            except ImportError:
                log.warning('extra_info(): NVML/CUDA runtime library is not found')
            except LibraryError as e:
                log.warning('extra_info(): {!r}', e)
        return {
            'cuda_support': False,
        }

    async def gather_node_measures(
        self,
        ctx: StatContext,
    ) -> Sequence[NodeMeasurement]:
        dev_count = 0
        mem_avail_total = 0
        mem_used_total = 0
        mem_stats = {}
        util_total = 0
        util_stats = {}
        if self.enabled:
            try:
                dev_count = libnvml.get_device_count()
                for dev_id in map(lambda idx: DeviceId(str(idx)), range(dev_count)):
                    if dev_id in self.device_mask:
                        continue
                    dev_stat = libnvml.get_device_stats(int(dev_id))
                    mem_avail_total += dev_stat.mem_total
                    mem_used_total += dev_stat.mem_used
                    mem_stats[dev_id] = Measurement(Decimal(dev_stat.mem_used),
                                                    Decimal(dev_stat.mem_total))
                    util_total += dev_stat.gpu_util
                    util_stats[dev_id] = Measurement(Decimal(dev_stat.gpu_util), Decimal(100))
            except ImportError:
                log.warning('gather_node_measures(): NVML library is not found')
            except LibraryError as e:
                log.warning('gather_node_measures(): {!r}', e)
        return [
            NodeMeasurement(
                MetricKey('cuda_mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_node=Measurement(Decimal(mem_used_total), Decimal(mem_avail_total)),
                per_device=mem_stats,
            ),
            NodeMeasurement(
                MetricKey('cuda_util'),
                MetricTypes.USAGE,
                unit_hint='percent',
                stats_filter=frozenset({'avg', 'max'}),
                per_node=Measurement(Decimal(util_total), Decimal(dev_count * 100)),
                per_device=util_stats,
            ),
        ]

    async def gather_container_measures(
            self, ctx: StatContext,
            container_ids: Sequence[str],
            ) -> Sequence[ContainerMeasurement]:
        return []

    async def create_alloc_map(self) -> AbstractAllocMap:
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(
            device_slots={
                dev.device_id: (
                    DeviceSlotInfo(SlotTypes.COUNT, SlotName('cuda.device'), Decimal(1))
                ) for dev in devices
            },
        )

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        return []

    async def generate_docker_args(
        self,
        docker: aiodocker.Docker,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Mapping[str, Any]:
        if not self.enabled:
            return {}
        assigned_device_ids = []
        for slot_type, per_device_alloc in device_alloc.items():
            for device_id, alloc in per_device_alloc.items():
                if alloc > 0:
                    assigned_device_ids.append(device_id)
        if self.nvdocker_version[0] == 1:
            timeout = aiohttp.ClientTimeout(total=3)
            async with aiohttp.ClientSession(raise_for_status=True,
                                             timeout=timeout) as sess:
                try:
                    nvdocker_url = 'http://localhost:3476/docker/cli/json'
                    async with sess.get(nvdocker_url) as resp:
                        nvidia_params = await resp.json()
                except aiohttp.ClientError:
                    raise RuntimeError('NVIDIA Docker plugin is not available.')

            volumes = await docker.volumes.list()
            existing_volumes = set(vol['Name'] for vol in volumes['Volumes'])
            required_volumes = set(vol.split(':')[0]
                                   for vol in nvidia_params['Volumes'])
            missing_volumes = required_volumes - existing_volumes
            binds = []
            for vol_name in missing_volumes:
                for vol_param in nvidia_params['Volumes']:
                    if vol_param.startswith(vol_name + ':'):
                        _, _, permission = vol_param.split(':')
                        driver = nvidia_params['VolumeDriver']
                        await docker.volumes.create({
                            'Name': vol_name,
                            'Driver': driver,
                        })
            for vol_name in required_volumes:
                for vol_param in nvidia_params['Volumes']:
                    if vol_param.startswith(vol_name + ':'):
                        _, mount_pt, permission = vol_param.split(':')
                        binds.append('{}:{}:{}'.format(
                            vol_name, mount_pt, permission))
            devices = []
            for dev in nvidia_params['Devices']:
                m = re.search(r'^/dev/nvidia(\d+)$', dev)
                if m is None:
                    # Always add non-GPU device files required by the driver.
                    # (e.g., nvidiactl, nvidia-uvm, ... etc.)
                    devices.append(dev)
                    continue
                device_id = m.group(1)
                if device_id not in assigned_device_ids:
                    continue
                devices.append(dev)
            devices = [{
                'PathOnHost': dev,
                'PathInContainer': dev,
                'CgroupPermissions': 'mrw',
            } for dev in devices]
            return {
                'HostConfig': {
                    'Binds': binds,
                    'Devices': devices,
                },
            }
        elif self.nvdocker_version[0] == 2:
            device_list_str = ','.join(sorted(assigned_device_ids))
            if self.docker_version >= (19, 3, 0):
                docker_config: Dict[str, Any] = {}
                if assigned_device_ids:
                    docker_config.update({
                        'HostConfig': {
                            'DeviceRequests': [
                                {
                                    "Driver": "nvidia",
                                    "DeviceIDs": assigned_device_ids,
                                    # "all" does not work here
                                    "Capabilities": [
                                        ["utility", "compute", "video", "graphics", "display"]
                                    ],
                                },
                            ],
                        },
                    })
                return docker_config
            else:
                return {
                    'HostConfig': {
                        'Runtime': 'nvidia',
                    },
                    'Env': [
                        f"NVIDIA_VISIBLE_DEVICES={device_list_str}",
                    ],
                }
        else:
            raise RuntimeError('BUG: should not be reached here!')

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        device_ids: List[DeviceId] = []
        if SlotName('cuda.device') in device_alloc:
            device_ids.extend(device_alloc[SlotName('cuda.device')].keys())
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                proc = device.processing_units
                mem = BinarySize(device.memory_size)
                attached_devices.append({  # TODO: update common.types.DeviceModelInfo
                    'device_id': device.device_id,
                    'model_name': device.model_name,
                    'smp': proc,
                    'mem': mem,
                })
        return attached_devices

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        if not self.enabled:
            return
        resource_spec = await get_resource_spec_from_container(container.backend_obj)
        if resource_spec is None:
            return
        if hasattr(alloc_map, 'apply_allocation'):
            alloc_map.apply_allocation({
                SlotName('cuda.device'): resource_spec.allocations.get(
                    DeviceName('cuda'), {}
                ).get(
                    SlotName('cuda.device'), {}
                ),
            })
        else:
            alloc_map.allocations[SlotName('cuda.device')].update(
                resource_spec.allocations.get(
                    DeviceName('cuda'), {}
                ).get(
                    SlotName('cuda.device'), {}
                )
            )

    async def generate_resource_data(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Mapping[str, str]:
        data: MutableMapping[str, str] = {}
        if not self.enabled:
            return data

        active_device_id_set: Set[DeviceId] = set()
        for slot_type, per_device_alloc in device_alloc.items():
            for dev_id, alloc in per_device_alloc.items():
                if alloc > 0:
                    active_device_id_set.add(dev_id)
        active_device_ids = sorted(active_device_id_set, key=lambda v: int(v))
        data['CUDA_GLOBAL_DEVICE_IDS'] = ','.join(
            f'{local_idx}:{global_id}'
            for local_idx, global_id in enumerate(active_device_ids))
        return data
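
To make the two generation steps above concrete: for a hypothetical allocation of the single global GPU "1" on nvidia-docker 2.x with Docker >= 19.03, the methods would produce the following (results shown as comments):

# Hypothetical allocation: one slot on global CUDA device "1".
device_alloc = {SlotName('cuda.device'): {DeviceId('1'): Decimal(1)}}
# generate_docker_args() returns:
# {'HostConfig': {'DeviceRequests': [{
#     'Driver': 'nvidia',
#     'DeviceIDs': ['1'],
#     'Capabilities': [['utility', 'compute', 'video',
#                       'graphics', 'display']],
# }]}}
# generate_resource_data() maps container-local indices to global IDs:
# {'CUDA_GLOBAL_DEVICE_IDS': '0:1'}   # local index 0 -> global device 1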
Example #5
class MemoryPlugin(AbstractComputePlugin):
    """
    Represents the main memory.

    When collecting statistics, it also measures network and I/O usage
    in addition to the memory usage.
    """

    config_watch_enabled = False

    key = DeviceName('mem')
    slot_types = [(SlotName('mem'), SlotTypes.BYTES)]

    async def init(self, context: Any = None) -> None:
        pass

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
            self, new_plugin_config: Mapping[str, Any]) -> None:
        pass

    async def list_devices(self) -> Collection[MemoryDevice]:
        # TODO: support NUMA?
        memory_size = psutil.virtual_memory().total
        return [
            MemoryDevice(
                device_id=DeviceId('root'),
                hw_location='root',
                numa_node=0,
                memory_size=memory_size,
                processing_units=0,
            )
        ]

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        devices = await self.list_devices()
        return {
            SlotName('mem'): Decimal(sum(dev.memory_size for dev in devices)),
        }

    def get_version(self) -> str:
        return __version__

    async def extra_info(self) -> Mapping[str, str]:
        return {}

    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        _mstat = psutil.virtual_memory()
        total_mem_used_bytes = Decimal(_mstat.total - _mstat.available)
        total_mem_capacity_bytes = Decimal(_mstat.total)
        _nstat = psutil.net_io_counters()
        net_rx_bytes = _nstat.bytes_recv
        net_tx_bytes = _nstat.bytes_sent

        def get_disk_stat():
            pruned_disk_types = frozenset(['squashfs', 'vfat', 'tmpfs'])
            total_disk_usage = Decimal(0)
            total_disk_capacity = Decimal(0)
            per_disk_stat = {}
            for disk_info in psutil.disk_partitions():
                if disk_info.fstype not in pruned_disk_types:
                    dstat = os.statvfs(disk_info.mountpoint)
                    disk_usage = Decimal(dstat.f_frsize *
                                         (dstat.f_blocks - dstat.f_bavail))
                    disk_capacity = Decimal(dstat.f_frsize * dstat.f_blocks)
                    per_disk_stat[disk_info.device] = Measurement(
                        disk_usage, disk_capacity)
                    total_disk_usage += disk_usage
                    total_disk_capacity += disk_capacity
            return total_disk_usage, total_disk_capacity, per_disk_stat

        loop = current_loop()
        total_disk_usage, total_disk_capacity, per_disk_stat = \
            await loop.run_in_executor(None, get_disk_stat)
        return [
            NodeMeasurement(
                MetricKey('mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_node=Measurement(total_mem_used_bytes,
                                     total_mem_capacity_bytes),
                per_device={
                    DeviceId('root'):
                    Measurement(total_mem_used_bytes, total_mem_capacity_bytes)
                },
            ),
            NodeMeasurement(
                MetricKey('disk'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                per_node=Measurement(total_disk_usage, total_disk_capacity),
                per_device=per_disk_stat,
            ),
            NodeMeasurement(
                MetricKey('net_rx'),
                MetricTypes.RATE,
                unit_hint='bps',
                current_hook=lambda metric: metric.stats.rate,
                per_node=Measurement(Decimal(net_rx_bytes)),
                per_device={
                    DeviceId('node'): Measurement(Decimal(net_rx_bytes))
                },
            ),
            NodeMeasurement(
                MetricKey('net_tx'),
                MetricTypes.RATE,
                unit_hint='bps',
                current_hook=lambda metric: metric.stats.rate,
                per_node=Measurement(Decimal(net_tx_bytes)),
                per_device={
                    DeviceId('node'): Measurement(Decimal(net_tx_bytes))
                },
            ),
        ]

    async def gather_container_measures(self, ctx: StatContext, container_ids: Sequence[str]) \
            -> Sequence[ContainerMeasurement]:
        def get_scratch_size(container_id: str) -> int:
            for kernel_id, info in ctx.agent.kernel_registry.items():
                if info['container_id'] == container_id:
                    break
            else:
                return 0
            work_dir = ctx.agent.local_config['container'][
                'scratch-root'] / str(kernel_id) / 'work'
            total_size = 0
            for path in work_dir.rglob('*'):
                if path.is_symlink():
                    total_size += path.lstat().st_size
                elif path.is_file():
                    total_size += path.stat().st_size
            return total_size

        async def sysfs_impl(container_id):
            mem_prefix = f'/sys/fs/cgroup/memory/docker/{container_id}/'
            io_prefix = f'/sys/fs/cgroup/blkio/docker/{container_id}/'
            try:
                mem_cur_bytes = read_sysfs(
                    mem_prefix + 'memory.usage_in_bytes', int)
                io_stats = Path(io_prefix +
                                'blkio.throttle.io_service_bytes').read_text()
                # example data:
                #   8:0 Read 13918208
                #   8:0 Write 0
                #   8:0 Sync 0
                #   8:0 Async 13918208
                #   8:0 Total 13918208
                #   Total 13918208
                io_read_bytes = 0
                io_write_bytes = 0
                for line in io_stats.splitlines():
                    if line.startswith('Total '):
                        continue
                    dev, op, nbytes = line.strip().split()
                    if op == 'Read':
                        io_read_bytes += int(nbytes)
                    elif op == 'Write':
                        io_write_bytes += int(nbytes)
            except IOError as e:
                log.warning(
                    'cannot read stats: sysfs unreadable for container {0}\n{1!r}',
                    container_id[:7], e)
                return None
            loop = current_loop()
            scratch_sz = await loop.run_in_executor(None, get_scratch_size,
                                                    container_id)
            return mem_cur_bytes, io_read_bytes, io_write_bytes, scratch_sz

        async def api_impl(container_id):
            container = DockerContainer(ctx.agent.docker, id=container_id)
            ret = await fetch_api_stats(container)
            if ret is None:
                return None
            mem_cur_bytes = nmget(ret, 'memory_stats.usage', 0)
            io_read_bytes = 0
            io_write_bytes = 0
            for item in nmget(ret, 'blkio_stats.io_service_bytes_recursive',
                              []):
                if item['op'] == 'Read':
                    io_read_bytes += item['value']
                elif item['op'] == 'Write':
                    io_write_bytes += item['value']
            loop = current_loop()
            scratch_sz = await loop.run_in_executor(None, get_scratch_size,
                                                    container_id)
            return mem_cur_bytes, io_read_bytes, io_write_bytes, scratch_sz

        if ctx.mode == StatModes.CGROUP:
            impl = sysfs_impl
        elif ctx.mode == StatModes.DOCKER:
            impl = api_impl
        else:
            raise RuntimeError("should not reach here")

        per_container_mem_used_bytes = {}
        per_container_io_read_bytes = {}
        per_container_io_write_bytes = {}
        per_container_io_scratch_size = {}
        tasks = []
        for cid in container_ids:
            tasks.append(asyncio.ensure_future(impl(cid)))
        results = await asyncio.gather(*tasks)
        for cid, result in zip(container_ids, results):
            if result is None:
                continue
            per_container_mem_used_bytes[cid] = Measurement(Decimal(result[0]))
            per_container_io_read_bytes[cid] = Measurement(Decimal(result[1]))
            per_container_io_write_bytes[cid] = Measurement(Decimal(result[2]))
            per_container_io_scratch_size[cid] = Measurement(Decimal(
                result[3]))
        return [
            ContainerMeasurement(
                MetricKey('mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_container=per_container_mem_used_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_read'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'rate'}),
                per_container=per_container_io_read_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_write'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'rate'}),
                per_container=per_container_io_write_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_scratch_size'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_container=per_container_io_scratch_size,
            ),
        ]

    async def create_alloc_map(self) -> AbstractAllocMap:
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(device_slots={
            dev.device_id: DeviceSlotInfo(SlotTypes.BYTES, SlotName('mem'),
                                          Decimal(dev.memory_size))
            for dev in devices
        })

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        return []

    async def generate_docker_args(
        self,
        docker: Docker,
        device_alloc,
    ) -> Mapping[str, Any]:
        memory = sum(device_alloc['mem'].values())
        return {
            'HostConfig': {
                'MemorySwap': int(memory),  # prevent using swap!
                'Memory': int(memory),
            }
        }

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        assert isinstance(alloc_map, DiscretePropertyAllocMap)
        memory_limit = container.backend_obj['HostConfig']['Memory']
        alloc_map.apply_allocation({
            SlotName('mem'): {
                DeviceId('root'): memory_limit
            },
        })

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        device_ids = [*device_alloc[SlotName('mem')].keys()]
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                attached_devices.append({
                    'device_id': device.device_id,
                    'model_name': '',
                    'data': {},
                })
        return attached_devices
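
As a quick illustration of generate_docker_args() above, a hypothetical 4 GiB allocation on the single 'root' memory device yields:

# Hypothetical allocation: 4 GiB on the 'root' memory device.
device_alloc = {SlotName('mem'): {DeviceId('root'): Decimal(4 * 1024**3)}}
# generate_docker_args() returns:
# {'HostConfig': {'MemorySwap': 4294967296, 'Memory': 4294967296}}
# Setting MemorySwap equal to Memory disables swap inside the container.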
Example #6
class CPUPlugin(AbstractComputePlugin):
    """
    Represents the CPU.
    """

    config_watch_enabled = False

    key = DeviceName('cpu')
    slot_types = [(SlotName('cpu'), SlotTypes.COUNT)]

    async def init(self, context: Any = None) -> None:
        pass

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
            self, new_plugin_config: Mapping[str, Any]) -> None:
        pass

    async def list_devices(self) -> Collection[CPUDevice]:
        cores = await libnuma.get_available_cores()
        overcommit_factor = int(
            os.environ.get('BACKEND_CPU_OVERCOMMIT_FACTOR', '1'))
        assert 1 <= overcommit_factor <= 4
        return [
            CPUDevice(
                device_id=DeviceId(str(core_idx)),
                hw_location='root',
                numa_node=libnuma.node_of_cpu(core_idx),
                memory_size=0,
                processing_units=1 * overcommit_factor,
            ) for core_idx in sorted(cores)
        ]

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        devices = await self.list_devices()
        return {
            SlotName('cpu'):
            Decimal(sum(dev.processing_units for dev in devices)),
        }

    def get_version(self) -> str:
        return __version__

    async def extra_info(self) -> Mapping[str, str]:
        return {
            'agent_version': __version__,
            'machine': platform.machine(),
            'os_type': platform.system(),
        }

    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        _cstat = psutil.cpu_times(percpu=True)
        q = Decimal('0.000')
        total_cpu_used = cast(
            Decimal,
            sum((Decimal(c.user + c.system) * 1000).quantize(q)
                for c in _cstat))
        now, raw_interval = ctx.update_timestamp('cpu-node')
        interval = Decimal(raw_interval * 1000).quantize(q)

        return [
            NodeMeasurement(
                MetricKey('cpu_util'),
                MetricTypes.UTILIZATION,
                unit_hint='msec',
                current_hook=lambda metric: metric.stats.diff,
                per_node=Measurement(total_cpu_used, interval),
                per_device={
                    DeviceId(str(idx)): Measurement(
                        (Decimal(c.user + c.system) * 1000).quantize(q),
                        interval,
                    )
                    for idx, c in enumerate(_cstat)
                },
            ),
        ]

    async def gather_container_measures(
        self,
        ctx: StatContext,
        container_ids: Sequence[str],
    ) -> Sequence[ContainerMeasurement]:
        async def sysfs_impl(container_id):
            cpu_prefix = f'/sys/fs/cgroup/cpuacct/docker/{container_id}/'
            try:
                cpu_used = read_sysfs(cpu_prefix + 'cpuacct.usage', int) / 1e6
            except IOError as e:
                log.warning(
                    'cannot read stats: sysfs unreadable for container {0}\n{1!r}',
                    container_id[:7], e)
                return None
            return cpu_used

        async def api_impl(container_id):
            container = DockerContainer(ctx.agent.docker, id=container_id)
            ret = await fetch_api_stats(container)
            if ret is None:
                return None
            cpu_used = nmget(ret, 'cpu_stats.cpu_usage.total_usage', 0) / 1e6
            return cpu_used

        if ctx.mode == StatModes.CGROUP:
            impl = sysfs_impl
        elif ctx.mode == StatModes.DOCKER:
            impl = api_impl
        else:
            raise RuntimeError("should not reach here")

        q = Decimal('0.000')
        per_container_cpu_used = {}
        tasks = []
        for cid in container_ids:
            tasks.append(asyncio.ensure_future(impl(cid)))
        results = await asyncio.gather(*tasks)
        for cid, cpu_used in zip(container_ids, results):
            if cpu_used is None:
                continue
            per_container_cpu_used[cid] = Measurement(
                Decimal(cpu_used).quantize(q))
        return [
            ContainerMeasurement(
                MetricKey('cpu_util'),
                MetricTypes.UTILIZATION,
                unit_hint='percent',
                current_hook=lambda metric: metric.stats.diff,
                stats_filter=frozenset({'avg', 'max'}),
                per_container=per_container_cpu_used,
            ),
            ContainerMeasurement(
                MetricKey('cpu_used'),
                MetricTypes.USAGE,
                unit_hint='msec',
                per_container=per_container_cpu_used.copy(),
            ),
        ]

    async def create_alloc_map(self) -> AbstractAllocMap:
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(device_slots={
            dev.device_id: DeviceSlotInfo(SlotTypes.COUNT, SlotName('cpu'),
                                          Decimal(dev.processing_units))
            for dev in devices
        })

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        # TODO: move the sysconf hook in libbaihook.so here
        return []

    async def generate_docker_args(
        self,
        docker: Docker,
        device_alloc,
    ) -> Mapping[str, Any]:
        cores = [*map(int, device_alloc['cpu'].keys())]
        sorted_core_ids = [*map(str, sorted(cores))]
        return {
            'HostConfig': {
                'CpuPeriod': 100_000,  # docker default
                'CpuQuota': int(100_000 * len(cores)),
                'Cpus': ','.join(sorted_core_ids),
                'CpusetCpus': ','.join(sorted_core_ids),
                # 'CpusetMems': f'{resource_spec.numa_node}',
            }
        }

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        assert isinstance(alloc_map, DiscretePropertyAllocMap)
        # Docker does not return the original cpuset.... :(
        # We need to read our own records.
        resource_spec = await get_resource_spec_from_container(
            container.backend_obj)
        if resource_spec is None:
            return
        alloc_map.apply_allocation({
            SlotName('cpu'):
            resource_spec.allocations[DeviceName('cpu')][SlotName('cpu')],
        })

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        device_ids = [*device_alloc[SlotName('cpu')].keys()]
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                attached_devices.append({
                    'device_id': device.device_id,
                    'model_name': '',
                    'data': {
                        'cores': len(device_ids)
                    },
                })
        return attached_devices
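
Similarly, for CPUPlugin.generate_docker_args() above, a hypothetical allocation of cores 0 and 2 pins the container's cpuset and scales its CFS quota with the core count:

# Hypothetical allocation: CPU cores 0 and 2.
device_alloc = {SlotName('cpu'): {DeviceId('0'): Decimal(1),
                                  DeviceId('2'): Decimal(1)}}
# generate_docker_args() returns:
# {'HostConfig': {'CpuPeriod': 100000, 'CpuQuota': 200000,
#                 'Cpus': '0,2', 'CpusetCpus': '0,2'}}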
Example #7
class AbstractComputePlugin(AbstractPlugin, metaclass=ABCMeta):

    key: DeviceName = DeviceName('accelerator')
    slot_types: Sequence[Tuple[SlotName, SlotTypes]]
    exclusive_slot_types: Set[str]

    @abstractmethod
    async def list_devices(self) -> Collection[AbstractComputeDevice]:
        """
        Return the list of accelerator devices, as read as physically
        on the host.
        """
        raise NotImplementedError

    @abstractmethod
    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        """
        Return available slot amounts for each slot key.
        """
        raise NotImplementedError

    @abstractmethod
    def get_version(self) -> str:
        """
        Return the version string of the plugin.
        """
        raise NotImplementedError

    @abstractmethod
    async def extra_info(self) -> Mapping[str, str]:
        """
        Return extra information related to this plugin,
        such as the underlying driver version and feature flags.
        """
        return {}

    @abstractmethod
    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        """
        Return the system-level and device-level statistic metrics.

        It may return any number of metrics, using different statistic key
        names in the returned mapping.
        Note that the keys must not conflict with those of other accelerator
        plugins and must not contain dots.
        """
        raise NotImplementedError

    @abstractmethod
    async def gather_container_measures(
        self,
        ctx: StatContext,
        container_ids: Sequence[str],
    ) -> Sequence[ContainerMeasurement]:
        """
        Return the container-level statistic metrics.
        """
        raise NotImplementedError

    @abstractmethod
    async def create_alloc_map(self) -> 'AbstractAllocMap':
        """
        Create and return an allocation map for this plugin.
        """
        raise NotImplementedError

    @abstractmethod
    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        """
        Return the library hook paths used by the plugin (optional).

        :param str distro: The target Linux distribution such as "ubuntu16.04" or
                           "alpine3.8"
        :param str arch: The target CPU architecture such as "amd64"
        """
        return []

    @abstractmethod
    async def generate_docker_args(
        self,
        docker: aiodocker.docker.Docker,
        device_alloc,
    ) -> Mapping[str, Any]:
        """
        When starting a new container, generate device-specific options for the
        docker container create API as a dictionary, referring the given allocation
        map.  The agent will merge it with its own options.
        """
        return {}

    async def generate_resource_data(self, device_alloc) -> Mapping[str, str]:
        """
        Generate extra resource.txt key-value pair sets to be used by the plugin's
        own hook libraries in containers.
        """
        return {}

    @abstractmethod
    async def restore_from_container(
        self,
        container: SessionContainer,
        alloc_map: AbstractAllocMap,
    ) -> None:
        """
        When the agent restarts, retore the allocation map from the container
        metadata dictionary fetched from aiodocker.
        """
        pass

    @abstractmethod
    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        """
        Make up container-attached device information with allocated device id.
        """
        return []

    async def get_node_hwinfo(self) -> HardwareMetadata:
        raise NotImplementedError
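
To tie the interface together, here is a minimal, hypothetical plugin skeleton (not part of Backend.AI) that satisfies the abstract methods above with the smallest possible behavior; a real plugin would probe actual hardware in list_devices():

class DummyPlugin(AbstractComputePlugin):
    """A sketch of a single-slot accelerator plugin; all names are made up."""

    config_watch_enabled = False

    key = DeviceName('dummy')
    slot_types = [(SlotName('dummy.device'), SlotTypes.COUNT)]

    async def init(self, context: Any = None) -> None:
        pass

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
            self, new_plugin_config: Mapping[str, Any]) -> None:
        pass

    async def list_devices(self) -> Collection[AbstractComputeDevice]:
        return []  # a real plugin would enumerate hardware here

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        devices = await self.list_devices()
        return {SlotName('dummy.device'): Decimal(len(devices))}

    def get_version(self) -> str:
        return '0.0.1'

    async def extra_info(self) -> Mapping[str, str]:
        return {}

    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        return []

    async def gather_container_measures(
            self, ctx: StatContext,
            container_ids: Sequence[str]) -> Sequence[ContainerMeasurement]:
        return []

    async def create_alloc_map(self) -> AbstractAllocMap:
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(device_slots={
            dev.device_id: DeviceSlotInfo(
                SlotTypes.COUNT, SlotName('dummy.device'), Decimal(1))
            for dev in devices
        })

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        return []

    async def generate_docker_args(
            self, docker, device_alloc) -> Mapping[str, Any]:
        return {}

    async def restore_from_container(
            self, container, alloc_map: AbstractAllocMap) -> None:
        pass

    async def get_attached_devices(
            self, device_alloc) -> Sequence[DeviceModelInfo]:
        return []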