@classmethod
def read_from_string(cls, text: str) -> 'KernelResourceSpec':
    kvpairs = {}
    for line in text.split('\n'):
        if '=' not in line:
            continue
        key, val = line.strip().split('=', maxsplit=1)
        kvpairs[key] = val
    allocations = cast(
        MutableMapping[
            DeviceName,
            MutableMapping[SlotName, Mapping[DeviceId, Decimal]],
        ],
        defaultdict(lambda: defaultdict(Decimal)),
    )
    for key, val in kvpairs.items():
        if key.endswith('_SHARES'):
            slot_name = SlotName(key[:-7].lower())
            device_name = DeviceName(slot_name.split('.')[0])
            per_device_alloc: MutableMapping[DeviceId, Decimal] = {}
            for entry in val.split(','):
                raw_dev_id, _, raw_alloc = entry.partition(':')
                if not raw_dev_id or not raw_alloc:
                    continue
                dev_id = DeviceId(raw_dev_id)
                try:
                    if known_slot_types.get(slot_name, 'count') == 'bytes':
                        alloc = Decimal(BinarySize.from_str(raw_alloc))
                    else:
                        alloc = Decimal(raw_alloc)
                except KeyError as e:
                    log.warning(
                        'A previously launched container has '
                        'unknown slot type: {}. Ignoring it.',
                        e.args[0])
                    continue
                per_device_alloc[dev_id] = alloc
            allocations[device_name][slot_name] = per_device_alloc
    mounts = [Mount.from_str(m) for m in kvpairs['MOUNTS'].split(',') if m]
    return cls(
        container_id=kvpairs.get('CID', 'unknown'),
        scratch_disk_size=BinarySize.finite_from_str(kvpairs['SCRATCH_SIZE']),
        allocations=dict(allocations),
        slots=ResourceSlot(json.loads(kvpairs['SLOTS'])),
        mounts=mounts,
    )
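# A minimal sketch of the on-disk resource.txt format that read_from_string()
# parses.  All values below (container ID, sizes, share maps) are made-up
# examples for illustration only.
_sample_resource_txt = '''\
CID=abcdef012345
SCRATCH_SIZE=1g
SLOTS={"cpu": "2", "cuda.device": "1"}
CPU_SHARES=0:1,1:1
CUDA.DEVICE_SHARES=0:1
MOUNTS=
'''
# Keys ending with `_SHARES` become per-device allocation maps, e.g.:
#   spec = KernelResourceSpec.read_from_string(_sample_resource_txt)
#   spec.allocations[DeviceName('cpu')][SlotName('cpu')]
#   # -> {DeviceId('0'): Decimal('1'), DeviceId('1'): Decimal('1')}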
class CUDAPlugin(AbstractComputePlugin):

    config_watch_enabled = False

    key = DeviceName('cuda')
    slot_types: Sequence[Tuple[SlotName, SlotTypes]] = (
        (SlotName('cuda.device'), SlotTypes('count')),
    )

    nvdocker_version: Tuple[int, ...] = (0, 0, 0)
    docker_version: Tuple[int, ...] = (0, 0, 0)

    device_mask: Sequence[DeviceId] = []
    enabled: bool = True

    async def init(self, context: Any = None) -> None:
        rx_triple_version = re.compile(r'(\d+\.\d+\.\d+)')
        # Check nvidia-docker and docker versions.
        try:
            proc = await asyncio.create_subprocess_exec(
                'nvidia-docker', 'version', '-f', '{{json .}}',
                stdout=asyncio.subprocess.PIPE,
            )
            stdout, _ = await proc.communicate()
            lines = stdout.decode().splitlines()
        except FileNotFoundError:
            log.warning('nvidia-docker is not installed.')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        m = rx_triple_version.search(lines[0])
        if m:
            self.nvdocker_version = tuple(map(int, m.group(1).split('.')))
        else:
            log.error('could not detect nvidia-docker version!')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        docker_version_data = json.loads(lines[1])
        m = rx_triple_version.search(docker_version_data['Server']['Version'])
        if m:
            self.docker_version = tuple(map(int, m.group(1).split('.')))
        else:
            log.error('could not detect docker version!')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        raw_device_mask = self.plugin_config.get('device_mask')
        if raw_device_mask is not None:
            self.device_mask = [
                *map(lambda dev_id: DeviceId(dev_id), raw_device_mask.split(','))
            ]
        try:
            detected_devices = await self.list_devices()
            log.info('detected devices:\n' + pformat(detected_devices))
            log.info('nvidia-docker version: {}', self.nvdocker_version)
            log.info('CUDA acceleration is enabled.')
        except ImportError:
            log.warning('could not load the CUDA runtime library.')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
        except RuntimeError as e:
            log.warning('CUDA init error: {}', e)
            log.info('CUDA acceleration is disabled.')
            self.enabled = False

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
        self,
        new_plugin_config: Mapping[str, Any],
    ) -> None:
        pass

    async def list_devices(self) -> Collection[CUDADevice]:
        if not self.enabled:
            return []
        all_devices = []
        num_devices = libcudart.get_device_count()
        for dev_id in map(lambda idx: DeviceId(str(idx)), range(num_devices)):
            if dev_id in self.device_mask:
                continue
            raw_info = libcudart.get_device_props(int(dev_id))
            sysfs_node_path = (
                "/sys/bus/pci/devices/"
                f"{raw_info['pciBusID_str'].lower()}/numa_node"
            )
            node: Optional[int]
            try:
                node = int(Path(sysfs_node_path).read_text().strip())
            except OSError:
                node = None
            raw_dev_uuid = raw_info.get('uuid', None)
            if raw_dev_uuid is not None:
                dev_uuid = str(uuid.UUID(bytes=raw_dev_uuid))
            else:
                dev_uuid = '00000000-0000-0000-0000-000000000000'
            dev_info = CUDADevice(
                device_id=dev_id,
                hw_location=raw_info['pciBusID_str'],
                numa_node=node,
                memory_size=raw_info['totalGlobalMem'],
                processing_units=raw_info['multiProcessorCount'],
                model_name=raw_info['name'],
                uuid=dev_uuid,
            )
            all_devices.append(dev_info)
        return all_devices

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        devices = await self.list_devices()
        return {
            SlotName('cuda.device'): Decimal(len(devices)),
        }

    def get_version(self) -> str:
        return __version__

    async def extra_info(self) -> Mapping[str, Any]:
        if self.enabled:
            try:
                return {
                    'cuda_support': True,
                    'nvidia_version': libnvml.get_driver_version(),
                    'cuda_version': '{0[0]}.{0[1]}'.format(libcudart.get_version()),
                }
            except ImportError:
                log.warning('extra_info(): NVML/CUDA runtime library is not found')
            except LibraryError as e:
                log.warning('extra_info(): {!r}', e)
        return {
            'cuda_support': False,
        }

    async def gather_node_measures(
        self,
        ctx: StatContext,
    ) -> Sequence[NodeMeasurement]:
        dev_count = 0
        mem_avail_total = 0
        mem_used_total = 0
        mem_stats = {}
        util_total = 0
        util_stats = {}
        if self.enabled:
            try:
                dev_count = libnvml.get_device_count()
                for dev_id in map(lambda idx: DeviceId(str(idx)), range(dev_count)):
                    if dev_id in self.device_mask:
                        continue
                    dev_stat = libnvml.get_device_stats(int(dev_id))
                    mem_avail_total += dev_stat.mem_total
                    mem_used_total += dev_stat.mem_used
                    mem_stats[dev_id] = Measurement(
                        Decimal(dev_stat.mem_used), Decimal(dev_stat.mem_total))
                    util_total += dev_stat.gpu_util
                    util_stats[dev_id] = Measurement(
                        Decimal(dev_stat.gpu_util), Decimal(100))
            except ImportError:
                log.warning('gather_node_measure(): NVML library is not found')
            except LibraryError as e:
                log.warning('gather_node_measure(): {!r}', e)
        return [
            NodeMeasurement(
                MetricKey('cuda_mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_node=Measurement(
                    Decimal(mem_used_total), Decimal(mem_avail_total)),
                per_device=mem_stats,
            ),
            NodeMeasurement(
                MetricKey('cuda_util'),
                MetricTypes.USAGE,
                unit_hint='percent',
                stats_filter=frozenset({'avg', 'max'}),
                per_node=Measurement(
                    Decimal(util_total), Decimal(dev_count * 100)),
                per_device=util_stats,
            ),
        ]

    async def gather_container_measures(
        self,
        ctx: StatContext,
        container_ids: Sequence[str],
    ) -> Sequence[ContainerMeasurement]:
        return []

    async def create_alloc_map(self) -> AbstractAllocMap:
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(
            device_slots={
                dev.device_id: DeviceSlotInfo(
                    SlotTypes.COUNT, SlotName('cuda.device'), Decimal(1))
                for dev in devices
            },
        )

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        return []

    async def generate_docker_args(
        self,
        docker: aiodocker.Docker,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Mapping[str, Any]:
        if not self.enabled:
            return {}
        assigned_device_ids = []
        for slot_type, per_device_alloc in device_alloc.items():
            for device_id, alloc in per_device_alloc.items():
                if alloc > 0:
                    assigned_device_ids.append(device_id)
        if self.nvdocker_version[0] == 1:
            timeout = aiohttp.ClientTimeout(total=3)
            async with aiohttp.ClientSession(raise_for_status=True,
                                             timeout=timeout) as sess:
                try:
                    nvdocker_url = 'http://localhost:3476/docker/cli/json'
                    async with sess.get(nvdocker_url) as resp:
                        nvidia_params = await resp.json()
                except aiohttp.ClientError:
                    raise RuntimeError('NVIDIA Docker plugin is not available.')
            volumes = await docker.volumes.list()
            existing_volumes = set(vol['Name'] for vol in volumes['Volumes'])
            required_volumes = set(vol.split(':')[0]
                                   for vol in nvidia_params['Volumes'])
            missing_volumes = required_volumes - existing_volumes
            binds = []
            for vol_name in missing_volumes:
                for vol_param in nvidia_params['Volumes']:
                    if vol_param.startswith(vol_name + ':'):
                        _, _, permission = vol_param.split(':')
                        driver = nvidia_params['VolumeDriver']
                        await docker.volumes.create({
                            'Name': vol_name,
                            'Driver': driver,
                        })
            for vol_name in required_volumes:
                for vol_param in nvidia_params['Volumes']:
                    if vol_param.startswith(vol_name + ':'):
                        _, mount_pt, permission = vol_param.split(':')
                        binds.append('{}:{}:{}'.format(
                            vol_name, mount_pt, permission))
            devices = []
            for dev in nvidia_params['Devices']:
                m = re.search(r'^/dev/nvidia(\d+)$', dev)
                if m is None:
                    # Always add non-GPU device files required by the driver.
                    # (e.g., nvidiactl, nvidia-uvm, ... etc.)
                    devices.append(dev)
                    continue
                device_id = m.group(1)
                if device_id not in assigned_device_ids:
                    continue
                devices.append(dev)
            devices = [{
                'PathOnHost': dev,
                'PathInContainer': dev,
                'CgroupPermissions': 'mrw',
            } for dev in devices]
            return {
                'HostConfig': {
                    'Binds': binds,
                    'Devices': devices,
                },
            }
        elif self.nvdocker_version[0] == 2:
            device_list_str = ','.join(sorted(assigned_device_ids))
            if self.docker_version >= (19, 3, 0):
                docker_config: Dict[str, Any] = {}
                if assigned_device_ids:
                    docker_config.update({
                        'HostConfig': {
                            'DeviceRequests': [
                                {
                                    'Driver': 'nvidia',
                                    'DeviceIDs': assigned_device_ids,
                                    # "all" does not work here
                                    'Capabilities': [
                                        ['utility', 'compute', 'video',
                                         'graphics', 'display'],
                                    ],
                                },
                            ],
                        },
                    })
                return docker_config
            else:
                return {
                    'HostConfig': {
                        'Runtime': 'nvidia',
                    },
                    'Env': [
                        f"NVIDIA_VISIBLE_DEVICES={device_list_str}",
                    ],
                }
        else:
            raise RuntimeError('BUG: should not be reached here!')

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        device_ids: List[DeviceId] = []
        # NOTE: the slot name must match the declared slot_types
        # ('cuda.device', not 'cuda.devices').
        if SlotName('cuda.device') in device_alloc:
            device_ids.extend(device_alloc[SlotName('cuda.device')].keys())
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                proc = device.processing_units
                mem = BinarySize(device.memory_size)
                attached_devices.append({
                    # TODO: update common.types.DeviceModelInfo
                    'device_id': device.device_id,
                    'model_name': device.model_name,
                    'smp': proc,
                    'mem': mem,
                })
        return attached_devices

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        if not self.enabled:
            return
        resource_spec = await get_resource_spec_from_container(container.backend_obj)
        if resource_spec is None:
            return
        if hasattr(alloc_map, 'apply_allocation'):
            alloc_map.apply_allocation({
                SlotName('cuda.device'):
                    resource_spec.allocations
                    .get(DeviceName('cuda'), {})
                    .get(SlotName('cuda.device'), {}),
            })
        else:
            alloc_map.allocations[SlotName('cuda.device')].update(
                resource_spec.allocations
                .get(DeviceName('cuda'), {})
                .get(SlotName('cuda.device'), {})
            )

    async def generate_resource_data(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Mapping[str, str]:
        data: MutableMapping[str, str] = {}
        if not self.enabled:
            return data
        active_device_id_set: Set[DeviceId] = set()
        for slot_type, per_device_alloc in device_alloc.items():
            for dev_id, alloc in per_device_alloc.items():
                if alloc > 0:
                    active_device_id_set.add(dev_id)
        active_device_ids = sorted(active_device_id_set, key=lambda v: int(v))
        data['CUDA_GLOBAL_DEVICE_IDS'] = ','.join(
            f'{local_idx}:{global_id}'
            for local_idx, global_id in enumerate(active_device_ids))
        return data
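# A quick usage sketch (device IDs are illustrative): for nvidia-docker v2 on
# Docker >= 19.03, generate_docker_args() emits a DeviceRequests entry instead
# of the legacy NVIDIA_VISIBLE_DEVICES environment variable.
#
#   await plugin.generate_docker_args(docker, {
#       SlotName('cuda.device'): {DeviceId('0'): Decimal(1)},
#   })
#   # -> {'HostConfig': {'DeviceRequests': [{
#   #         'Driver': 'nvidia',
#   #         'DeviceIDs': ['0'],
#   #         'Capabilities': [['utility', 'compute', 'video',
#   #                           'graphics', 'display']],
#   #     }]}}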
class MemoryPlugin(AbstractComputePlugin):
    """
    Represents the main memory.

    When collecting statistics, it also measures network and I/O usage
    in addition to the memory usage.
    """

    config_watch_enabled = False

    key = DeviceName('mem')
    slot_types = [(SlotName('mem'), SlotTypes.BYTES)]

    async def init(self, context: Any = None) -> None:
        pass

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
            self, new_plugin_config: Mapping[str, Any]) -> None:
        pass

    async def list_devices(self) -> Collection[MemoryDevice]:
        # TODO: support NUMA?
        memory_size = psutil.virtual_memory().total
        return [
            MemoryDevice(
                device_id=DeviceId('root'),
                hw_location='root',
                numa_node=0,
                memory_size=memory_size,
                processing_units=0,
            ),
        ]

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        devices = await self.list_devices()
        return {
            SlotName('mem'): Decimal(sum(dev.memory_size for dev in devices)),
        }

    def get_version(self) -> str:
        return __version__

    async def extra_info(self) -> Mapping[str, str]:
        return {}

    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        _mstat = psutil.virtual_memory()
        total_mem_used_bytes = Decimal(_mstat.total - _mstat.available)
        total_mem_capacity_bytes = Decimal(_mstat.total)
        _nstat = psutil.net_io_counters()
        net_rx_bytes = _nstat.bytes_recv
        net_tx_bytes = _nstat.bytes_sent

        def get_disk_stat():
            pruned_disk_types = frozenset(['squashfs', 'vfat', 'tmpfs'])
            total_disk_usage = Decimal(0)
            total_disk_capacity = Decimal(0)
            per_disk_stat = {}
            for disk_info in psutil.disk_partitions():
                if disk_info.fstype not in pruned_disk_types:
                    dstat = os.statvfs(disk_info.mountpoint)
                    disk_usage = Decimal(
                        dstat.f_frsize * (dstat.f_blocks - dstat.f_bavail))
                    disk_capacity = Decimal(dstat.f_frsize * dstat.f_blocks)
                    per_disk_stat[disk_info.device] = Measurement(
                        disk_usage, disk_capacity)
                    total_disk_usage += disk_usage
                    total_disk_capacity += disk_capacity
            return total_disk_usage, total_disk_capacity, per_disk_stat

        loop = current_loop()
        total_disk_usage, total_disk_capacity, per_disk_stat = \
            await loop.run_in_executor(None, get_disk_stat)
        return [
            NodeMeasurement(
                MetricKey('mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_node=Measurement(total_mem_used_bytes,
                                     total_mem_capacity_bytes),
                per_device={
                    DeviceId('root'): Measurement(total_mem_used_bytes,
                                                  total_mem_capacity_bytes),
                },
            ),
            NodeMeasurement(
                MetricKey('disk'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                per_node=Measurement(total_disk_usage, total_disk_capacity),
                per_device=per_disk_stat,
            ),
            NodeMeasurement(
                MetricKey('net_rx'),
                MetricTypes.RATE,
                unit_hint='bps',
                current_hook=lambda metric: metric.stats.rate,
                per_node=Measurement(Decimal(net_rx_bytes)),
                per_device={
                    DeviceId('node'): Measurement(Decimal(net_rx_bytes)),
                },
            ),
            NodeMeasurement(
                MetricKey('net_tx'),
                MetricTypes.RATE,
                unit_hint='bps',
                current_hook=lambda metric: metric.stats.rate,
                per_node=Measurement(Decimal(net_tx_bytes)),
                per_device={
                    DeviceId('node'): Measurement(Decimal(net_tx_bytes)),
                },
            ),
        ]

    async def gather_container_measures(
        self,
        ctx: StatContext,
        container_ids: Sequence[str],
    ) -> Sequence[ContainerMeasurement]:

        def get_scratch_size(container_id: str) -> int:
            for kernel_id, info in ctx.agent.kernel_registry.items():
                if info['container_id'] == container_id:
                    break
            else:
                return 0
            work_dir = (ctx.agent.local_config['container']['scratch-root']
                        / str(kernel_id) / 'work')
            total_size = 0
            for path in work_dir.rglob('*'):
                if path.is_symlink():
                    total_size += path.lstat().st_size
                elif path.is_file():
                    total_size += path.stat().st_size
            return total_size

        async def sysfs_impl(container_id):
            mem_prefix = f'/sys/fs/cgroup/memory/docker/{container_id}/'
            io_prefix = f'/sys/fs/cgroup/blkio/docker/{container_id}/'
            try:
                mem_cur_bytes = read_sysfs(
                    mem_prefix + 'memory.usage_in_bytes', int)
                io_stats = Path(
                    io_prefix + 'blkio.throttle.io_service_bytes').read_text()
                # example data:
                #   8:0 Read 13918208
                #   8:0 Write 0
                #   8:0 Sync 0
                #   8:0 Async 13918208
                #   8:0 Total 13918208
                #   Total 13918208
                io_read_bytes = 0
                io_write_bytes = 0
                for line in io_stats.splitlines():
                    if line.startswith('Total '):
                        continue
                    dev, op, nbytes = line.strip().split()
                    if op == 'Read':
                        io_read_bytes += int(nbytes)
                    elif op == 'Write':
                        io_write_bytes += int(nbytes)
            except IOError as e:
                log.warning(
                    'cannot read stats: sysfs unreadable for container {0}\n{1!r}',
                    container_id[:7], e)
                return None
            loop = current_loop()
            scratch_sz = await loop.run_in_executor(
                None, get_scratch_size, container_id)
            return mem_cur_bytes, io_read_bytes, io_write_bytes, scratch_sz

        async def api_impl(container_id):
            container = DockerContainer(ctx.agent.docker, id=container_id)
            ret = await fetch_api_stats(container)
            if ret is None:
                return None
            mem_cur_bytes = nmget(ret, 'memory_stats.usage', 0)
            io_read_bytes = 0
            io_write_bytes = 0
            for item in nmget(ret, 'blkio_stats.io_service_bytes_recursive', []):
                if item['op'] == 'Read':
                    io_read_bytes += item['value']
                elif item['op'] == 'Write':
                    io_write_bytes += item['value']
            loop = current_loop()
            scratch_sz = await loop.run_in_executor(
                None, get_scratch_size, container_id)
            return mem_cur_bytes, io_read_bytes, io_write_bytes, scratch_sz

        if ctx.mode == StatModes.CGROUP:
            impl = sysfs_impl
        elif ctx.mode == StatModes.DOCKER:
            impl = api_impl
        else:
            raise RuntimeError("should not reach here")

        per_container_mem_used_bytes = {}
        per_container_io_read_bytes = {}
        per_container_io_write_bytes = {}
        per_container_io_scratch_size = {}
        tasks = []
        for cid in container_ids:
            tasks.append(asyncio.ensure_future(impl(cid)))
        results = await asyncio.gather(*tasks)
        for cid, result in zip(container_ids, results):
            if result is None:
                continue
            per_container_mem_used_bytes[cid] = Measurement(Decimal(result[0]))
            per_container_io_read_bytes[cid] = Measurement(Decimal(result[1]))
            per_container_io_write_bytes[cid] = Measurement(Decimal(result[2]))
            per_container_io_scratch_size[cid] = Measurement(Decimal(result[3]))
        return [
            ContainerMeasurement(
                MetricKey('mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_container=per_container_mem_used_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_read'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'rate'}),
                per_container=per_container_io_read_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_write'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'rate'}),
                per_container=per_container_io_write_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_scratch_size'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_container=per_container_io_scratch_size,
            ),
        ]

    async def create_alloc_map(self) -> AbstractAllocMap:
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(
            device_slots={
                dev.device_id: DeviceSlotInfo(
                    SlotTypes.BYTES, SlotName('mem'), Decimal(dev.memory_size))
                for dev in devices
            },
        )

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        return []

    async def generate_docker_args(
        self,
        docker: Docker,
        device_alloc,
    ) -> Mapping[str, Any]:
        memory = sum(device_alloc['mem'].values())
        return {
            'HostConfig': {
                'MemorySwap': int(memory),  # prevent using swap!
                'Memory': int(memory),
            },
        }

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        assert isinstance(alloc_map, DiscretePropertyAllocMap)
        memory_limit = container.backend_obj['HostConfig']['Memory']
        alloc_map.apply_allocation({
            SlotName('mem'): {DeviceId('root'): memory_limit},
        })

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        device_ids = [*device_alloc[SlotName('mem')].keys()]
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                attached_devices.append({
                    'device_id': device.device_id,
                    'model_name': '',
                    'data': {},
                })
        return attached_devices
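# Usage sketch (sizes are illustrative): allocating 4 GiB on the 'root' memory
# device produces equal Memory and MemorySwap limits, so the swap allowance
# (MemorySwap - Memory) is zero and the container cannot swap.
#
#   await plugin.generate_docker_args(docker, {
#       'mem': {DeviceId('root'): Decimal(4 * 2**30)},
#   })
#   # -> {'HostConfig': {'MemorySwap': 4294967296, 'Memory': 4294967296}}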
class CPUPlugin(AbstractComputePlugin):
    """
    Represents the CPU.
    """

    config_watch_enabled = False

    key = DeviceName('cpu')
    slot_types = [(SlotName('cpu'), SlotTypes.COUNT)]

    async def init(self, context: Any = None) -> None:
        pass

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
            self, new_plugin_config: Mapping[str, Any]) -> None:
        pass

    async def list_devices(self) -> Collection[CPUDevice]:
        cores = await libnuma.get_available_cores()
        overcommit_factor = int(
            os.environ.get('BACKEND_CPU_OVERCOMMIT_FACTOR', '1'))
        assert 1 <= overcommit_factor <= 4
        return [
            CPUDevice(
                device_id=DeviceId(str(core_idx)),
                hw_location='root',
                numa_node=libnuma.node_of_cpu(core_idx),
                memory_size=0,
                processing_units=1 * overcommit_factor,
            )
            for core_idx in sorted(cores)
        ]

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        devices = await self.list_devices()
        return {
            SlotName('cpu'): Decimal(
                sum(dev.processing_units for dev in devices)),
        }

    def get_version(self) -> str:
        return __version__

    async def extra_info(self) -> Mapping[str, str]:
        return {
            'agent_version': __version__,
            'machine': platform.machine(),
            'os_type': platform.system(),
        }

    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        _cstat = psutil.cpu_times(True)
        q = Decimal('0.000')
        total_cpu_used = cast(
            Decimal,
            sum((Decimal(c.user + c.system) * 1000).quantize(q)
                for c in _cstat))
        now, raw_interval = ctx.update_timestamp('cpu-node')
        interval = Decimal(raw_interval * 1000).quantize(q)
        return [
            NodeMeasurement(
                MetricKey('cpu_util'),
                MetricTypes.UTILIZATION,
                unit_hint='msec',
                current_hook=lambda metric: metric.stats.diff,
                per_node=Measurement(total_cpu_used, interval),
                per_device={
                    DeviceId(str(idx)): Measurement(
                        (Decimal(c.user + c.system) * 1000).quantize(q),
                        interval,
                    )
                    for idx, c in enumerate(_cstat)
                },
            ),
        ]

    async def gather_container_measures(
        self,
        ctx: StatContext,
        container_ids: Sequence[str],
    ) -> Sequence[ContainerMeasurement]:

        async def sysfs_impl(container_id):
            cpu_prefix = f'/sys/fs/cgroup/cpuacct/docker/{container_id}/'
            try:
                cpu_used = read_sysfs(cpu_prefix + 'cpuacct.usage', int) / 1e6
            except IOError as e:
                log.warning(
                    'cannot read stats: sysfs unreadable for container {0}\n{1!r}',
                    container_id[:7], e)
                return None
            return cpu_used

        async def api_impl(container_id):
            container = DockerContainer(ctx.agent.docker, id=container_id)
            ret = await fetch_api_stats(container)
            if ret is None:
                return None
            cpu_used = nmget(ret, 'cpu_stats.cpu_usage.total_usage', 0) / 1e6
            return cpu_used

        if ctx.mode == StatModes.CGROUP:
            impl = sysfs_impl
        elif ctx.mode == StatModes.DOCKER:
            impl = api_impl
        else:
            raise RuntimeError("should not reach here")

        q = Decimal('0.000')
        per_container_cpu_used = {}
        tasks = []
        for cid in container_ids:
            tasks.append(asyncio.ensure_future(impl(cid)))
        results = await asyncio.gather(*tasks)
        for cid, cpu_used in zip(container_ids, results):
            if cpu_used is None:
                continue
            per_container_cpu_used[cid] = Measurement(
                Decimal(cpu_used).quantize(q))
        return [
            ContainerMeasurement(
                MetricKey('cpu_util'),
                MetricTypes.UTILIZATION,
                unit_hint='percent',
                current_hook=lambda metric: metric.stats.diff,
                stats_filter=frozenset({'avg', 'max'}),
                per_container=per_container_cpu_used,
            ),
            ContainerMeasurement(
                MetricKey('cpu_used'),
                MetricTypes.USAGE,
                unit_hint='msec',
                per_container=per_container_cpu_used.copy(),
            ),
        ]

    async def create_alloc_map(self) -> AbstractAllocMap:
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(
            device_slots={
                dev.device_id: DeviceSlotInfo(
                    SlotTypes.COUNT, SlotName('cpu'),
                    Decimal(dev.processing_units))
                for dev in devices
            },
        )

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        # TODO: move the sysconf hook in libbaihook.so here
        return []

    async def generate_docker_args(
        self,
        docker: Docker,
        device_alloc,
    ) -> Mapping[str, Any]:
        cores = [*map(int, device_alloc['cpu'].keys())]
        sorted_core_ids = [*map(str, sorted(cores))]
        return {
            'HostConfig': {
                'CpuPeriod': 100_000,  # docker default
                'CpuQuota': int(100_000 * len(cores)),
                'Cpus': ','.join(sorted_core_ids),
                'CpusetCpus': ','.join(sorted_core_ids),
                # 'CpusetMems': f'{resource_spec.numa_node}',
            },
        }

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        assert isinstance(alloc_map, DiscretePropertyAllocMap)
        # Docker does not return the original cpuset.... :(
        # We need to read our own records.
        resource_spec = await get_resource_spec_from_container(
            container.backend_obj)
        if resource_spec is None:
            return
        alloc_map.apply_allocation({
            SlotName('cpu'):
                resource_spec.allocations[DeviceName('cpu')][SlotName('cpu')],
        })

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        device_ids = [*device_alloc[SlotName('cpu')].keys()]
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                attached_devices.append({
                    'device_id': device.device_id,
                    'model_name': '',
                    'data': {'cores': len(device_ids)},
                })
        return attached_devices
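# Usage sketch (core IDs are illustrative): allocating cores 0 and 1 pins the
# container to those cores via CpusetCpus and grants a quota of two full cores
# per 100 ms scheduling period.
#
#   await plugin.generate_docker_args(docker, {
#       'cpu': {DeviceId('0'): Decimal(1), DeviceId('1'): Decimal(1)},
#   })
#   # -> {'HostConfig': {'CpuPeriod': 100000, 'CpuQuota': 200000,
#   #                    'Cpus': '0,1', 'CpusetCpus': '0,1'}}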
class AbstractComputePlugin(AbstractPlugin, metaclass=ABCMeta):

    key: DeviceName = DeviceName('accelerator')
    slot_types: Sequence[Tuple[SlotName, SlotTypes]]
    exclusive_slot_types: Set[str]

    @abstractmethod
    async def list_devices(self) -> Collection[AbstractComputeDevice]:
        """
        Return the list of accelerator devices as physically detected on the host.
        """
        raise NotImplementedError

    @abstractmethod
    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        """
        Return the available slot amounts for each slot key.
        """
        raise NotImplementedError

    @abstractmethod
    def get_version(self) -> str:
        """
        Return the version string of the plugin.
        """
        raise NotImplementedError

    @abstractmethod
    async def extra_info(self) -> Mapping[str, str]:
        """
        Return extra information related to this plugin,
        such as the underlying driver version and feature flags.
        """
        return {}

    @abstractmethod
    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        """
        Return the system-level and device-level statistic metrics.

        It may return any number of metrics using different statistics key names
        in the returned map.
        Note that the keys must not conflict with those of other accelerator
        plugins and must not contain dots.
        """
        raise NotImplementedError

    @abstractmethod
    async def gather_container_measures(
        self,
        ctx: StatContext,
        container_ids: Sequence[str],
    ) -> Sequence[ContainerMeasurement]:
        """
        Return the container-level statistic metrics.
        """
        raise NotImplementedError

    @abstractmethod
    async def create_alloc_map(self) -> 'AbstractAllocMap':
        """
        Create and return an allocation map for this plugin.
        """
        raise NotImplementedError

    @abstractmethod
    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        """
        Return the library hook paths used by the plugin (optional).

        :param str distro: The target Linux distribution such as "ubuntu16.04"
                           or "alpine3.8".
        :param str arch: The target CPU architecture such as "amd64".
        """
        return []

    @abstractmethod
    async def generate_docker_args(
        self,
        docker: aiodocker.docker.Docker,
        device_alloc,
    ) -> Mapping[str, Any]:
        """
        When starting a new container, generate device-specific options for the
        docker container-create API as a dictionary, referring to the given
        allocation map.  The agent will merge it with its own options.
        """
        return {}

    async def generate_resource_data(self, device_alloc) -> Mapping[str, str]:
        """
        Generate extra resource.txt key-value pairs to be used by the plugin's
        own hook libraries in containers.
        """
        return {}

    @abstractmethod
    async def restore_from_container(
        self,
        container: SessionContainer,
        alloc_map: AbstractAllocMap,
    ) -> None:
        """
        When the agent restarts, restore the allocation map from the container
        metadata dictionary fetched from aiodocker.
        """
        pass

    @abstractmethod
    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        """
        Make up container-attached device information with the allocated
        device IDs.
        """
        return []

    async def get_node_hwinfo(self) -> HardwareMetadata:
        raise NotImplementedError
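# A minimal sketch of a concrete plugin implementing the interface above.
# The device name ('fpga'), slot name, and version string are hypothetical
# placeholders, and init/cleanup/update_plugin_config are assumed to be
# inherited requirements from AbstractPlugin, as in the concrete plugins above.
class ExampleFPGAPlugin(AbstractComputePlugin):

    key = DeviceName('fpga')
    slot_types = [(SlotName('fpga.device'), SlotTypes.COUNT)]

    async def init(self, context: Any = None) -> None:
        pass

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
            self, new_plugin_config: Mapping[str, Any]) -> None:
        pass

    async def list_devices(self) -> Collection[AbstractComputeDevice]:
        return []  # a real plugin would probe the host here

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        devices = await self.list_devices()
        return {SlotName('fpga.device'): Decimal(len(devices))}

    def get_version(self) -> str:
        return '0.1.0'

    async def extra_info(self) -> Mapping[str, str]:
        return {}

    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        return []

    async def gather_container_measures(
        self, ctx: StatContext, container_ids: Sequence[str],
    ) -> Sequence[ContainerMeasurement]:
        return []

    async def create_alloc_map(self) -> 'AbstractAllocMap':
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(device_slots={
            dev.device_id: DeviceSlotInfo(
                SlotTypes.COUNT, SlotName('fpga.device'), Decimal(1))
            for dev in devices
        })

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        return []

    async def generate_docker_args(
        self,
        docker: aiodocker.docker.Docker,
        device_alloc,
    ) -> Mapping[str, Any]:
        return {}

    async def restore_from_container(
        self,
        container: SessionContainer,
        alloc_map: AbstractAllocMap,
    ) -> None:
        pass

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        return []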