async def api_impl(container_id):
    container = DockerContainer(ctx.agent.docker, id=container_id)
    ret = await fetch_api_stats(container)
    if ret is None:
        return None
    # The Docker stats API reports cumulative CPU time in nanoseconds;
    # divide by 1e6 to convert it to milliseconds.
    cpu_used = nmget(ret, 'cpu_stats.cpu_usage.total_usage', 0) / 1e6
    return cpu_used

async def api_impl(container_id):
    container = DockerContainer(ctx.agent.docker, id=container_id)
    ret = await fetch_api_stats(container)
    if ret is None:
        return None
    mem_cur_bytes = nmget(ret, 'memory_stats.usage', 0)
    # Sum block-I/O bytes across all devices reported by the stats API.
    io_read_bytes = 0
    io_write_bytes = 0
    for item in nmget(ret, 'blkio_stats.io_service_bytes_recursive', []):
        if item['op'] == 'Read':
            io_read_bytes += item['value']
        elif item['op'] == 'Write':
            io_write_bytes += item['value']
    # get_scratch_size is a blocking call, so run it in a thread executor.
    loop = current_loop()
    scratch_sz = await loop.run_in_executor(None, get_scratch_size, container_id)
    return mem_cur_bytes, io_read_bytes, io_write_bytes, scratch_sz

async def _collect_stats_api(container):
    try:
        ret = await container.stats(stream=False)
    except (DockerError, aiohttp.ClientResponseError):
        short_cid = container._id[:7]
        log.warning(
            f'cannot read stats: Docker stats API error for {short_cid}.')
        return None
    else:
        # API returned successfully but actually the result may be empty!
        if ret is None:
            return None
        # A 'preread' of the zero time means Docker has no previous sample yet.
        if ret['preread'].startswith('0001-01-01'):
            return None
        cpu_used = nmget(ret, 'cpu_stats.cpu_usage.total_usage', 0) / 1e6
        mem_max_bytes = nmget(ret, 'memory_stats.max_usage', 0)
        mem_cur_bytes = nmget(ret, 'memory_stats.usage', 0)
        io_read_bytes = 0
        io_write_bytes = 0
        for item in nmget(ret, 'blkio_stats.io_service_bytes_recursive', []):
            if item['op'] == 'Read':
                io_read_bytes += item['value']
            elif item['op'] == 'Write':
                io_write_bytes += item['value']
        io_max_scratch_size = 0
        io_cur_scratch_size = 0
        net_rx_bytes = 0
        net_tx_bytes = 0
        for dev in nmget(ret, 'networks', {}).values():
            net_rx_bytes += dev['rx_bytes']
            net_tx_bytes += dev['tx_bytes']
        return ContainerStat(
            cpu_used,
            mem_max_bytes,
            mem_cur_bytes,
            net_rx_bytes,
            net_tx_bytes,
            io_read_bytes,
            io_write_bytes,
            io_max_scratch_size,
            io_cur_scratch_size,
        )

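
# All of the collectors above pull values out of the nested Docker stats
# mapping via `nmget`. The real helper lives in ai.backend.common.utils; the
# version below is only a minimal sketch consistent with the call sites in
# this section (a delimited key path plus a default returned when any
# intermediate key is missing or None), not the canonical implementation.
from typing import Any, Mapping, Optional


def nmget(
    o: Optional[Mapping[str, Any]],
    key_path: str,
    def_val: Any = None,
    path_delimiter: str = '.',
) -> Any:
    # Walk the nested mapping one path component at a time.
    for key in key_path.split(path_delimiter):
        if not isinstance(o, Mapping) or key not in o:
            return def_val
        o = o[key]
    # Treat an explicit None the same as a missing key.
    return def_val if o is None else o


# Example: nmget(ret, 'cpu_stats.cpu_usage.total_usage', 0) returns 0 when the
# stats payload lacks the CPU section instead of raising KeyError.
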
async def __ainit__(self) -> None:
    # Start serving requests.
    await self.update_status('starting')
    if not self.skip_detect_manager:
        await self.detect_manager()
    await self.read_agent_config()
    await self.read_agent_config_container()
    self.stats_monitor = StatsPluginContext(self.etcd, self.local_config)
    self.error_monitor = ErrorPluginContext(self.etcd, self.local_config)
    await self.stats_monitor.init()
    await self.error_monitor.init()
    backend = self.local_config['agent']['backend']
    agent_mod = importlib.import_module(f"ai.backend.agent.{backend.value}")
    self.agent = await agent_mod.get_agent_cls().new(  # type: ignore
        self.etcd,
        self.local_config,
        stats_monitor=self.stats_monitor,
        error_monitor=self.error_monitor,
    )
    rpc_addr = self.local_config['agent']['rpc-listen-addr']
    self.rpc_server = Peer(
        bind=ZeroMQAddress(f"tcp://{rpc_addr}"),
        transport=ZeroMQRPCTransport,
        scheduler=ExitOrderedAsyncScheduler(),
        serializer=msgpack.packb,
        deserializer=msgpack.unpackb,
        debug_rpc=self.local_config['debug']['enabled'],
    )
    for func_name in self.rpc_function.functions:
        self.rpc_server.handle_function(func_name, getattr(self, func_name))
    log.info('started handling RPC requests at {}', rpc_addr)
    await self.etcd.put('ip', rpc_addr.host, scope=ConfigScopes.NODE)
    watcher_port = utils.nmget(self.local_config, 'watcher.service-addr.port', None)
    if watcher_port is not None:
        await self.etcd.put('watcher_port', watcher_port, scope=ConfigScopes.NODE)
    await self.update_status('running')

async def get_time_binned_monthly_stats(request, user_uuid=None):
    '''
    Generate time-binned (15 min) stats for the last month (2880 points).

    Each element of the result is a dict of per-bin metrics, for example:

        {
            "date": 1562083808.657106,  # UNIX timestamp (seconds)
            "num_sessions": {"value": 1, "unit_hint": "count"},
            "cpu_allocated": {"value": 1.2, "unit_hint": "count"},
            "mem_allocated": {"value": 1073741824, "unit_hint": "bytes"},
            ...
        }
    '''
    # Get all or user-specific kernels for the last month from the DB.
    time_window = 900  # 15 min
    now = datetime.now(tzutc())
    start_date = now - timedelta(days=30)
    async with request.app['dbpool'].acquire() as conn, conn.begin():
        query = (
            sa.select([kernels])
            .select_from(kernels)
            .where(
                (kernels.c.terminated_at >= start_date) &
                (kernels.c.status.in_(RESOURCE_USAGE_KERNEL_STATUSES))
            )
            .order_by(sa.asc(kernels.c.created_at))
        )
        if user_uuid is not None:
            query = query.where(kernels.c.user_uuid == user_uuid)
        result = await conn.execute(query)
        rows = await result.fetchall()

    # Build a time-series of time-binned stats.
    rowcount = result.rowcount
    now = now.timestamp()
    start_date = start_date.timestamp()
    ts = start_date
    idx = 0
    tseries = []
    # Iterate over each time window.
    while ts < now:
        # Initialize the time-binned stats.
        num_sessions = 0
        cpu_allocated = 0
        mem_allocated = 0
        gpu_allocated = 0
        io_read_bytes = 0
        io_write_bytes = 0
        disk_used = 0
        # Accumulate stats for containers overlapping with this time window.
        while (idx < rowcount and
               ts + time_window > rows[idx].created_at.timestamp() and
               ts < rows[idx].terminated_at.timestamp()):
            row = rows[idx]
            num_sessions += 1
            cpu_allocated += float(row.occupied_slots.get('cpu', 0))
            mem_allocated += float(row.occupied_slots.get('mem', 0))
            if 'cuda.devices' in row.occupied_slots:
                gpu_allocated += float(row.occupied_slots['cuda.devices'])
            if 'cuda.shares' in row.occupied_slots:
                gpu_allocated += float(row.occupied_slots['cuda.shares'])
            if row.last_stat:
                io_read_bytes += int(nmget(row.last_stat, 'io_read.current', 0))
                io_write_bytes += int(nmget(row.last_stat, 'io_write.current', 0))
                disk_used += int(nmget(row.last_stat, 'io_scratch_size/stats.max', 0, '/'))
            idx += 1
        stat = {
            "date": ts,
            "num_sessions": {"value": num_sessions, "unit_hint": "count"},
            "cpu_allocated": {"value": cpu_allocated, "unit_hint": "count"},
            "mem_allocated": {"value": mem_allocated, "unit_hint": "bytes"},
            "gpu_allocated": {"value": gpu_allocated, "unit_hint": "count"},
            "io_read_bytes": {"value": io_read_bytes, "unit_hint": "bytes"},
            "io_write_bytes": {"value": io_write_bytes, "unit_hint": "bytes"},
            "disk_used": {"value": disk_used, "unit_hint": "bytes"},
        }
        tseries.append(stat)
        ts += time_window
    return tseries

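
# A minimal usage sketch for the handler above (hypothetical wiring): expose
# the binned series as JSON from an aiohttp route. Pulling the requester's
# UUID from request['user'] is an assumption about the auth middleware and is
# not shown in this section.
from aiohttp import web


async def usage_per_month(request: web.Request) -> web.Response:
    # Hypothetical: the auth middleware stores the authenticated user here.
    user_uuid = request.get('user', {}).get('uuid')
    stats = await get_time_binned_monthly_stats(request, user_uuid=user_uuid)
    # 30 days binned at 15-minute granularity yields 30 * 24 * 4 == 2880 points.
    return web.json_response(stats)
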
async def get_container_stats_for_period(request, start_date, end_date, group_ids=None):
    async with request.app['dbpool'].acquire() as conn, conn.begin():
        j = (
            kernels
            .join(groups, groups.c.id == kernels.c.group_id)
            .join(users, users.c.uuid == kernels.c.user_uuid)
        )
        query = (
            sa.select([kernels, groups.c.name, users.c.email])
            .select_from(j)
            .where(
                # Filter sessions whose existence period overlaps with the requested period
                ((kernels.c.terminated_at >= start_date) &
                 (kernels.c.created_at < end_date) &
                 (kernels.c.status.in_(RESOURCE_USAGE_KERNEL_STATUSES))) |
                # Or, filter running sessions created before the requested end_date
                ((kernels.c.created_at < end_date) &
                 (kernels.c.status.in_(LIVE_STATUS)))
            )
            .order_by(sa.asc(kernels.c.terminated_at))
        )
        if group_ids:
            query = query.where(kernels.c.group_id.in_(group_ids))
        result = await conn.execute(query)
        rows = await result.fetchall()
    objs_per_group = {}
    local_tz = request.app['config']['system']['timezone']
    for row in rows:
        group_id = str(row.group_id)
        last_stat = row.last_stat
        nfs = None
        if row.mounts is not None:
            nfs = list(set([mount[1] for mount in row.mounts]))
        if row['terminated_at'] is None:
            used_time = used_days = None
        else:
            used_time = str(row['terminated_at'] - row['created_at'])
            used_days = (
                row['terminated_at'].astimezone(local_tz).toordinal() -
                row['created_at'].astimezone(local_tz).toordinal() + 1
            )
        device_type = set()
        smp = 0
        gpu_mem_allocated = 0
        if row.attached_devices and row.attached_devices.get('cuda'):
            for dev_info in row.attached_devices['cuda']:
                if dev_info.get('model_name'):
                    device_type.add(dev_info['model_name'])
                smp += dev_info.get('smp', 0)
                gpu_mem_allocated += dev_info.get('mem', 0)
        gpu_allocated = 0
        if 'cuda.devices' in row.occupied_slots:
            gpu_allocated = row.occupied_slots['cuda.devices']
        if 'cuda.shares' in row.occupied_slots:
            gpu_allocated = row.occupied_slots['cuda.shares']
        c_info = {
            'id': str(row['id']),
            'container_id': row['container_id'],
            'domain_name': row['domain_name'],
            'group_id': str(row['group_id']),
            'group_name': row['name'],
            'name': row['sess_id'],
            'access_key': row['access_key'],
            'email': row['email'],
            'agent': row['agent'],
            'cpu_allocated': float(row.occupied_slots.get('cpu', 0)),
            'cpu_used': float(nmget(last_stat, 'cpu_used.current', 0)),
            'mem_allocated': int(row.occupied_slots.get('mem', 0)),
            'mem_used': int(nmget(last_stat, 'mem.capacity', 0)),
            'shared_memory': int(nmget(row.resource_opts, 'shmem', 0)),
            'disk_allocated': 0,  # TODO: disk quota limit
            'disk_used': int(nmget(last_stat, 'io_scratch_size/stats.max', 0, '/')),
            'io_read': int(nmget(last_stat, 'io_read.current', 0)),
            'io_write': int(nmget(last_stat, 'io_write.current', 0)),
            'used_time': used_time,
            'used_days': used_days,
            'device_type': list(device_type),
            'smp': float(smp),
            'gpu_mem_allocated': float(gpu_mem_allocated),
            'gpu_allocated': float(gpu_allocated),  # devices or shares
            'nfs': nfs,
            'image_id': row['image'],  # TODO: image id
            'image_name': row['image'],
            'created_at': str(row['created_at']),
            'terminated_at': str(row['terminated_at']),
            'status': row['status'].name,
            'status_changed': str(row['status_changed']),
        }
        if group_id not in objs_per_group:
            objs_per_group[group_id] = {
                'domain_name': row['domain_name'],
                'g_id': group_id,
                'g_name': row['name'],  # this is the group's name
                'g_cpu_allocated': c_info['cpu_allocated'],
                'g_cpu_used': c_info['cpu_used'],
                'g_mem_allocated': c_info['mem_allocated'],
                'g_mem_used': c_info['mem_used'],
                'g_shared_memory': c_info['shared_memory'],
                'g_disk_allocated': c_info['disk_allocated'],
                'g_disk_used': c_info['disk_used'],
                'g_io_read': c_info['io_read'],
                'g_io_write': c_info['io_write'],
                'g_device_type': copy.deepcopy(c_info['device_type']),
                'g_smp': c_info['smp'],
                'g_gpu_mem_allocated': c_info['gpu_mem_allocated'],
                'g_gpu_allocated': c_info['gpu_allocated'],
                'c_infos': [c_info],
            }
        else:
            objs_per_group[group_id]['g_cpu_allocated'] += c_info['cpu_allocated']
            objs_per_group[group_id]['g_cpu_used'] += c_info['cpu_used']
            objs_per_group[group_id]['g_mem_allocated'] += c_info['mem_allocated']
            objs_per_group[group_id]['g_mem_used'] += c_info['mem_used']
            objs_per_group[group_id]['g_shared_memory'] += c_info['shared_memory']
            objs_per_group[group_id]['g_disk_allocated'] += c_info['disk_allocated']
            objs_per_group[group_id]['g_disk_used'] += c_info['disk_used']
            objs_per_group[group_id]['g_io_read'] += c_info['io_read']
            objs_per_group[group_id]['g_io_write'] += c_info['io_write']
            for device in c_info['device_type']:
                if device not in objs_per_group[group_id]['g_device_type']:
                    g_dev_type = objs_per_group[group_id]['g_device_type']
                    g_dev_type.append(device)
                    objs_per_group[group_id]['g_device_type'] = list(set(g_dev_type))
            objs_per_group[group_id]['g_smp'] += c_info['smp']
            objs_per_group[group_id]['g_gpu_mem_allocated'] += c_info['gpu_mem_allocated']
            objs_per_group[group_id]['g_gpu_allocated'] += c_info['gpu_allocated']
            objs_per_group[group_id]['c_infos'].append(c_info)
    return list(objs_per_group.values())
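
# A hedged usage sketch for the per-group aggregation above: compute last
# week's totals and print the group-level sums. The date handling mirrors the
# other handlers (timezone-aware datetimes); the print loop is illustrative
# only, and the caller is assumed to hold a request with the 'dbpool' app key.
from datetime import datetime, timedelta

from dateutil.tz import tzutc


async def print_weekly_group_usage(request) -> None:
    end_date = datetime.now(tzutc())
    start_date = end_date - timedelta(days=7)
    groups = await get_container_stats_for_period(request, start_date, end_date)
    for g in groups:
        # Each entry carries group-level sums plus the per-container 'c_infos' list.
        print(g['g_name'], g['g_cpu_used'], g['g_mem_used'], len(g['c_infos']))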