def from_row(cls, context, row):
    if row is None:
        return None
    mega = 2 ** 20
    return cls(
        id=row['id'],
        status=row['status'].name,
        region=row['region'],
        scaling_group=row['scaling_group'],
        available_slots=row['available_slots'].to_json(),
        occupied_slots=row['occupied_slots'].to_json(),
        addr=row['addr'],
        first_contact=row['first_contact'],
        lost_at=row['lost_at'],
        version=row['version'],
        compute_plugins=row['compute_plugins'],
        # legacy fields
        mem_slots=BinarySize.from_str(row['available_slots']['mem']) // mega,
        cpu_slots=row['available_slots']['cpu'],
        gpu_slots=row['available_slots'].get('cuda.device', 0),
        tpu_slots=row['available_slots'].get('tpu.device', 0),
        used_mem_slots=BinarySize.from_str(
            row['occupied_slots'].get('mem', 0)) // mega,
        used_cpu_slots=float(row['occupied_slots'].get('cpu', 0)),
        used_gpu_slots=float(row['occupied_slots'].get('cuda.device', 0)),
        used_tpu_slots=float(row['occupied_slots'].get('tpu.device', 0)),
    )

def from_row(
    cls,
    context: Mapping[str, Any],
    row: RowProxy,
) -> Agent:
    mega = 2 ** 20
    return cls(
        id=row['id'],
        status=row['status'].name,
        status_changed=row['status_changed'],
        region=row['region'],
        scaling_group=row['scaling_group'],
        schedulable=row['schedulable'],
        available_slots=row['available_slots'].to_json(),
        occupied_slots=row['occupied_slots'].to_json(),
        addr=row['addr'],
        first_contact=row['first_contact'],
        lost_at=row['lost_at'],
        version=row['version'],
        compute_plugins=row['compute_plugins'],
        # legacy fields
        mem_slots=BinarySize.from_str(row['available_slots']['mem']) // mega,
        cpu_slots=row['available_slots']['cpu'],
        gpu_slots=row['available_slots'].get('cuda.device', 0),
        tpu_slots=row['available_slots'].get('tpu.device', 0),
        used_mem_slots=BinarySize.from_str(
            row['occupied_slots'].get('mem', 0)) // mega,
        used_cpu_slots=float(row['occupied_slots'].get('cpu', 0)),
        used_gpu_slots=float(row['occupied_slots'].get('cuda.device', 0)),
        used_tpu_slots=float(row['occupied_slots'].get('tpu.device', 0)),
    )

async def get_fs_usage(self) -> FSUsage:
    loop = asyncio.get_running_loop()
    stat = await loop.run_in_executor(None, os.statvfs, self.mount_path)
    return FSUsage(
        capacity_bytes=BinarySize(stat.f_frsize * stat.f_blocks),
        used_bytes=BinarySize(stat.f_frsize * (stat.f_blocks - stat.f_bavail)),
    )

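# A minimal, hypothetical worked example of the statvfs arithmetic used by
# get_fs_usage() above. "Used" is computed against the blocks available to
# unprivileged users (f_bavail), so filesystem-reserved blocks count as used.
# The numbers below are made up purely for illustration.
frsize, blocks, bavail = 4096, 1_000_000, 250_000  # hypothetical statvfs values
capacity_bytes = frsize * blocks                   # 4_096_000_000
used_bytes = frsize * (blocks - bavail)            # 3_072_000_000
assert (capacity_bytes, used_bytes) == (4_096_000_000, 3_072_000_000)
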
async def test_xfs_quota(xfs):
    vfid = uuid.uuid4()
    options = {"quota": BinarySize.from_str("10m")}
    await xfs.create_vfolder(vfid, options=options)
    vfpath = xfs.mount_path / vfid.hex[0:2] / vfid.hex[2:4] / vfid.hex[4:]
    assert vfpath.is_dir()
    assert await xfs.get_quota(vfid) == BinarySize.from_str("10m")
    await xfs.set_quota(vfid, BinarySize.from_str("1m"))
    assert await xfs.get_quota(vfid) == BinarySize.from_str("1m")
    await xfs.delete_vfolder(vfid)
    assert not vfpath.is_dir()

def process_result_value(self, raw_value: Dict[str, str], dialect):
    if raw_value is None:
        return None
    # legacy handling: older rows may store 'mem' as a human-readable string
    interim_value: Dict[str, Any] = raw_value
    mem = raw_value.get('mem')
    if isinstance(mem, str) and not mem.isdigit():
        interim_value['mem'] = BinarySize.from_str(mem)
    return ResourceSlot.from_json(interim_value)

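# A small, hypothetical illustration of the legacy handling above (assuming
# BinarySize lives in ai.backend.common.types, its usual location): older rows
# may store 'mem' as a human-readable string such as '2g', while newer rows
# store a plain digit string of bytes. BinarySize.from_str() turns the suffixed
# form into the same byte count ('2g' -> 2 * 2**30 == 2147483648), so both
# shapes become equivalent before ResourceSlot.from_json() is applied.
from ai.backend.common.types import BinarySize

legacy_row = {'cpu': '1', 'mem': '2g'}
modern_row = {'cpu': '1', 'mem': '2147483648'}
assert int(BinarySize.from_str(legacy_row['mem'])) == int(modern_row['mem'])
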
def read_from_string(cls, text: str) -> 'KernelResourceSpec':
    kvpairs = {}
    for line in text.split('\n'):
        if '=' not in line:
            continue
        key, val = line.strip().split('=', maxsplit=1)
        kvpairs[key] = val
    allocations = cast(
        MutableMapping[
            DeviceName,
            MutableMapping[SlotName, Mapping[DeviceId, Decimal]],
        ],
        defaultdict(lambda: defaultdict(Decimal)),
    )
    for key, val in kvpairs.items():
        if key.endswith('_SHARES'):
            slot_name = SlotName(key[:-7].lower())
            device_name = DeviceName(slot_name.split('.')[0])
            per_device_alloc: MutableMapping[DeviceId, Decimal] = {}
            for entry in val.split(','):
                raw_dev_id, _, raw_alloc = entry.partition(':')
                if not raw_dev_id or not raw_alloc:
                    continue
                dev_id = DeviceId(raw_dev_id)
                try:
                    if known_slot_types.get(slot_name, 'count') == 'bytes':
                        alloc = Decimal(BinarySize.from_str(raw_alloc))
                    else:
                        alloc = Decimal(raw_alloc)
                except KeyError as e:
                    log.warning(
                        'A previously launched container has '
                        'unknown slot type: {}. Ignoring it.',
                        e.args[0])
                    continue
                per_device_alloc[dev_id] = alloc
            allocations[device_name][slot_name] = per_device_alloc
    mounts = [Mount.from_str(m) for m in kvpairs['MOUNTS'].split(',') if m]
    return cls(
        container_id=kvpairs.get('CID', 'unknown'),
        scratch_disk_size=BinarySize.finite_from_str(kvpairs['SCRATCH_SIZE']),
        allocations=dict(allocations),
        slots=ResourceSlot(json.loads(kvpairs['SLOTS'])),
        mounts=mounts,
    )

async def get_image_slot_ranges(self, image_ref: ImageRef):
    '''
    Returns the minimum and maximum ResourceSlot values.
    All slot values are converted and normalized to Decimal.
    '''
    data = await self.etcd.get_prefix_dict(image_ref.tag_path)
    slot_units = await self.get_resource_slots()
    min_slot = ResourceSlot()
    max_slot = ResourceSlot()
    for slot_key, slot_range in data['resource'].items():
        slot_unit = slot_units.get(slot_key)
        if slot_unit is None:
            # ignore unknown slots
            continue
        min_value = slot_range.get('min')
        if min_value is None:
            min_value = Decimal(0)
        max_value = slot_range.get('max')
        if max_value is None:
            max_value = Decimal('Infinity')
        if slot_unit == 'bytes':
            if not isinstance(min_value, Decimal):
                min_value = BinarySize.from_str(min_value)
            if not isinstance(max_value, Decimal):
                max_value = BinarySize.from_str(max_value)
        else:
            if not isinstance(min_value, Decimal):
                min_value = Decimal(min_value)
            if not isinstance(max_value, Decimal):
                max_value = Decimal(max_value)
        min_slot[slot_key] = min_value
        max_slot[slot_key] = max_value
    # fill missing slots with defaults
    for slot_key in slot_units.keys():
        if slot_key not in min_slot:
            min_slot[slot_key] = Decimal(0)
        if slot_key not in max_slot:
            max_slot[slot_key] = Decimal('Infinity')
    return min_slot, max_slot

def parse_row(cls, context, row):
    assert row is not None
    from .user import UserRole
    mega = 2 ** 20
    is_superadmin = (context['user']['role'] == UserRole.SUPERADMIN)
    if is_superadmin:
        hide_agents = False
    else:
        hide_agents = context['config']['manager']['hide-agents']
    return {
        'sess_id': row['sess_id'],
        'id': row['id'],
        'role': row['role'],
        'image': row['image'],
        'registry': row['registry'],
        'domain_name': row['domain_name'],
        # group.name (group is omitted since use_labels=True is not used)
        'group_name': row['name'],
        'group_id': row['group_id'],
        'scaling_group': row['scaling_group'],
        'user_uuid': row['user_uuid'],
        'access_key': row['access_key'],
        'status': row['status'].name,
        'status_info': row['status_info'],
        'created_at': row['created_at'],
        'terminated_at': row['terminated_at'],
        'service_ports': row['service_ports'],
        'occupied_slots': row['occupied_slots'].to_json(),
        'occupied_shares': row['occupied_shares'],
        'mounts': row['mounts'],
        'num_queries': row['num_queries'],
        # optionally hidden
        'agent': row['agent'] if not hide_agents else None,
        'container_id': row['container_id'] if not hide_agents else None,
        # live_stat is resolved by Graphene
        'last_stat': row['last_stat'],
        'user_email': row['email'],
        # Legacy fields
        # NOTE: currently graphene always uses resolve methods!
        'cpu_used': 0,
        'mem_max_bytes': 0,
        'mem_cur_bytes': 0,
        'net_rx_bytes': 0,
        'net_tx_bytes': 0,
        'io_read_bytes': 0,
        'io_write_bytes': 0,
        'io_max_scratch_size': 0,
        'io_cur_scratch_size': 0,
        'lang': row['image'],
        'mem_slot': BinarySize.from_str(
            row['occupied_slots'].get('mem', 0)) // mega,
        'cpu_slot': float(row['occupied_slots'].get('cpu', 0)),
        'gpu_slot': float(row['occupied_slots'].get('cuda.device', 0)),
        'tpu_slot': float(row['occupied_slots'].get('tpu.device', 0)),
    }

async def get_quota(self, vfid: UUID) -> BinarySize:
    full_report = await run(
        ["sudo", "xfs_quota", "-x", "-c", "report -h", self.mount_path],
    )
    report = ""
    for line in full_report.split("\n"):
        if str(vfid) in line:
            report = line
            break
    if len(report.split()) != 6:
        raise ExecutionError("unexpected format for xfs_quota report")
    proj_name, _, _, quota, _, _ = report.split()
    if not str(vfid).startswith(proj_name):
        raise ExecutionError("vfid and project name do not match")
    return BinarySize.finite_from_str(quota)

async def get_used_bytes(self, vfid: UUID) -> BinarySize:
    vfpath = self.mangle_vfpath(vfid)
    proc = await asyncio.create_subprocess_exec(
        b"pdu", b"-hs", bytes(vfpath),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(f"pdu command failed: {stderr.decode()}")
    used_bytes, _ = stdout.decode().split()
    return BinarySize.finite_from_str(used_bytes)

async def test_xfs_get_used_bytes(xfs):
    vfid = uuid.uuid4()
    options = {"quota": BinarySize.from_str("10m")}
    await xfs.create_vfolder(vfid, options=options)
    vfpath = xfs.mount_path / vfid.hex[0:2] / vfid.hex[2:4] / vfid.hex[4:]
    (vfpath / "test.txt").write_bytes(b"12345")
    (vfpath / "inner").mkdir()
    (vfpath / "inner" / "hello.txt").write_bytes(b"678")
    (vfpath / "inner" / "world.txt").write_bytes(b"901")
    used_bytes = await xfs.get_used_bytes(vfid)
    full_report = await run(
        ["sudo", "xfs_quota", "-x", "-c", "report -h", xfs.mount_path],
    )
    report = ""
    for line in full_report.split("\n"):
        if str(vfid) in line:
            report = line
            break
    assert len(report.split()) == 6
    proj_name, xfs_used, _, _, _, _ = report.split()
    assert str(vfid)[:-5] == proj_name
    assert used_bytes == BinarySize.from_str(xfs_used)
    await xfs.delete_vfolder(vfid)
    assert not vfpath.is_dir()

async def get_usage(self, vfid: UUID, relpath: PurePosixPath = PurePosixPath(".")):
    full_report = await run(
        ["sudo", "xfs_quota", "-x", "-c", "report -pbih", self.mount_path],
    )
    report = ""
    for line in full_report.split("\n"):
        if str(vfid) in line:
            report = line
            break
    if len(report.split()) != 11:
        raise ExecutionError("unexpected format for xfs_quota report")
    proj_name, used_size, _, _, _, _, inode_used, _, _, _, _ = report.split()
    used_bytes = int(BinarySize.finite_from_str(used_size))
    if not str(vfid).startswith(proj_name):
        raise ExecutionError("vfid and project name do not match")
    return VFolderUsage(file_count=int(inode_used), used_bytes=used_bytes)

async def delete_vfolder(self, vfid: UUID) -> None:
    async with FileLock(LOCK_FILE):
        await self.registry.read_project_info()
        if vfid in self.registry.name_id_map.keys():
            try:
                log.info("removing project quota (f:{})", vfid)
                await self.set_quota(vfid, BinarySize(0))
            except (asyncio.CancelledError, asyncio.TimeoutError) as e:
                log.exception("vfolder deletion timeout", exc_info=e)
                pass  # Pass to delete the physical directory anyway.
            except Exception as e:
                log.exception("vfolder deletion error", exc_info=e)
                pass  # Pass to delete the physical directory anyway.
            finally:
                await self.registry.remove_project_entry(vfid)
        await super().delete_vfolder(vfid)
        await self.registry.read_project_info()

async def test_xfs_multiple_vfolder_mgmt(xfs):
    vfid1 = uuid.UUID(hex="83a6ba2b7b8e41deb5ee2c909ce34bcb")
    vfid2 = uuid.UUID(hex="83a6ba2b7b8e41deb5ee2c909ce34bcc")
    options = {"quota": BinarySize.from_str("10m")}
    await xfs.create_vfolder(vfid1, options=options)
    await xfs.create_vfolder(vfid2, options=options)
    vfpath1 = xfs.mount_path / vfid1.hex[0:2] / vfid1.hex[2:4] / vfid1.hex[4:]
    vfpath2 = xfs.mount_path / vfid2.hex[0:2] / vfid2.hex[2:4] / vfid2.hex[4:]
    assert vfpath2.relative_to(vfpath1.parent).name == vfpath2.name
    assert vfpath1.is_dir()
    await xfs.delete_vfolder(vfid1)
    assert not vfpath1.exists()
    # if the prefix dirs are not empty, they shouldn't be deleted
    assert vfpath1.parent.exists()
    assert vfpath1.parent.parent.exists()
    await xfs.delete_vfolder(vfid2)
    # if the prefix dirs become empty, they should be deleted
    assert not vfpath2.parent.exists()
    assert not vfpath2.parent.parent.exists()

async def get_attached_devices(
    self,
    device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
) -> Sequence[DeviceModelInfo]:
    device_ids: List[DeviceId] = []
    if SlotName('cuda.device') in device_alloc:
        device_ids.extend(device_alloc[SlotName('cuda.device')].keys())
    available_devices = await self.list_devices()
    attached_devices: List[DeviceModelInfo] = []
    for device in available_devices:
        if device.device_id in device_ids:
            proc = device.processing_units
            mem = BinarySize(device.memory_size)
            attached_devices.append({
                # TODO: update common.types.DeviceModelInfo
                'device_id': device.device_id,
                'model_name': device.model_name,
                'smp': proc,
                'mem': mem,
            })
    return attached_devices

async def test_xfs_single_vfolder_mgmt(xfs):
    vfid = uuid.uuid4()
    options = {"quota": BinarySize.from_str("10m")}
    # vfolder create test
    await xfs.create_vfolder(vfid, options=options)
    vfpath = xfs.mount_path / vfid.hex[0:2] / vfid.hex[2:4] / vfid.hex[4:]
    project_id_dict = read_etc_projid()
    vfpath_id_dict = read_etc_projects()
    assert vfpath.is_dir()
    assert str(vfid) in project_id_dict
    vfid_project_id = project_id_dict[str(vfid)]
    assert vfpath_id_dict[project_id_dict[str(vfid)]] == str(vfpath)
    # vfolder delete test
    await xfs.delete_vfolder(vfid)
    assert not vfpath.exists()
    assert not vfpath.parent.exists() or not (vfpath.parent / vfid.hex[2:4]).exists()
    assert (not vfpath.parent.parent.exists()
            or not (vfpath.parent.parent / vfid.hex[0:2]).exists())
    project_id_dict = read_etc_projid()
    vfpath_id_dict = read_etc_projects()
    assert str(vfid) not in project_id_dict
    assert vfid_project_id not in vfpath_id_dict

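# The read_etc_projid()/read_etc_projects() helpers used in the test above are not
# shown here. The sketch below is a hypothetical reconstruction based only on the
# standard xfs_quota configuration formats ("name:project-id" lines in /etc/projid,
# "project-id:directory" lines in /etc/projects) and on how the test indexes the
# returned dictionaries; the real helpers may differ.
from pathlib import Path
from typing import Dict

def read_etc_projid(path: Path = Path("/etc/projid")) -> Dict[str, str]:
    # maps project name (the vfid string) -> project id
    mapping: Dict[str, str] = {}
    for line in path.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        name, _, proj_id = line.partition(":")
        mapping[name] = proj_id
    return mapping

def read_etc_projects(path: Path = Path("/etc/projects")) -> Dict[str, str]:
    # maps project id -> vfolder directory path
    mapping: Dict[str, str] = {}
    for line in path.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        proj_id, _, proj_path = line.partition(":")
        mapping[proj_id] = proj_path
    return mapping
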
async def empty_vfolder(xfs):
    vfid = uuid.uuid4()
    await xfs.create_vfolder(vfid, options={"quota": BinarySize.from_str("10m")})
    yield vfid
    await xfs.delete_vfolder(vfid)

async def get_used_bytes(self, vfid: UUID) -> BinarySize:
    vfpath = self.mangle_vfpath(vfid)
    info = await run(["du", "-hs", vfpath])
    used_bytes, _ = info.split()
    return BinarySize.finite_from_str(used_bytes)

def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'resource_presets',
        sa.Column('name', sa.String(length=256), nullable=False),
        sa.Column('resource_slots', ResourceSlotColumn(), nullable=False),
        sa.PrimaryKeyConstraint('name', name=op.f('pk_resource_presets')),
    )
    # Add initial fixtures for resource presets
    query = '''
    INSERT INTO resource_presets VALUES (
        'small',
        '{"cpu":"1","mem":"2147483648"}'::jsonb
    );
    INSERT INTO resource_presets VALUES (
        'small-gpu',
        '{"cpu":"1","mem":"2147483648","cuda.device":"1","cuda.shares":"0.5"}'::jsonb
    );
    INSERT INTO resource_presets VALUES (
        'medium',
        '{"cpu":"2","mem":"4294967296"}'::jsonb
    );
    INSERT INTO resource_presets VALUES (
        'medium-gpu',
        '{"cpu":"2","mem":"4294967296","cuda.device":"1","cuda.shares":"1.0"}'::jsonb
    );
    INSERT INTO resource_presets VALUES (
        'large',
        '{"cpu":"4","mem":"8589934592"}'::jsonb
    );
    INSERT INTO resource_presets VALUES (
        'large-gpu',
        '{"cpu":"4","mem":"8589934592","cuda.device":"2","cuda.shares":"2.0"}'::jsonb
    );
    '''
    connection = op.get_bind()
    connection.execute(query)

    query = '''
    SELECT name, total_resource_slots
    FROM keypair_resource_policies
    '''
    connection = op.get_bind()
    result = connection.execute(query)
    updates = []
    for row in result:
        converted = ResourceSlot(row['total_resource_slots'])
        if 'mem' in converted:
            converted['mem'] = Decimal(BinarySize.from_str(converted['mem']))
        updates.append((
            row['name'],
            converted,
        ))
    for name, slots in updates:
        query = (
            sa.update(keypair_resource_policies)
            .values(total_resource_slots=slots)
            .where(keypair_resource_policies.c.name == name)
        )
        connection.execute(query)

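# A hypothetical before/after for the keypair_resource_policies conversion in the
# migration above, assuming BinarySize and ResourceSlot come from
# ai.backend.common.types (their usual location). A policy that stored
# total_resource_slots as {"cpu": "4", "mem": "8g"} gets its 'mem' value normalized
# to a Decimal byte count, matching the byte values used in the resource_presets
# fixtures ('8g' == 8589934592 bytes).
from decimal import Decimal
from ai.backend.common.types import BinarySize, ResourceSlot

converted = ResourceSlot({'cpu': '4', 'mem': '8g'})
converted['mem'] = Decimal(BinarySize.from_str(converted['mem']))
assert converted['mem'] == Decimal(8 * 2**30)  # 8589934592
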