Пример #1
0
 def from_row(cls, context, row):
     """Build an instance of ``cls`` from an agent row.

     Returns ``None`` when ``row`` is ``None``.  ``context`` is accepted
     but not read here.  The slot columns are serialized with their
     ``to_json()`` method, and a set of legacy per-resource fields is
     derived from the same columns.
     """
     if row is None:
         return None
     mebi = 2**20
     available = row['available_slots']
     occupied = row['occupied_slots']
     return cls(
         id=row['id'],
         status=row['status'].name,
         region=row['region'],
         scaling_group=row['scaling_group'],
         available_slots=available.to_json(),
         occupied_slots=occupied.to_json(),
         addr=row['addr'],
         first_contact=row['first_contact'],
         lost_at=row['lost_at'],
         version=row['version'],
         compute_plugins=row['compute_plugins'],
         # legacy fields: memory sizes are floor-divided by 2**20
         # (presumably bytes -> MiB; confirm against BinarySize)
         mem_slots=BinarySize.from_str(available['mem']) // mebi,
         cpu_slots=available['cpu'],
         gpu_slots=available.get('cuda.device', 0),
         tpu_slots=available.get('tpu.device', 0),
         used_mem_slots=BinarySize.from_str(occupied.get('mem', 0)) // mebi,
         used_cpu_slots=float(occupied.get('cpu', 0)),
         used_gpu_slots=float(occupied.get('cuda.device', 0)),
         used_tpu_slots=float(occupied.get('tpu.device', 0)),
     )
Пример #2
0
 def from_row(
     cls,
     context: Mapping[str, Any],
     row: RowProxy,
 ) -> Agent:
     """Build an ``Agent`` from a database row.

     ``context`` is accepted but not read here.  Slot columns are
     serialized through their ``to_json()`` method; the legacy
     per-resource fields are derived from the same two slot columns.
     """
     mebi = 2 ** 20
     available = row['available_slots']
     occupied = row['occupied_slots']
     # legacy fields: memory sizes are floor-divided by 2**20
     # (presumably bytes -> MiB; confirm against BinarySize)
     legacy_fields = dict(
         mem_slots=BinarySize.from_str(available['mem']) // mebi,
         cpu_slots=available['cpu'],
         gpu_slots=available.get('cuda.device', 0),
         tpu_slots=available.get('tpu.device', 0),
         used_mem_slots=BinarySize.from_str(occupied.get('mem', 0)) // mebi,
         used_cpu_slots=float(occupied.get('cpu', 0)),
         used_gpu_slots=float(occupied.get('cuda.device', 0)),
         used_tpu_slots=float(occupied.get('tpu.device', 0)),
     )
     return cls(
         id=row['id'],
         status=row['status'].name,
         status_changed=row['status_changed'],
         region=row['region'],
         scaling_group=row['scaling_group'],
         schedulable=row['schedulable'],
         available_slots=available.to_json(),
         occupied_slots=occupied.to_json(),
         addr=row['addr'],
         first_contact=row['first_contact'],
         lost_at=row['lost_at'],
         version=row['version'],
         compute_plugins=row['compute_plugins'],
         **legacy_fields,
     )
Пример #3
0
 async def get_fs_usage(self) -> FSUsage:
     """Report capacity and usage of the filesystem at ``self.mount_path``.

     ``os.statvfs`` is a blocking call, so it is dispatched to the event
     loop's default executor.  Used space is computed as total blocks
     minus ``f_bavail``, scaled by the fragment size.
     """
     vfs = await asyncio.get_running_loop().run_in_executor(
         None, os.statvfs, self.mount_path,
     )
     frag_size = vfs.f_frsize
     total_blocks = vfs.f_blocks
     occupied_blocks = total_blocks - vfs.f_bavail
     return FSUsage(
         capacity_bytes=BinarySize(frag_size * total_blocks),
         used_bytes=BinarySize(frag_size * occupied_blocks),
     )
Пример #4
0
async def test_xfs_quota(xfs):
    """Create a vfolder with a quota, change the quota, then delete it."""
    vfid = uuid.uuid4()
    initial_quota = BinarySize.from_str("10m")
    reduced_quota = BinarySize.from_str("1m")
    await xfs.create_vfolder(vfid, options={"quota": initial_quota})
    hex_id = vfid.hex
    vfpath = xfs.mount_path / hex_id[0:2] / hex_id[2:4] / hex_id[4:]
    assert vfpath.is_dir()
    # The quota given at creation time must be readable back.
    assert await xfs.get_quota(vfid) == initial_quota
    await xfs.set_quota(vfid, reduced_quota)
    assert await xfs.get_quota(vfid) == reduced_quota
    await xfs.delete_vfolder(vfid)
    assert not vfpath.is_dir()
Пример #5
0
 def process_result_value(self, raw_value: Dict[str, str], dialect):
     """Convert a stored slot mapping into a ``ResourceSlot``.

     :param raw_value: The mapping loaded from the database column, or
         ``None`` when the column is NULL.
     :param dialect: The active database dialect (not used here).
     :returns: ``None`` for a ``None`` input, otherwise the result of
         ``ResourceSlot.from_json()`` on the (possibly adjusted) mapping.
     """
     # Check for NULL *before* touching the mapping; calling ``.get()``
     # on ``None`` would raise AttributeError.
     if raw_value is None:
         return None
     # Work on a shallow copy so the caller-provided mapping is not mutated.
     interim_value: Dict[str, Any] = dict(raw_value)
     mem = interim_value.get('mem')
     # legacy handling: a non-numeric string (e.g. with a size suffix)
     # is parsed through BinarySize first.
     if isinstance(mem, str) and not mem.isdigit():
         interim_value['mem'] = BinarySize.from_str(mem)
     return ResourceSlot.from_json(interim_value)
Пример #6
0
 def read_from_string(cls, text: str) -> 'KernelResourceSpec':
     """Parse ``KEY=VALUE`` lines into a resource spec.

     Lines without ``=`` are ignored and only the first ``=`` separates
     the key from the value.  Keys ending in ``_SHARES`` hold
     comma-separated ``device_id:amount`` entries which are collected into
     per-device allocations.  The ``MOUNTS``, ``SCRATCH_SIZE`` and
     ``SLOTS`` keys are required (a missing one raises ``KeyError``);
     ``CID`` falls back to ``'unknown'``.
     """
     kvpairs = dict(
         line.strip().split('=', maxsplit=1)
         for line in text.split('\n')
         if '=' in line
     )
     allocations = cast(
         MutableMapping[
             DeviceName,
             MutableMapping[SlotName, Mapping[DeviceId, Decimal]],
         ],
         defaultdict(lambda: defaultdict(Decimal)),
     )
     shares_suffix = '_SHARES'
     for key, val in kvpairs.items():
         if not key.endswith(shares_suffix):
             continue
         slot_name = SlotName(key[:-len(shares_suffix)].lower())
         # The device name is the part of the slot name before the first dot.
         device_name = DeviceName(slot_name.split('.')[0])
         per_device_alloc: MutableMapping[DeviceId, Decimal] = {}
         for entry in val.split(','):
             raw_dev_id, _, raw_alloc = entry.partition(':')
             if not (raw_dev_id and raw_alloc):
                 # Skip empty or malformed entries.
                 continue
             dev_id = DeviceId(raw_dev_id)
             try:
                 is_bytes = known_slot_types.get(slot_name, 'count') == 'bytes'
                 if is_bytes:
                     alloc = Decimal(BinarySize.from_str(raw_alloc))
                 else:
                     alloc = Decimal(raw_alloc)
             except KeyError as e:
                 log.warning(
                     'A previously launched container has '
                     'unknown slot type: {}. Ignoring it.', e.args[0])
                 continue
             per_device_alloc[dev_id] = alloc
         allocations[device_name][slot_name] = per_device_alloc
     mounts = [
         Mount.from_str(mount_spec)
         for mount_spec in kvpairs['MOUNTS'].split(',')
         if mount_spec
     ]
     return cls(
         container_id=kvpairs.get('CID', 'unknown'),
         scratch_disk_size=BinarySize.finite_from_str(
             kvpairs['SCRATCH_SIZE']),
         allocations=dict(allocations),
         slots=ResourceSlot(json.loads(kvpairs['SLOTS'])),
         mounts=mounts,
     )
Пример #7
0
    async def get_image_slot_ranges(self, image_ref: ImageRef):
        '''
        Return a ``(min_slot, max_slot)`` pair of ResourceSlot objects
        for the given image.

        Slot keys not present in the known resource slots are ignored.
        A missing lower bound becomes ``Decimal(0)`` and a missing upper
        bound becomes ``Decimal('Infinity')``; other non-Decimal values are
        converted with ``BinarySize.from_str`` for ``'bytes'`` slots and
        with ``Decimal`` otherwise.
        '''
        data = await self.etcd.get_prefix_dict(image_ref.tag_path)
        slot_units = await self.get_resource_slots()
        zero = Decimal(0)
        unbounded = Decimal('Infinity')
        min_slot = ResourceSlot()
        max_slot = ResourceSlot()

        for slot_key, slot_range in data['resource'].items():
            slot_unit = slot_units.get(slot_key)
            if slot_unit is None:
                # ignore unknown slots
                continue
            convert = BinarySize.from_str if slot_unit == 'bytes' else Decimal
            for target, bound, fallback in (
                (min_slot, 'min', zero),
                (max_slot, 'max', unbounded),
            ):
                value = slot_range.get(bound)
                if value is None:
                    value = fallback
                elif not isinstance(value, Decimal):
                    value = convert(value)
                target[slot_key] = value

        # fill missing
        for slot_key in slot_units:
            if slot_key not in min_slot:
                min_slot[slot_key] = zero
            if slot_key not in max_slot:
                max_slot[slot_key] = unbounded

        return min_slot, max_slot
Пример #8
0
 def parse_row(cls, context, row):
     """Translate a joined kernel row into a plain dict of field values.

     The ``agent`` and ``container_id`` values are replaced with ``None``
     when the requesting user is not a superadmin and the manager's
     ``hide-agents`` setting is truthy.  Legacy statistic fields are
     filled with ``0`` placeholders.
     """
     assert row is not None
     from .user import UserRole
     mebi = 2 ** 20
     is_superadmin = (context['user']['role'] == UserRole.SUPERADMIN)
     # The config is only consulted for non-superadmin users.
     hide_agents = (
         False if is_superadmin
         else context['config']['manager']['hide-agents']
     )
     occupied = row['occupied_slots']
     zeroed_legacy_stats = (
         'cpu_used',
         'mem_max_bytes',
         'mem_cur_bytes',
         'net_rx_bytes',
         'net_tx_bytes',
         'io_read_bytes',
         'io_write_bytes',
         'io_max_scratch_size',
         'io_cur_scratch_size',
     )
     return {
         'sess_id': row['sess_id'],
         'id': row['id'],
         'role': row['role'],
         'image': row['image'],
         'registry': row['registry'],
         'domain_name': row['domain_name'],
         'group_name': row['name'],  # group.name (group is omitted since use_labels=True is not used)
         'group_id': row['group_id'],
         'scaling_group': row['scaling_group'],
         'user_uuid': row['user_uuid'],
         'access_key': row['access_key'],
         'status': row['status'].name,
         'status_info': row['status_info'],
         'created_at': row['created_at'],
         'terminated_at': row['terminated_at'],
         'service_ports': row['service_ports'],
         'occupied_slots': occupied.to_json(),
         'occupied_shares': row['occupied_shares'],
         'mounts': row['mounts'],
         'num_queries': row['num_queries'],
         # optionally hidden
         'agent': None if hide_agents else row['agent'],
         'container_id': None if hide_agents else row['container_id'],
         # live_stat is resolved by Graphene
         'last_stat': row['last_stat'],
         'user_email': row['email'],
         # Legacy fields
         # NOTE: currently graphene always uses resolve methods!
         **dict.fromkeys(zeroed_legacy_stats, 0),
         'lang': row['image'],
         'mem_slot': BinarySize.from_str(occupied.get('mem', 0)) // mebi,
         'cpu_slot': float(occupied.get('cpu', 0)),
         'gpu_slot': float(occupied.get('cuda.device', 0)),
         'tpu_slot': float(occupied.get('tpu.device', 0)),
     }
Пример #9
0
 async def get_quota(self, vfid: UUID) -> BinarySize:
     """Read the quota of a vfolder from the ``xfs_quota`` report.

     :param vfid: The vfolder ID whose report line is looked up.
     :returns: The quota column of the matching report line, parsed with
         ``BinarySize.finite_from_str``.
     :raises ExecutionError: If no report line mentions ``vfid``, the
         line does not have exactly six columns, or the reported project
         name is not a prefix of ``vfid``.
     """
     full_report = await run(
         ["sudo", "xfs_quota", "-x", "-c", "report -h", self.mount_path],
     )
     vfid_str = str(vfid)
     # Default to an empty line so that a missing project entry is
     # reported as ExecutionError below instead of failing with
     # UnboundLocalError on an unassigned variable.
     report = next(
         (line for line in full_report.split("\n") if vfid_str in line),
         "",
     )
     columns = report.split()
     if len(columns) != 6:
         raise ExecutionError("unexpected format for xfs_quota report")
     proj_name, _, _, quota, _, _ = columns
     if not vfid_str.startswith(proj_name):
         raise ExecutionError("vfid and project name does not match")
     return BinarySize.finite_from_str(quota)
Пример #10
0
 async def get_used_bytes(self, vfid: UUID) -> BinarySize:
     """Measure the size of a vfolder by running ``pdu -hs`` on its path.

     Raises ``RuntimeError`` with the command's stderr when it exits with
     a non-zero status.  The first of the two whitespace-separated output
     fields is parsed as the size.
     """
     target = bytes(self.mangle_vfpath(vfid))
     proc = await asyncio.create_subprocess_exec(
         b"pdu", b"-hs", target,
         stdout=asyncio.subprocess.PIPE,
         stderr=asyncio.subprocess.PIPE,
     )
     out, err = await proc.communicate()
     if proc.returncode != 0:
         raise RuntimeError(f"pdu command failed: {err.decode()}")
     size_field, _ = out.decode().split()
     return BinarySize.finite_from_str(size_field)
Пример #11
0
async def test_xfs_get_used_bytes(xfs):
    """Compare ``get_used_bytes()`` with the usage shown by ``xfs_quota``."""
    vfid = uuid.uuid4()
    await xfs.create_vfolder(
        vfid, options={"quota": BinarySize.from_str("10m")})
    hex_id = vfid.hex
    vfpath = xfs.mount_path / hex_id[0:2] / hex_id[2:4] / hex_id[4:]
    inner_dir = vfpath / "inner"
    # Populate the vfolder with a few small files.
    (vfpath / "test.txt").write_bytes(b"12345")
    inner_dir.mkdir()
    (inner_dir / "hello.txt").write_bytes(b"678")
    (inner_dir / "world.txt").write_bytes(b"901")

    used_bytes = await xfs.get_used_bytes(vfid)
    full_report = await run(
        ["sudo", "xfs_quota", "-x", "-c", "report -h", xfs.mount_path], )
    vfid_str = str(vfid)
    # First report line mentioning the vfolder ID, or "" when absent.
    report = next(
        (line for line in full_report.split("\n") if vfid_str in line),
        "",
    )
    columns = report.split()
    assert len(columns) == 6
    proj_name, xfs_used, _, _, _, _ = columns
    assert vfid_str[:-5] == proj_name
    assert used_bytes == BinarySize.from_str(xfs_used)
    await xfs.delete_vfolder(vfid)
    assert not vfpath.is_dir()
Пример #12
0
 async def get_usage(self, vfid: UUID, relpath: PurePosixPath = PurePosixPath(".")):
     """Return file count and used bytes of a vfolder from ``xfs_quota``.

     ``relpath`` is accepted but not read here.  Raises ``ExecutionError``
     when the matching report line does not have exactly eleven columns
     or when the reported project name is not a prefix of ``vfid``.
     """
     full_report = await run(
         ["sudo", "xfs_quota", "-x", "-c", "report -pbih", self.mount_path],
     )
     vfid_str = str(vfid)
     # First report line mentioning the vfolder ID, or "" when absent.
     report = next(
         (line for line in full_report.split("\n") if vfid_str in line),
         "",
     )
     columns = report.split()
     if len(columns) != 11:
         raise ExecutionError("unexpected format for xfs_quota report")
     proj_name, used_size = columns[0], columns[1]
     inode_used = columns[6]
     used_bytes = int(BinarySize.finite_from_str(used_size))
     if not vfid_str.startswith(proj_name):
         raise ExecutionError("vfid and project name does not match")
     return VFolderUsage(file_count=int(inode_used), used_bytes=used_bytes)
Пример #13
0
 async def delete_vfolder(self, vfid: UUID) -> None:
     """Remove a vfolder together with its project quota entry.

     Runs entirely under the ``LOCK_FILE`` file lock.  When the vfolder
     is registered as a project, its quota is reset to zero on a
     best-effort basis (failures are logged, not raised) and the project
     entry is removed; afterwards the parent class deletes the directory
     and the project info is re-read.
     """
     async with FileLock(LOCK_FILE):
         await self.registry.read_project_info()
         is_registered = vfid in self.registry.name_id_map
         if is_registered:
             try:
                 log.info("removing project quota (f:{})", vfid)
                 await self.set_quota(vfid, BinarySize(0))
             except (asyncio.CancelledError, asyncio.TimeoutError) as e:
                 # Swallowed so that the physical directory is deleted anyway.
                 log.exception("vfolder deletion timeout", exc_info=e)
             except Exception as e:
                 # Swallowed so that the physical directory is deleted anyway.
                 log.exception("vfolder deletion error", exc_info=e)
             finally:
                 await self.registry.remove_project_entry(vfid)
         await super().delete_vfolder(vfid)
         await self.registry.read_project_info()
Пример #14
0
async def test_xfs_multiple_vfolder_mgmt(xfs):
    """Check prefix-directory cleanup for two vfolders sharing a prefix."""

    def path_of(vfid):
        # Vfolder layout: <mount>/<hex[0:2]>/<hex[2:4]>/<hex[4:]>
        hex_id = vfid.hex
        return xfs.mount_path / hex_id[0:2] / hex_id[2:4] / hex_id[4:]

    # The two IDs differ only in the last hex digit.
    vfid1 = uuid.UUID(hex="83a6ba2b7b8e41deb5ee2c909ce34bcb")
    vfid2 = uuid.UUID(hex="83a6ba2b7b8e41deb5ee2c909ce34bcc")
    options = {"quota": BinarySize.from_str("10m")}
    for vfid in (vfid1, vfid2):
        await xfs.create_vfolder(vfid, options=options)
    vfpath1 = path_of(vfid1)
    vfpath2 = path_of(vfid2)
    assert vfpath2.relative_to(vfpath1.parent).name == vfpath2.name
    assert vfpath1.is_dir()
    await xfs.delete_vfolder(vfid1)
    assert not vfpath1.exists()
    # if the prefix dirs are not empty, they shouldn't be deleted
    first_prefix = vfpath1.parent
    assert first_prefix.exists()
    assert first_prefix.parent.exists()
    await xfs.delete_vfolder(vfid2)
    # if the prefix dirs become empty, they should be deleted
    second_prefix = vfpath2.parent
    assert not second_prefix.exists()
    assert not second_prefix.parent.exists()
Пример #15
0
 async def get_attached_devices(
     self,
     device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
 ) -> Sequence[DeviceModelInfo]:
     """Describe the devices referenced by the given allocation.

     Device IDs are taken from the ``'cuda.devices'`` slot of
     ``device_alloc`` (if present) and matched against the devices
     returned by ``self.list_devices()``.
     """
     # NOTE(review): other code uses the slot name 'cuda.device'
     # (singular); confirm that 'cuda.devices' is the intended key here.
     cuda_slot = SlotName('cuda.devices')
     device_ids: List[DeviceId] = (
         list(device_alloc[cuda_slot].keys())
         if cuda_slot in device_alloc else []
     )
     available_devices = await self.list_devices()
     attached_devices: List[DeviceModelInfo] = [
         {  # TODO: update common.types.DeviceModelInfo
             'device_id': device.device_id,
             'model_name': device.model_name,
             'smp': device.processing_units,
             'mem': BinarySize(device.memory_size),
         }
         for device in available_devices
         if device.device_id in device_ids
     ]
     return attached_devices
Пример #16
0
async def test_xfs_single_vfolder_mgmt(xfs):
    """Create and delete one vfolder, checking the project registry files."""
    vfid = uuid.uuid4()
    vfid_str = str(vfid)
    hex_id = vfid.hex
    # vfolder create test
    await xfs.create_vfolder(
        vfid, options={"quota": BinarySize.from_str("10m")})
    vfpath = xfs.mount_path / hex_id[0:2] / hex_id[2:4] / hex_id[4:]
    project_id_dict = read_etc_projid()
    vfpath_id_dict = read_etc_projects()
    assert vfpath.is_dir()
    assert vfid_str in project_id_dict
    vfid_project_id = project_id_dict[vfid_str]
    # vfolder delete test
    assert vfpath_id_dict[project_id_dict[vfid_str]] == str(vfpath)
    await xfs.delete_vfolder(vfid)
    assert not vfpath.exists()
    prefix_dir = vfpath.parent
    top_dir = prefix_dir.parent
    assert not (prefix_dir.exists() and (prefix_dir / hex_id[2:4]).exists())
    assert not (top_dir.exists() and (top_dir / hex_id[0:2]).exists())
    # Re-read the registry files: the deleted vfolder must be gone.
    project_id_dict = read_etc_projid()
    vfpath_id_dict = read_etc_projects()
    assert vfid_str not in project_id_dict
    assert vfid_project_id not in vfpath_id_dict
Пример #17
0
async def empty_vfolder(xfs):
    """Yield the ID of a freshly created vfolder and delete it afterwards.

    Presumably used as a pytest fixture (the decorator is not visible here).
    """
    vfid = uuid.uuid4()
    quota_options = {"quota": BinarySize.from_str("10m")}
    await xfs.create_vfolder(vfid, options=quota_options)
    yield vfid
    # Teardown: runs once the consumer resumes the generator.
    await xfs.delete_vfolder(vfid)
Пример #18
0
 async def get_used_bytes(self, vfid: UUID) -> BinarySize:
     """Measure the size of a vfolder by running ``du -hs`` on its path.

     The output must consist of exactly two whitespace-separated fields;
     the first one is parsed as the size.
     """
     du_output = await run(["du", "-hs", self.mangle_vfpath(vfid)])
     size_field, _path_field = du_output.split()
     return BinarySize.finite_from_str(size_field)
def upgrade():
    """Create ``resource_presets`` and normalize existing policy memory.

    Creates the ``resource_presets`` table, inserts six initial presets,
    and then rewrites the ``mem`` entry of every keypair resource policy
    that has one into a ``Decimal`` parsed through ``BinarySize.from_str``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'resource_presets',
        sa.Column('name', sa.String(length=256), nullable=False),
        sa.Column('resource_slots', ResourceSlotColumn(), nullable=False),
        sa.PrimaryKeyConstraint('name', name=op.f('pk_resource_presets')))
    # Add initial fixtures for resource presets
    insert_presets_sql = '''
    INSERT INTO resource_presets
    VALUES (
        'small',
        '{"cpu":"1","mem":"2147483648"}'::jsonb
    );
    INSERT INTO resource_presets
    VALUES (
        'small-gpu',
        '{"cpu":"1","mem":"2147483648","cuda.device":"1","cuda.shares":"0.5"}'::jsonb
    );
    INSERT INTO resource_presets
    VALUES (
        'medium',
        '{"cpu":"2","mem":"4294967296"}'::jsonb
    );
    INSERT INTO resource_presets
    VALUES (
        'medium-gpu',
        '{"cpu":"2","mem":"4294967296","cuda.device":"1","cuda.shares":"1.0"}'::jsonb
    );
    INSERT INTO resource_presets
    VALUES (
        'large',
        '{"cpu":"4","mem":"8589934592"}'::jsonb
    );
    INSERT INTO resource_presets
    VALUES (
        'large-gpu',
        '{"cpu":"4","mem":"8589934592","cuda.device":"2","cuda.shares":"2.0"}'::jsonb
    );
    '''
    connection = op.get_bind()
    connection.execute(insert_presets_sql)

    select_policies_sql = '''
    SELECT name, total_resource_slots
    FROM keypair_resource_policies
    '''
    connection = op.get_bind()
    policy_rows = connection.execute(select_policies_sql)
    # Collect the converted values first, then apply the updates.
    pending_updates = []
    for policy in policy_rows:
        slots = ResourceSlot(policy['total_resource_slots'])
        if 'mem' not in slots:
            # Policies without a memory entry are left untouched.
            continue
        slots['mem'] = Decimal(BinarySize.from_str(slots['mem']))
        pending_updates.append((policy['name'], slots))
    for policy_name, new_slots in pending_updates:
        update_stmt = (
            sa.update(keypair_resource_policies)
            .values(total_resource_slots=new_slots)
            .where(keypair_resource_policies.c.name == policy_name)
        )
        connection.execute(update_stmt)