示例#1
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
    def RemoveGpu(self, gpuId):
        """
        Remove one GPU from this group.

        gpuId: ID of the GPU to take out of the group.

        Raises pydcgm.DcgmException when self._IsGroupIdStatic() reports that
        this group is static. Otherwise the status returned by
        dcgm_agent.dcgmGroupRemoveDevice is passed to
        dcgm_structs._dcgmCheckReturn (presumably raising on a non-success
        status -- confirm against dcgm_structs).
        """
        # Static groups are rejected up front, before the agent is called.
        if self._IsGroupIdStatic():
            raise pydcgm.DcgmException("Can't remove a GPU from a static group")

        status = dcgm_agent.dcgmGroupRemoveDevice(
            self._dcgmHandle.handle, self._groupId, gpuId)
        dcgm_structs._dcgmCheckReturn(status)
示例#2
0
def dcgm_group_test_default_group(handle, gpuIds):
    """
    Verify that the default group lists every GPU and refuses to be modified or destroyed.

    handle: DCGM handle, used both to build a pydcgm.DcgmHandle and for direct dcgm_agent calls.
    gpuIds: GPU IDs the default group is expected to contain; must not be empty.

    Several checks call dcgm_agent directly rather than going through the group object,
    because the group object guards against operations on the default group itself.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    defaultGroup = dcgmSystem.GetDefaultGroup()

    assert len(gpuIds) > 0, "Failed to get devices from the node"

    # Asking for info on 9999 (presumably a group ID that does not exist) must be rejected.
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupGetInfo(handle, 9999)

    # The default group must report exactly the GPU IDs we were given.
    reportedGpuIds = defaultGroup.GetGpuIds()
    assert gpuIds == reportedGpuIds, "Expected gpuId list match %s != %s" % (
        str(gpuIds), str(reportedGpuIds))

    # Every entity in the default group must be a GPU, and together they must
    # yield the same ID list.
    entityGpuIds = []
    for groupEntity in defaultGroup.GetEntities():
        assert groupEntity.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(
            groupEntity.entityGroupId)
        entityGpuIds.append(groupEntity.entityId)
    assert gpuIds == entityGpuIds, "Expected gpuId list to match entity list: %s != %s" % (
        str(gpuIds), str(entityGpuIds))

    # Removing any GPU must fail on both paths: the raw agent call with
    # DCGM_ST_NOT_CONFIGURED, and the group object with pydcgm.DcgmException.
    for gpuId in gpuIds:
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
            dcgm_agent.dcgmGroupRemoveDevice(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS, gpuId)
        with test_utils.assert_raises(pydcgm.DcgmException):
            defaultGroup.RemoveGpu(gpuId)

    # Destroying the default group must be rejected as well.
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupDestroy(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS)