示例#1
0
def helper_dcgm_verify_sync_boost_single_gpu(handle, gpuIds):
    """
    Verify that enabling sync boost on a single-GPU group is rejected.

    Creates an empty group named "test1", adds only gpuIds[0] to it and checks
    that groupObj.config.Set() with syncBoost = 1 raises the exception class
    for DCGM_ST_BADPARAM. The group is deleted before returning, including
    when the check fails.

    handle -- DCGM handle, wrapped in a pydcgm.DcgmHandle
    gpuIds -- GPU ids available to the test; only the first entry is used
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    # Delete the group even if a call or assertion below fails, so a failing
    # run does not skip the cleanup.
    try:
        ## Add first GPU to the group
        groupObj.AddGpu(gpuIds[0])
        gpuIds = groupObj.GetGpuIds()  #Only reference GPUs we are testing against

        ## Set the sync boost for the group
        config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
        config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPerfState.syncBoost = 1
        config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
        config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

        #Config Set must return DCGM_ST_BADPARAM since we only have a single GPU
        with test_utils.assert_raises(
                dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
            groupObj.config.Set(config_values)
    finally:
        groupObj.Delete()
示例#2
0
def vtDcgmConfigSet(dcgm_handle, group_id, configToSet, status_handle,
                    versionTest):
    """
    Version-test wrapper around the "dcgmConfigSet" entry point.

    Logs the version computed for a fresh c_dcgmDeviceConfig_v1, overwrites
    configToSet.version with versionTest and passes configToSet by reference
    to the looked-up function. The return code goes through
    dcgm_structs._dcgmCheckReturn and is then returned to the caller.
    """
    config_set_fn = dcgmFP("dcgmConfigSet")

    # Reference structure, used only to log what version 1 looks like
    reference_config = dcgm_structs.c_dcgmDeviceConfig_v1()
    reference_config.version = dcgm_structs.make_dcgm_version(
        reference_config, 1)
    logger.debug("Structure version: %d" % reference_config.version)

    # Stamp the caller's structure with the version under test
    configToSet.version = versionTest
    status = config_set_fn(dcgm_handle, group_id, byref(configToSet),
                           status_handle)
    dcgm_structs._dcgmCheckReturn(status)
    return status
示例#3
0
def helper_dcgm_config_set(handle):
    """
    Apply a configuration with blank values to the default group.

    ECC mode, sync boost, both target clocks, the power limit value and the
    compute mode are all set to DCGM_INT32_BLANK before calling
    config.Set() on the default group.
    """
    dcgm_handle = pydcgm.DcgmHandle(handle=handle)
    dcgm_system = dcgm_handle.GetSystem()
    default_group = dcgm_system.GetDefaultGroup()

    blank = dcgmvalue.DCGM_INT32_BLANK
    blank_config = dcgm_structs.c_dcgmDeviceConfig_v1()
    blank_config.mEccMode = blank
    blank_config.mComputeMode = blank
    blank_config.mPowerLimit.val = blank
    blank_config.mPerfState.syncBoost = blank
    blank_config.mPerfState.targetClocks.memClock = blank
    blank_config.mPerfState.targetClocks.smClock = blank

    # No return code to inspect here: the original notes Set() throws on error
    default_group.config.Set(blank_config)
示例#4
0
def helper_test_config_config_power_enforce(handle, gpuIds):
    """
    Check that config.Enforce() restores the DCGM-configured power limit after
    the limit has been changed externally through nvidia-smi.

    Steps: set the midpoint of the min/max power limits through DCGM, verify
    it, set the max limit with nvidia-smi, verify it, call Enforce() and
    verify the midpoint value again. The test is skipped when the GPU reports
    a blank max power limit.
    """
    dcgm_handle = pydcgm.DcgmHandle(handle=handle)
    dcgm_system = dcgm_handle.GetSystem()
    group = dcgm_system.GetEmptyGroup("test1")

    # Only the first GPU is tested; gpuIds is then re-read from the group
    group.AddGpu(gpuIds[0])
    gpuIds = group.GetGpuIds()
    target_gpu = gpuIds[0]

    # Min/max power limits come from the GPU attributes
    attributes = dcgm_system.discovery.GetGpuAttributes(target_gpu)
    limits = attributes.powerLimits

    if dcgmvalue.DCGM_INT32_IS_BLANK(limits.maxPowerLimit):
        test_utils.skip_test("Needs Power limit to be supported on the GPU")

    # DCGM gets the midpoint, nvidia-smi later gets the maximum
    dcgm_limit = int((limits.maxPowerLimit + limits.minPowerLimit) / 2)
    smi_limit = limits.maxPowerLimit

    blank = dcgmvalue.DCGM_INT32_BLANK
    power_config = dcgm_structs.c_dcgmDeviceConfig_v1()
    power_config.mEccMode = blank
    power_config.mComputeMode = blank
    power_config.mPerfState.syncBoost = blank
    power_config.mPerfState.targetClocks.memClock = blank
    power_config.mPerfState.targetClocks.smClock = blank
    power_config.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL
    power_config.mPowerLimit.val = dcgm_limit

    group.config.Set(power_config)

    logger.info("Verify if dcgmi configured value has taken effect")
    helper_verify_power_value(group, dcgm_limit)

    # Change the limit behind DCGM's back using nvidia-smi
    smi_args = ["-pl", str(smi_limit), "-i", str(target_gpu)]
    smi_exit_code = apps.NvidiaSmiApp(smi_args).run()
    assert 0 == smi_exit_code, "Nvidia smi couldn't set the power limit"

    logger.info("Verify if nvsmi configured value has taken effect")
    helper_verify_power_value(group, smi_limit)

    group.config.Enforce()

    logger.info("Verify if dcgmi enforced value has taken effect")
    helper_verify_power_value(group, dcgm_limit)
示例#5
0
def helper_dcgm_config_powerbudget(handle, gpuIds):
    """
    Set a group power budget and verify the per-GPU limits read back.

    A budget of (midpoint of min/max power limit) * (number of GPUs in the
    group) is set with type DCGM_CONFIG_POWER_BUDGET_GROUP. The current
    configuration is then expected to report, for every GPU whose power limit
    value is not DCGM_INT32_NOT_SUPPORTED, type
    DCGM_CONFIG_POWER_CAP_INDIVIDUAL and the midpoint value. The test is
    skipped when the GPU reports a blank max power limit.
    """
    dcgm_handle = pydcgm.DcgmHandle(handle=handle)
    dcgm_system = dcgm_handle.GetSystem()
    group = dcgm_system.GetEmptyGroup("test1")

    # Only the first GPU is tested; gpuIds is then re-read from the group
    group.AddGpu(gpuIds[0])
    gpuIds = group.GetGpuIds()
    gpu_count = len(gpuIds)

    # Min/max power limits of the first GPU in the group
    attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, gpuIds[0])
    limits = attributes.powerLimits

    if dcgmvalue.DCGM_INT32_IS_BLANK(limits.maxPowerLimit):
        test_utils.skip_test("Needs Power limit to be supported on the GPU")

    per_gpu_limit = int((limits.maxPowerLimit + limits.minPowerLimit) / 2)

    blank = dcgmvalue.DCGM_INT32_BLANK
    budget_config = dcgm_structs.c_dcgmDeviceConfig_v1()
    budget_config.mEccMode = blank
    budget_config.mComputeMode = blank
    budget_config.mPerfState.syncBoost = blank
    budget_config.mPerfState.targetClocks.memClock = blank
    budget_config.mPerfState.targetClocks.smClock = blank
    budget_config.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_BUDGET_GROUP
    # The budget is shared by the group, so this assumes homogeneous GPUs
    budget_config.mPowerLimit.val = per_gpu_limit * gpu_count

    group.config.Set(budget_config)

    current_configs = group.config.Get(dcgm_structs.DCGM_CONFIG_CURRENT_STATE)
    assert len(current_configs) > 0, \
        "Failed to get configuration using groupObj.config.Get"

    expected_type = dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL
    for index in range(gpu_count):
        reported = current_configs[index].mPowerLimit
        # GPUs without power limit support are not checked
        if reported.val == dcgmvalue.DCGM_INT32_NOT_SUPPORTED:
            continue
        assert reported.type == expected_type, \
            "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" % (
                index, reported.type, expected_type)
        assert reported.val == per_gpu_limit, \
            "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %s" % (
                index, reported.val, per_gpu_limit)
示例#6
0
def helper_dcgm_config_get(handle):
    """
    Set blank configuration values on the default group and verify that the
    target configuration read back reports blank for each checked field.

    The fields checked per GPU are ECC mode, memory and SM target clocks,
    power limit value and compute mode. Each mismatch fails with a message
    naming the field that did not match.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()

    ## Set the configuration first
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK

    #Will throw exception on error
    groupObj.config.Set(config_values)

    ## Get the target configuration to make sure that it's exact same as the one configured
    config_values = groupObj.config.Get(dcgm_structs.DCGM_CONFIG_TARGET_STATE)

    gpuIds = groupObj.GetGpuIds()

    ## Loop through config_values to check for correctness of values fetched from the hostengine
    for x in range(0, len(gpuIds)):
        assert config_values[
            x].mEccMode == dcgmvalue.DCGM_INT32_BLANK, "Failed to get matching value for ecc mode. Expected: %d Received: %d" % (
                dcgmvalue.DCGM_INT32_BLANK, config_values[x].mEccMode)
        assert config_values[
            x].mPerfState.targetClocks.memClock == dcgmvalue.DCGM_INT32_BLANK, "Failed to get matching value for mem app clk. Expected: %d Received: %d" % (
                dcgmvalue.DCGM_INT32_BLANK,
                config_values[x].mPerfState.targetClocks.memClock)
        assert config_values[
            x].mPerfState.targetClocks.smClock == dcgmvalue.DCGM_INT32_BLANK, "Failed to get matching value for proc app clk. Expected: %d Received: %d" % (
                dcgmvalue.DCGM_INT32_BLANK,
                config_values[x].mPerfState.targetClocks.smClock)
        assert config_values[
            x].mPowerLimit.val == dcgmvalue.DCGM_INT32_BLANK, "Failed to get matching value for power limit. Expected: %d Received: %d" % (
                dcgmvalue.DCGM_INT32_BLANK, config_values[x].mPowerLimit.val)
        # This message previously said "power limit", which misreported which
        # field had failed.
        assert config_values[
            x].mComputeMode == dcgmvalue.DCGM_INT32_BLANK, "Failed to get matching value for compute mode. Expected: %d Received: %d" % (
                dcgmvalue.DCGM_INT32_BLANK, config_values[x].mComputeMode)
示例#7
0
def helper_dcgm_verify_sync_boost_multi_gpu(handle, gpuIds):
    """
    Verify that setting sync boost on a multi-GPU group is reported as not
    supported.

    Adds every id in gpuIds to an empty group named "test1" and checks that
    groupObj.config.Set() raises the exception class for
    DCGM_ST_NOT_SUPPORTED both with syncBoost = 1 and with syncBoost = 0.
    The group is deleted before returning, including when a check fails.

    handle -- DCGM handle, wrapped in a pydcgm.DcgmHandle
    gpuIds -- GPU ids to add to the group; the test is skipped when fewer
              than two are given
    """
    # Check the precondition before creating the group so that a skipped run
    # does not leave a group behind.
    if len(gpuIds) < 2:
        test_utils.skip_test(
            "This test only works with 2 or more identical GPUs")

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    # Delete the group even if a call or assertion below fails.
    try:
        ## Add all identical GPUs to the group
        for gpuId in gpuIds:
            groupObj.AddGpu(gpuId)

        gpuIds = groupObj.GetGpuIds()  #Only reference GPUs we are testing against

        ## Set the sync boost for the group
        config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
        config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPerfState.syncBoost = 1
        config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
        config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

        #Enable sync boost - expected to raise DCGM_ST_NOT_SUPPORTED
        with test_utils.assert_raises(
                dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NOT_SUPPORTED)):
            groupObj.config.Set(config_values)

        config_values.mPerfState.syncBoost = 0

        #Disable sync boost - expected to raise DCGM_ST_NOT_SUPPORTED as well
        with test_utils.assert_raises(
                dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NOT_SUPPORTED)):
            groupObj.config.Set(config_values)
    finally:
        groupObj.Delete()
示例#8
0
def test_dcgm_vgpu_config_set_validate(handle):
    """
    Validates structure version

    Calls vtDcgmVgpuConfigSet with two invalid structure versions and expects
    the exception class for DCGM_ST_VER_MISMATCH each time.
    """
    group = dcgm_agent.dcgmGroupCreate(handle,
                                       dcgm_structs.DCGM_GROUP_DEFAULT,
                                       "test1")
    status = dcgm_agent.dcgmStatusCreate()
    device_config = dcgm_structs.c_dcgmDeviceConfig_v1()

    # 0 is an invalid version, 50 is a random invalid version
    for bad_version in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmVgpuConfigSet(handle, group, device_config, status,
                                bad_version)
示例#9
0
def test_dcgm_default_status_handler(handle, gpuIds):
    """
    Run config Set, Get and Enforce on a single-GPU group through the pydcgm
    group object, without the test supplying a status handle of its own.

    The configuration written has every value blank except the power limit
    type, which is DCGM_CONFIG_POWER_CAP_INDIVIDUAL. Get() must return at
    least one entry.
    """
    dcgm_handle = pydcgm.DcgmHandle(handle=handle)
    dcgm_system = dcgm_handle.GetSystem()
    group = dcgm_system.GetEmptyGroup("test1")

    # Only the first GPU is tested; gpuIds is then re-read from the group
    group.AddGpu(gpuIds[0])
    gpuIds = group.GetGpuIds()

    blank = dcgmvalue.DCGM_INT32_BLANK
    requested = dcgm_structs.c_dcgmDeviceConfig_v1()
    requested.mEccMode = blank
    requested.mComputeMode = blank
    requested.mPerfState.syncBoost = blank
    requested.mPerfState.targetClocks.memClock = blank
    requested.mPerfState.targetClocks.smClock = blank
    requested.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL
    requested.mPowerLimit.val = blank

    group.config.Set(requested)

    current = group.config.Get(dcgm_structs.DCGM_CONFIG_CURRENT_STATE)
    assert len(current) > 0, "Failed to work with NULL status handle"

    group.config.Enforce()
示例#10
0
                                        dcgm_structs.c_dcgmGroupInfo_version2)
# NOTE(review): this excerpt starts mid-function; handle, groupId, groupInfo,
# autoBoost_set, mem_clk_set, proc_clk_set and powerLimit_set are not
# assigned in the visible portion and are presumably set up earlier.

# Read the group's current configuration, passing 0 in place of a status handle.
config_values = dcgm_agent.dcgmConfigGet(
    handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count,
    0)
assert len(config_values) > 0, "Failed to work with NULL status handle"
eccmodeOnGroupExisting = config_values[0].mEccMode

# Choose the opposite of the ECC mode reported by the first entry.
if eccmodeOnGroupExisting == 0:
    ecc_set = 1
else:
    ecc_set = 0

syncboost_set = 1
compute_set = dcgm_structs.DCGM_CONFIG_COMPUTEMODE_DEFAULT

# Build the configuration to apply; min and max VP state receive the same
# memory and processor clock values.
config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
config_values.mEccMode = ecc_set
config_values.mPerfState.syncBoost = syncboost_set
config_values.mPerfState.autoBoost = autoBoost_set
config_values.mPerfState.minVPState.memClk = mem_clk_set
config_values.mPerfState.minVPState.procClk = proc_clk_set
config_values.mPerfState.maxVPState.memClk = mem_clk_set
config_values.mPerfState.maxVPState.procClk = proc_clk_set
config_values.mComputeMode = compute_set
config_values.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL
config_values.mPowerLimit.val = powerLimit_set

## Set Config and verify the value
status_handle = dcgm_agent.dcgmStatusCreate()
# NOTE(review): the call below passes `statusHandle`, which is not assigned in
# the visible portion, while `status_handle` is created on the line above and
# read on the line after -- confirm which name is intended.
ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values, statusHandle)
errors = helper_get_status_list(status_handle)
示例#11
0
def test_dcgm_configure_ecc_mode(handle, gpuIds):
    """
    Toggle the ECC mode of one ECC-capable GPU and verify the new mode.

    The first GPU whose DCGM_FI_DEV_ECC_CURRENT value is not
    DCGM_INT64_NOT_SUPPORTED is added to an empty group "test1". The group's
    current ECC mode is read, the opposite mode is set, and after a short
    sleep the current configuration is compared against the requested mode.
    When the pending and current ECC field values differ, only a warning is
    logged instead of asserting.

    The test is skipped when no such GPU exists or when the set call records
    any error on the status handle.

    NOTE(review): the first statement unconditionally calls
    test_utils.skip_test(); if that call raises (presumably it does -- TODO
    confirm), nothing below it runs.
    """
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    # Look for the first GPU that reports a supported current-ECC field
    ecc_gpu = -1
    for candidate in gpuIds:
        probe = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, candidate, [dcgm_fields.DCGM_FI_DEV_ECC_CURRENT])
        if probe[0].value.i64 == dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            continue
        ecc_gpu = candidate
        break

    if ecc_gpu == -1:
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    add_status = dcgm_agent.dcgmGroupAddDevice(handle, groupId, ecc_gpu)
    assert add_status == dcgm_structs.DCGM_ST_OK, \
        "Failed to add a device to the group %d. Return %d" % (
            groupId.value, add_status)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    def fetch_current_config():
        # Read the group's current configuration and require a non-empty result
        fetched = dcgm_agent.dcgmConfigGet(
            handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
            groupInfo.count, status_handle)
        assert len(fetched) > 0, \
            "Failed to get configuration using dcgmConfigGet"
        return fetched

    def clear_status_handle():
        # Empty the status handle so it only holds errors from the next call
        clear_status = dcgm_agent.dcgmStatusClear(status_handle)
        assert clear_status == dcgm_structs.DCGM_ST_OK, \
            "Failed to clear the status handle. Return %d" % clear_status

    # The mode to request is the opposite of the mode currently reported
    original_mode = fetch_current_config()[0].mEccMode
    requested_mode = 1 if original_mode == 0 else 0

    blank = dcgmvalue.DCGM_INT32_BLANK
    toggle_config = dcgm_structs.c_dcgmDeviceConfig_v1()
    toggle_config.mEccMode = requested_mode
    toggle_config.mComputeMode = blank
    toggle_config.mPerfState.syncBoost = blank
    toggle_config.mPerfState.targetClocks.memClock = blank
    toggle_config.mPerfState.targetClocks.smClock = blank
    toggle_config.mPowerLimit.type = blank
    toggle_config.mPowerLimit.val = blank

    clear_status_handle()

    # A DCGMError from the set call is deliberately ignored here; failures
    # are picked up from the status handle right below.
    try:
        dcgm_agent.dcgmConfigSet(handle, groupId, toggle_config, status_handle)
    except dcgm_structs.DCGMError:
        pass

    for error in helper_get_status_list(status_handle):
        if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
            skip_template = "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
        else:
            skip_template = "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
        test_utils.skip_test(skip_template % (error.fieldId, error.status))

    #Sleep after reset
    time.sleep(2)

    clear_status_handle()
    updated_config = fetch_current_config()

    ecc_fields = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, ecc_gpu, [
            dcgm_fields.DCGM_FI_DEV_ECC_PENDING,
            dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
        ])
    pending_ecc = ecc_fields[0].value.i64
    current_ecc = ecc_fields[1].value.i64

    # With pending != current the new mode has not taken effect, so only warn
    if pending_ecc != current_ecc:
        logger.warning(
            "Pending ECC %d != Current ECC %d for gpuId %d. Box probably needs a reboot"
            % (pending_ecc, current_ecc, ecc_gpu))
        return

    assert updated_config[0].mEccMode == requested_mode, \
        "ECC mode %d different from the set value %d" % (
            updated_config[0].mEccMode, requested_mode)