Пример #1
0
def agent_worker_function(handle, groupId, groupInfo, status_handle):
    """Poll and print the current configuration of every GPU in a group.

    Performs NUM_ITERATIONS passes. Each pass refreshes the DCGM fields,
    fetches the group's current configuration, inspects and clears the
    status handle, and prints one record per GPU. Consecutive passes are
    separated by a two second sleep; there is no sleep after the last one.

    Args:
        handle: DCGM handle forwarded to every ``dcgm_agent`` call.
        groupId: Identifier of the group whose configuration is read.
        groupInfo: Group description; only its ``count`` attribute is used,
            as the number of entries requested and printed.
        status_handle: Status handle passed to ``dcgmConfigGet``; it is
            inspected and cleared on every pass.
    """
    NUM_ITERATIONS = 5

    for iteration in range(NUM_ITERATIONS):
        # Pause between polls (not before the first, not after the last).
        if iteration:
            sleep(2)

        dcgm_agent.dcgmUpdateAllFields(handle, 1)

        ## Get the current configuration for the group
        config_values = dcgm_agent.dcgmConfigGet(
            handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
            groupInfo.count, status_handle)

        ## This is a group operation, so check the status handle for
        ## per-property failures, then reset it for the next pass.
        helper_investigate_status(status_handle)
        dcgm_agent.dcgmStatusClear(status_handle)

        ## Display current configuration for the group.
        ## Single-argument print(...) yields the same output under the
        ## Python 2 print statement and the Python 3 print function.
        for x in range(groupInfo.count):
            gpu_config = config_values[x]
            perf_state = gpu_config.mPerfState
            print("GPU Id      : %d" % (gpu_config.gpuId))
            print("Ecc  Mode   : %s" % (convert_value_to_string(gpu_config.mEccMode)))
            print("Auto Boost  : %s" % (convert_value_to_string(perf_state.autoBoost)))
            # Fixed: this line used to print autoBoost a second time.
            # NOTE(review): assumes mPerfState exposes ``syncBoost`` in this
            # struct version -- confirm against dcgm_structs.
            print("Sync Boost  : %s" % (convert_value_to_string(perf_state.syncBoost)))
            print("Mem Clock   : %s" % (convert_value_to_string(perf_state.minVPState.memClk)))
            print("SM  Clock   : %s" % (convert_value_to_string(perf_state.minVPState.procClk)))
            print("Power Limit : %s" % (convert_value_to_string(gpu_config.mPowerLimit.val)))
            print("Compute Mode: %s" % (convert_value_to_string(gpu_config.mComputeMode)))
            print("\n")
Пример #2
0
def test_dcgm_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode of one GPU through the config API and verify it.

    The test is currently skipped up front via ``test_utils.skip_test``.
    The remaining body creates an empty group, adds the first GPU whose
    ECC_CURRENT field is supported, flips its ECC mode with
    ``dcgmConfigSet`` and compares the configuration read back afterwards
    with the requested value.

    Args:
        handle: DCGM handle forwarded to every agent call.
        gpuIds: Iterable of GPU ids to probe for ECC support.
    """
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(
        handle, dcgm_structs.DCGM_GROUP_EMPTY, "test1")

    # Pick the first GPU that reports a supported ECC_CURRENT value.
    validDevice = -1
    for gpuId in gpuIds:
        latest = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, gpuId, [dcgm_fields.DCGM_FI_DEV_ECC_CURRENT])
        if latest[0].value.i64 == dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            continue
        validDevice = gpuId
        break

    if validDevice == -1:
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert ret == dcgm_structs.DCGM_ST_OK, (
        "Failed to add a device to the group %d. Return %d"
        % (groupId.value, ret))

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    # One status handle is reused for every config call below.
    status_handle = dcgm_agent.dcgmStatusCreate()

    # Snapshot the configuration before touching it.
    original_config = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(original_config) > 0, \
        "Failed to get configuration using dcgmConfigGet"

    # Request the opposite of whatever ECC mode is active right now.
    eccmodeOnGroupExisting = original_config[0].mEccMode
    eccmodeOnGroupToSet = 1 if eccmodeOnGroupExisting == 0 else 0

    # Only the ECC mode is set; every other setting is assigned the
    # BLANK constant.
    blank = dcgmvalue.DCGM_INT32_BLANK
    requested_config = dcgm_structs.c_dcgmDeviceConfig_v1()
    requested_config.mEccMode = eccmodeOnGroupToSet
    requested_config.mPerfState.syncBoost = blank
    requested_config.mPerfState.targetClocks.memClock = blank
    requested_config.mPerfState.targetClocks.smClock = blank
    requested_config.mComputeMode = blank
    requested_config.mPowerLimit.type = blank
    requested_config.mPowerLimit.val = blank

    # Start from a clean status handle so it only holds errors from the set.
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    # A DCGMError raised here is ignored on purpose: failures are examined
    # through the status list gathered right after the call.
    try:
        dcgm_agent.dcgmConfigSet(handle, groupId, requested_config,
                                 status_handle)
    except dcgm_structs.DCGMError:
        pass

    for error in helper_get_status_list(status_handle):
        if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
            skip_template = "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
        else:
            skip_template = "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
        test_utils.skip_test(skip_template % (error.fieldId, error.status))

    # Sleep after reset
    time.sleep(2)

    # Clear the status handle again before reading the configuration back.
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    updated_config = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(updated_config) > 0, \
        "Failed to get configuration using dcgmConfigGet"

    fvs = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, validDevice,
        [dcgm_fields.DCGM_FI_DEV_ECC_PENDING,
         dcgm_fields.DCGM_FI_DEV_ECC_CURRENT])
    pending_ecc = fvs[0].value.i64
    current_ecc = fvs[1].value.i64

    if pending_ecc == current_ecc:
        # Pending and current agree, so the read-back mode must match the
        # requested one.
        assert updated_config[0].mEccMode == eccmodeOnGroupToSet, (
            "ECC mode %d different from the set value %d"
            % (updated_config[0].mEccMode, eccmodeOnGroupToSet))
    else:
        # Pending differs from current: only warn, do not fail.
        logger.warning(
            "Pending ECC %d != Current ECC %d for gpuId %d. Box probably needs a reboot"
            % (pending_ecc, current_ecc, validDevice))
Пример #3
0
config_values.mPowerLimit.val = powerLimit_set

## Set Config and verify the value.
## Fixed: the handle was created as ``status_handle`` but passed around as
## ``statusHandle``. One name is now used for create / set / clear / destroy,
## so the error list is read from the handle that dcgmConfigSet filled in.
## NOTE(review): if an earlier, unseen part of the script already defines
## ``statusHandle``, confirm that handle is destroyed elsewhere.
status_handle = dcgm_agent.dcgmStatusCreate()
# The return code is deliberately not asserted; per-field failures are
# inspected through the status list instead.
ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values, status_handle)
errors = helper_get_status_list(status_handle)

## If the ECC field reported an error, verify against the previous ECC
## mode instead of the requested one.
ecc_to_verify = ecc_set
for error in errors:
    if error.fieldId == dcgm_fields.DCGM_FI_DEV_ECC_CURRENT:
        ecc_to_verify = eccmodeOnGroupExisting

dcgm_agent.dcgmStatusClear(status_handle)
helper_verify_config_values_standalone(handle, groupId, powerLimit_set,
                                       ecc_to_verify, proc_clk_set,
                                       mem_clk_set, compute_set, syncboost_set,
                                       autoBoost_set)

# Single-argument print(...) yields the same output under the Python 2
# print statement and the Python 3 print function.
print("Verification Successful")

## Release the group and the status handle created for this run.
ret = dcgm_agent.dcgmGroupDestroy(handle, groupId)
assert (ret == dcgm_structs.DCGM_ST_OK
        ), "Failed to remove the test group, error: %s" % ret

ret = dcgm_agent.dcgmStatusDestroy(status_handle)
assert (ret == dcgm_structs.DCGM_ST_OK
        ), "Failed to remove status handler, error: %s" % ret
Пример #4
0
def test_dcgm_vgpu_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode of one GPU through the vGPU config API and verify it.

    The test is currently skipped up front via ``test_utils.skip_test``.
    The remaining body creates an empty group, adds the first GPU whose
    probed field is supported, flips its ECC mode with
    ``dcgmVgpuConfigSet`` and asserts that the configuration read back
    afterwards carries the requested mode.

    Args:
        handle: DCGM handle forwarded to every agent call.
        gpuIds: Iterable of GPU ids to probe.
    """
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    # NOTE(review): the non-vGPU variant of this test probes
    # DCGM_FI_DEV_ECC_CURRENT here; confirm RETIRED_DBE is the intended field.
    validDevice = -1
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [dcgm_fields.DCGM_FI_DEV_RETIRED_DBE])
        if fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            validDevice = x
            # Fixed: ``break`` used to sit at loop level, so the search
            # stopped after the first GPU even when it was unsupported.
            break

    if validDevice == -1:
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    # Fixed: format ``groupId.value`` (as the non-vGPU test does) instead of
    # passing the group id object itself to ``%d``.
    assert (
        ret == dcgm_structs.DCGM_ST_OK
    ), "Failed to add a device to the group %d. Return %d" % (groupId.value,
                                                              ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    #Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    vgpu_config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(vgpu_config_values) > 0, \
        "Failed to get configuration using dcgmVgpuConfigGet"

    ## Request the opposite of the ECC mode that is active right now
    eccmodeOnGroupExisting = vgpu_config_values[0].mEccMode
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    ## Toggle the ECC mode on the group; the other settings are assigned
    ## the BLANK constant.
    vgpu_config_values = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpu_config_values.mEccMode = eccmodeOnGroupToSet
    vgpu_config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    # A DCGMError raised here is ignored on purpose: failures are examined
    # through the status list gathered right after the call.
    try:
        dcgm_agent_internal.dcgmVgpuConfigSet(handle, groupId,
                                              vgpu_config_values,
                                              status_handle)
    except dcgm_structs.DCGMError:
        pass

    errors = helper_get_status_list(status_handle)

    for error in errors:
        if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
            test_utils.skip_test(
                "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                % (error.fieldId, error.status))
        else:
            test_utils.skip_test(
                "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                % (error.fieldId, error.status))

    #Sleep after reset and then apply update for it to occur
    time.sleep(2)

    dcgm_agent.dcgmUpdateAllFields(handle, 1)

    #Clear the status handle before reading the configuration back
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Get the current configuration
    config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(config_values) > 0, \
        "Failed to get configuration using dcgmVgpuConfigGet"

    assert config_values[0].mEccMode == eccmodeOnGroupToSet, (
        "ECC mode %d different from the set value %d"
        % (config_values[0].mEccMode, eccmodeOnGroupToSet))