def test_dcgm_vgpu_config_get_validate(handle):
    """Verify that vtDcgmVgpuConfigGet rejects invalid structure versions."""
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    system = dcgmHandle.GetSystem()
    allGpuIds = system.discovery.GetAllGpuIds()
    assert len(allGpuIds) >= 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    statusHandle = dcgm_agent.dcgmStatusCreate()

    # Both the all-zero version and an arbitrary bogus version number must be
    # rejected with DCGM_ST_VER_MISMATCH.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmVgpuConfigGet(handle, groupId,
                                dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                groupInfo.count, statusHandle, badVersion)
def helper_verify_power_value_standalone(handle, groupId, expected_power):
    """Check that every GPU in the group that supports power management reports
    the expected individual power limit."""
    groupInfo = dcgm_agent.dcgmGroupGetInfo(
        handle, groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    statusHandle = dcgm_agent.dcgmStatusCreate()
    configValues = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, statusHandle)
    assert len(configValues) > 0, "Failed to get configuration using dcgmConfigGet"

    for idx in range(groupInfo.count):
        powerLimit = configValues[idx].mPowerLimit
        # GPUs without power-management support are skipped entirely.
        if powerLimit.val == dcgmvalue.DCGM_INT32_NOT_SUPPORTED:
            continue
        assert powerLimit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
            "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" \
            % (idx, powerLimit.type, dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)
        assert powerLimit.val == expected_power, \
            "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (idx, powerLimit.val, expected_power)

    ret = dcgm_agent.dcgmStatusDestroy(statusHandle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to remove status handler, error: %s" % ret
def GetGpuIds(self):
    """Return the entity IDs of the GPU entities (only) in this group."""
    info = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle, self._groupId)
    members = info.entityList[0:info.count]
    # Non-GPU entities (NvSwitches, instances, ...) are filtered out.
    return [ent.entityId for ent in members
            if ent.entityGroupId == dcgm_fields.DCGM_FE_GPU]
def helper_verify_config_values_standalone(handle, groupId, expected_power, expected_ecc,
                                           expected_proc_clock, expected_mem_clock,
                                           expected_compute_mode, expected_sync_boost,
                                           expected_auto_boost):
    """
    Helper method to verify all the values of the group's current configuration
    are as expected.

    Raises AssertionError naming the first mismatching field and GPU index.
    """
    groupInfo = dcgm_agent.dcgmGroupGetInfo(
        handle, groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    status_handle = dcgm_agent.dcgmStatusCreate()
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count,
        status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # Fixed: was Python-2-only xrange(); the rest of this file targets Python 3.
    for x in range(0, groupInfo.count):
        assert config_values[x].mPowerLimit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
            "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" \
            % (x, config_values[x].mPowerLimit.type, dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)
        # Fixed: this message was a string literal broken across a physical line
        # (a syntax error); rejoined into one literal.
        assert config_values[x].mPowerLimit.val == expected_power, \
            "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPowerLimit.val, expected_power)
        assert config_values[x].mPerfState.syncBoost == expected_sync_boost, \
            "The syncboost value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPerfState.syncBoost, expected_sync_boost)
        assert config_values[x].mPerfState.autoBoost == expected_auto_boost, \
            "The autoboost value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPerfState.autoBoost, expected_auto_boost)
        # Fixed: message formatting was missing the [x] index, which would raise
        # AttributeError instead of showing the real assertion failure.
        assert config_values[x].mPerfState.minVPState.memClk == expected_mem_clock, \
            "The min mem clock value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPerfState.minVPState.memClk, expected_mem_clock)
        assert config_values[x].mPerfState.minVPState.procClk == expected_proc_clock, \
            "The min proc clock value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPerfState.minVPState.procClk, expected_proc_clock)
        assert config_values[x].mComputeMode == expected_compute_mode, \
            "The compute mode value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mComputeMode, expected_compute_mode)
        assert config_values[x].mEccMode == expected_ecc, \
            "The ecc mode value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mEccMode, expected_ecc)

    ret = dcgm_agent.dcgmStatusDestroy(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to remove status handler, error: %s" % ret
def main():
    """Run the sample agent worker against a manually-driven DCGM engine.

    Manual operation mode means execution is controlled by the monitoring agent:
    the user must periodically call APIs such as dcgmEnginePolicyTrigger and
    dcgmEngineUpdateAllFields so DCGM wakes up to collect data and perform the
    operations needed for policy management.
    """
    with RunDCGM('127.0.0.1', dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle:

        # The default group contains every GPU on the node; the call returns an
        # opaque handle (groupId) identifying the newly created group.
        groupId = dcgm_agent.dcgmGroupCreate(handle,
                                             dcgm_structs.DCGM_GROUP_DEFAULT,
                                             "all_gpus_group")

        # Fetch information on the newly created group.
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

        # A status handle collects per-device statuses for operations performed
        # on one or more devices in the group.
        status_handle = dcgm_agent.dcgmStatusCreate()

        # The worker could run inline; here it runs as a separate thread so the
        # main thread stays free for other useful work until join().
        worker = Thread(target=agent_worker_function, args=(handle, groupId))
        worker.start()

        ##########################################
        # Any other useful work can be placed here
        ##########################################

        worker.join()
        print("Worker thread completed")

        # Tear down the group, then the status handle; either failure is fatal.
        try:
            dcgm_agent.dcgmGroupDestroy(handle, groupId)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove the test group, error: %s" % e, file=sys.stderr)
            sys.exit(1)

        try:
            dcgm_agent.dcgmStatusDestroy(status_handle)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove status handler, error: %s" % e, file=sys.stderr)
            sys.exit(1)
def dcgm_group_test_default_group(handle, gpuIds):
    """
    Test that the default group can not be deleted, or manipulated and is
    returning all GPUs.

    Note that we're not using groupObj for some tests because it protects
    against operations on the default group.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupObj = handleObj.GetSystem().GetDefaultGroup()

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    # Querying a bogus group id must fail with NOT_CONFIGURED.
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupGetInfo(handle, 9999)

    groupGpuIdList = groupObj.GetGpuIds()
    assert gpuIdList == groupGpuIdList, \
        "Expected gpuId list match %s != %s" % (str(gpuIdList), str(groupGpuIdList))

    # The entity view of the default group must contain exactly the same GPUs.
    entityGpuIds = []
    for entity in groupObj.GetEntities():
        assert entity.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(entity.entityGroupId)
        entityGpuIds.append(entity.entityId)
    assert gpuIdList == entityGpuIds, \
        "Expected gpuId list to match entity list: %s != %s" % (str(gpuIdList),
                                                                str(entityGpuIds))

    # Removing any GPU from the default group is forbidden at both API levels.
    for gpuId in gpuIdList:
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
            dcgm_agent.dcgmGroupRemoveDevice(handle,
                                             dcgm_structs.DCGM_GROUP_ALL_GPUS,
                                             gpuId)
        with test_utils.assert_raises(pydcgm.DcgmException):
            groupObj.RemoveGpu(gpuId)

    # Destroying the default group is forbidden as well.
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupDestroy(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS)
def GetGpus(self):
    """Populate self.gpus with one ProcessStatsStressGpu per GPU in our group.

    Creates the test group, discovers its GPUs, watches each GPU's PCI bus id
    field, then reads the bus id back into each gpu object.
    """
    self.groupId = dcgm_agent.dcgmGroupCreate(
        self.heHandle, dcgm_structs.DCGM_GROUP_DEFAULT, self.groupName)
    groupInfo = dcgm_agent.dcgmGroupGetInfo(
        self.heHandle, self.groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    gpuIds = groupInfo.gpuIdList[0:groupInfo.count]

    self.Log("Running on %d GPUs" % len(gpuIds))

    # Watch parameters are the same for every GPU; hoisted out of the loop.
    fieldId = dcgm_fields.DCGM_FI_DEV_PCI_BUSID
    updateFreq = 100000
    maxKeepAge = 3600.0  # one hour
    maxKeepEntries = 0  # no limit

    for gpuId in gpuIds:
        newGpu = ProcessStatsStressGpu()
        newGpu.gpuId = gpuId
        self.gpus.append(newGpu)

        # Watch the busid of the GPU
        dcgm_agent_internal.dcgmWatchFieldValue(self.heHandle, gpuId, fieldId,
                                                updateFreq, maxKeepAge,
                                                maxKeepEntries)

    # Update all of the new watches
    dcgm_agent.dcgmUpdateAllFields(self.heHandle, 1)

    for gpu in self.gpus:
        # Bug fix: query gpu.gpuId, not the stale loop variable gpuId from the
        # loop above (which made every GPU report the last GPU's busId).
        values = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            self.heHandle, gpu.gpuId, [
                fieldId,
            ])
        gpu.busId = values[0].value.str
        self.Log(" GPUID %d, busId %s" % (gpu.gpuId, gpu.busId))
def test_dcgm_connection_client_cleanup(handle, gpuIds):
    '''
    Make sure that resources that were allocated by a client are cleaned up
    '''
    fieldGroupFieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]

    # Open a second connection with the raw APIs so we control its lifetime
    # explicitly; its resources are what we verify get cleaned up.
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
    connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
    connectParams.persistAfterDisconnect = 0
    cleanupHandle = dcgm_agent.dcgmConnect_v2('localhost', connectParams)

    groupName = 'clientcleanupgroup'
    groupId = dcgm_agent.dcgmGroupCreate(cleanupHandle,
                                         dcgm_structs.DCGM_GROUP_EMPTY, groupName)

    fieldGroupName = 'clientcleanupfieldgroup'
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(cleanupHandle,
                                                   fieldGroupFieldIds,
                                                   fieldGroupName)

    # Disconnecting the second handle should trigger cleanup of its objects.
    dcgm_agent.dcgmDisconnect(cleanupHandle)
    time.sleep(1.0)  # Allow connection cleanup to occur since it's asynchronous

    # The field group must be gone now.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        dcgm_agent.dcgmFieldGroupGetInfo(handle, fieldGroupId)

    # ...and so must the GPU group.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupGetInfo(handle, groupId)
def helper_dcgm_group_get_grp_info(handle, gpuIds):
    """Add every GPU to a fresh empty group and verify membership; querying an
    invalid group id must raise NOT_CONFIGURED."""
    groupObj = pydcgm.DcgmHandle(handle=handle).GetSystem().GetEmptyGroup("test1")

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"
    for gpuId in gpuIdList:
        groupObj.AddGpu(gpuId)

    # We used to test fetching negative value throws Bad Param error here.
    # This was only a usecase because we we mixing signed and unsigned values
    # Now we're just testing that passing an invalid group ID results in the
    # expected NOT_CONFIGURED error.
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupGetInfo(handle, -1)

    gpuIdListAfterAdd = groupObj.GetGpuIds()
    assert gpuIdList == gpuIdListAfterAdd, \
        "Expected all GPUs from %s to be added. Got %s" % (str(gpuIdList),
                                                           str(gpuIdListAfterAdd))
def GetEntities(self):
    """Return the list of entities (of any entity group) currently in this group."""
    info = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle, self._groupId)
    # Only the first `count` slots of the fixed-size entityList are populated.
    return info.entityList[:info.count]
def test_dcgm_run_diagnostic_validate(handle, gpuIds):
    """Verify the diagnostic entry points reject invalid structure versions."""
    handleObj = pydcgm.DcgmHandle(handle=handle)
    allGpuIds = handleObj.GetSystem().discovery.GetAllGpuIds()
    assert len(allGpuIds) >= 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()
    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    drd.groupId = groupId
    drd.gpuList = ",".join(str(gpuId) for gpuId in gpuIds)

    # Each entry point must reject both the all-zero version and an arbitrary
    # bogus version number with DCGM_ST_VER_MISMATCH.
    badVersions = (0, 50)

    for versionTest in badVersions:
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmActionValidate_v2(handle, drd, versionTest)

    for versionTest in badVersions:
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmActionValidate(handle, drd.groupId, drd.validate, versionTest)

    for versionTest in badVersions:
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmRunDiagnostic(handle, drd.groupId, diagLevel, versionTest)
## Entry point for this script if __name__ == "__main__": ## Initialize the DCGM Engine as manual operation mode. This implies that it's execution is ## controlled by the monitoring agent. The user has to periodically call APIs such as ## dcgmEnginePolicyTrigger and dcgmEngineUpdateAllFields which tells DCGM to wake up and ## perform data collection and operations needed for policy management. with RunDCGM('127.0.0.1', dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle: ## Create a default group. (Default group is comprised of all the GPUs on the node) ## Let's call the group as "all_gpus_group". The method returns an opaque handle (groupId) to ## identify the newly created group. groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group") ## Invoke method to get information on the newly created group groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId) ## Create reference to DCGM status handler which can be used to get the statuses for multiple ## operations on one or more devices present in the group status_handle = dcgm_agent.dcgmStatusCreate() ## The worker function can be executed as a separate thread or as part of the main thread. ## Executed as a separate thread here thread = Thread(target = agent_worker_function, args = (handle, groupId, groupInfo, status_handle)) thread.start() ########################################## # Any other useful work can be placed here ########################################## thread.join()
dcgm_fields.DCGM_FI_DEV_AUTOBOOST, ]) if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED): autoBoost_set = 1 print "configure autobost" assert attributesForDevices[ 0].vpStates.count > 0, "Can't find clocks for the device" total_clocks = attributesForDevices[0].vpStates.count proc_clk_set = attributesForDevices[0].vpStates.vpState[total_clocks / 2].procClk mem_clk_set = attributesForDevices[0].vpStates.vpState[total_clocks / 2].memClk ## Always Switch the ecc mode ecc_set = 1 groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId, dcgm_structs.c_dcgmGroupInfo_version2) config_values = dcgm_agent.dcgmConfigGet( handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count, 0) assert len(config_values) > 0, "Failed to work with NULL status handle" eccmodeOnGroupExisting = config_values[0].mEccMode if eccmodeOnGroupExisting == 0: ecc_set = 1 else: ecc_set = 0 syncboost_set = 1 compute_set = dcgm_structs.DCGM_CONFIG_COMPUTEMODE_DEFAULT config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
## perform data collection and operations needed for policy management. with RunDCGM('127.0.0.1', dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle: # The validate information should be packed in the dcgmRunDiag object runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7() runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7 ## Create a default group. (Default group is comprised of all the GPUs on the node) ## Let's call the group as "all_gpus_group". The method returns an opaque handle (groupId) to ## identify the newly created group. runDiagInfo.groupId = dcgm_agent.dcgmGroupCreate( handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group") ## Invoke method to get information on the newly created group groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, runDiagInfo.groupId) ## define the actions and validations for those actions to take place runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT ## This will go ahead and perform a "prologue" diagnostic ## to make sure everything is ready to run ## currently this calls an outside diagnostic binary but eventually ## that binary will be merged into the DCGM framework ## The "response" is a dcgmDiagResponse structure that can be parsed for errors response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo) ## This will perform an "eiplogue" diagnostic that will stress the system ## Currently commented out because it takes several minutes to execute # runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG #response = dcgm_agent.dcgmActionValidate_v2(handle, dcgmRunDiagInfo)
def test_dcgm_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode on a group containing one ECC-capable GPU and verify
    the new mode is reported back (or warn that a reboot is still pending)."""
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    # Find a GPU that actually supports ECC.
    validDevice = -1
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [
                dcgm_fields.DCGM_FI_DEV_ECC_CURRENT,
            ])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to add a device to the group %d. Return %d" % (
                groupId.value, ret)
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    # Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count,
        status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # Request the opposite of the current ECC mode.
    eccmodeOnGroupExisting = config_values[0].mEccMode
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    ## Toggle the ECC mode on the group; everything else stays blank (unchanged).
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = eccmodeOnGroupToSet
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    # Clear the status handle to log the errors while setting the config.
    # Fixed: this assert message was a string literal broken across a physical
    # line (a syntax error); rejoined into one literal.
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    try:
        ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values,
                                       status_handle)
    except dcgm_structs.DCGMError as e:
        pass  # per-field failures are collected via the status handle below

    errors = helper_get_status_list(status_handle)
    if len(errors) > 0:
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    # Sleep after reset
    time.sleep(2)

    # Clear the status handle to log the errors while getting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    # Get the current configuration
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count,
        status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # If pending != current, the toggle needs a reboot to take effect; only
    # assert on the value when no reboot is pending.
    fvs = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, validDevice, [
            dcgm_fields.DCGM_FI_DEV_ECC_PENDING,
            dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
        ])
    if fvs[0].value.i64 != fvs[1].value.i64:
        logger.warning(
            "Pending ECC %d != Current ECC %d for gpuId %d. Box probably needs a reboot"
            % (fvs[0].value.i64, fvs[1].value.i64, validDevice))
    else:
        assert config_values[0].mEccMode == (eccmodeOnGroupToSet), \
            "ECC mode %d different from the set value %d" % \
            (config_values[0].mEccMode, eccmodeOnGroupToSet)
def test_dcgm_vgpu_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode through the vGPU config path and verify the value
    is reported back after the set."""
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    # Find a GPU that supports the field used as an ECC-capability probe.
    validDevice = -1
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [dcgm_fields.DCGM_FI_DEV_RETIRED_DBE])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (
        ret == dcgm_structs.DCGM_ST_OK
    ), "Failed to add a device to the group %d. Return %d" % (groupId, ret)
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    # Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    vgpu_config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count,
        status_handle)
    assert len(
        vgpu_config_values) > 0, "Failed to work with NULL status handle"

    # Request the opposite of the current ECC mode.
    eccmodeOnGroupExisting = vgpu_config_values[0].mEccMode
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    ## Toggle the ECC mode on the group; everything else stays blank (unchanged).
    vgpu_config_values = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpu_config_values.mEccMode = eccmodeOnGroupToSet
    vgpu_config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    # Clear the status handle to log the errors while setting the config.
    # Fixed: this assert message was a string literal broken across a physical
    # line (a syntax error); rejoined into one literal.
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    try:
        ret = dcgm_agent_internal.dcgmVgpuConfigSet(handle, groupId,
                                                    vgpu_config_values,
                                                    status_handle)
    except dcgm_structs.DCGMError as e:
        pass  # per-field failures are collected via the status handle below

    errors = helper_get_status_list(status_handle)
    if len(errors) > 0:
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    # Sleep after reset and then apply update for it to occur
    time.sleep(2)
    dcgm_agent.dcgmUpdateAllFields(handle, 1)

    # Clear the status handle to log the errors while getting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    # Get the current configuration
    config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count,
        status_handle)
    assert len(config_values
               ) > 0, "Failed to get configuration using dcgmiVgpuConfigGet"

    assert config_values[0].mEccMode == (
        eccmodeOnGroupToSet), "ECC mode different from the set value"