def test_dcgm_agent_get_values_for_fields(handle, gpuIds):
    """Watch the device-name field on one GPU, force an update cycle, and
    verify a non-empty string value can be read back through the internal
    latest-values API."""
    # Start watching the field so the cache manager collects it.
    watchedField = dcgm_fields.DCGM_FI_DEV_NAME
    targetGpu = gpuIds[0]
    status = dcgm_agent_internal.dcgmWatchFieldValue(handle, targetGpu,
                                                     watchedField, 10000000,
                                                     86400.0, 0)
    assert status == dcgm_structs.DCGM_ST_OK

    # Force at least one update of the field before trying to read it.
    status = dcgm_agent.dcgmUpdateAllFields(handle, True)
    assert status == dcgm_structs.DCGM_ST_OK

    fetched = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, targetGpu, [watchedField])
    firstValue = fetched[0]
    assert firstValue.status == dcgm_structs.DCGM_ST_OK
    # fieldType comes back as an integer character code; compare as a char.
    assert chr(firstValue.fieldType) == dcgm_fields.DCGM_FT_STRING, \
        "Wrong field type: %s" % firstValue.fieldType
    assert len(firstValue.value.str) > 0
    logger.debug("Brand of GPU %u is %s" % (targetGpu, firstValue.value.str))
def GetGpus(self):
    """ Populate self.gpus with one ProcessStatsStressGpu per GPU in the
    default group, recording each GPU's id and PCI bus id.

    Side effects: creates the DCGM group (stored in self.groupId), adds a
    field watch per GPU, and forces one field-update cycle.
    """
    self.groupId = dcgm_agent.dcgmGroupCreate(
        self.heHandle, dcgm_structs.DCGM_GROUP_DEFAULT, self.groupName)
    groupInfo = dcgm_agent.dcgmGroupGetInfo(
        self.heHandle, self.groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    gpuIds = groupInfo.gpuIdList[0:groupInfo.count]

    self.Log("Running on %d GPUs" % len(gpuIds))

    # Watch parameters are the same for every GPU; hoisted out of the loop.
    fieldId = dcgm_fields.DCGM_FI_DEV_PCI_BUSID
    updateFreq = 100000
    maxKeepAge = 3600.0  # one hour
    maxKeepEntries = 0  # no limit

    for gpuId in gpuIds:
        newGpu = ProcessStatsStressGpu()
        newGpu.gpuId = gpuId
        self.gpus.append(newGpu)

        # Get the busid of the GPU
        dcgm_agent_internal.dcgmWatchFieldValue(self.heHandle, gpuId, fieldId,
                                                updateFreq, maxKeepAge,
                                                maxKeepEntries)

    # Update all of the new watches
    dcgm_agent.dcgmUpdateAllFields(self.heHandle, 1)

    for gpu in self.gpus:
        # BUG FIX: the original queried the stale loop variable `gpuId`
        # (always the last GPU of the previous loop), so every entry got the
        # last GPU's bus id. Query each gpu's own id instead.
        values = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            self.heHandle, gpu.gpuId, [fieldId])
        gpu.busId = values[0].value.str

        self.Log(" GPUID %d, busId %s" % (gpu.gpuId, gpu.busId))
def helper_test_dcgm_policy_inject_xiderror(handle, gpuIds):
    """ Verifies that we can inject an XID error and receive a callback """
    # Build a policy that fires on the XID-error condition; parms[6] is the
    # slot associated with DCGM_POLICY_COND_XID in this test suite.
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()
    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_XID
    newPolicy.parms[6].tag = 0
    newPolicy.parms[6].val.boolean = True

    dcgmHandle = pydcgm.DcgmHandle(handle)

    # Find a GPU whose XID-errors field is actually populated; skip otherwise.
    validDeviceId = -1
    devices = gpuIds
    for x in devices:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [
                dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
            ])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDeviceId = x
            break
    if (validDeviceId == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU that supports XID errors is present"
        )

    # Put only the supported GPU into a fresh group and apply the policy.
    group = pydcgm.DcgmGroup(dcgmHandle,
                             groupName="test1",
                             groupType=dcgm_structs.DCGM_GROUP_EMPTY)
    group.AddGpu(validDeviceId)
    group.policy.Set(newPolicy)

    # The C callback pushes its data onto this queue when the policy fires.
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_XID,
                          finishCallback=c_callback)

    # Inject XID error 16 with a timestamp 60s in the future so it is seen
    # as the latest sample.
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_XID_ERRORS
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) *
                   1000000.0)  # set the injected data into the future
    field.value.i64 = 16
    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, validDeviceId,
                                                   field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # wait for the the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert(dcgm_structs.DCGM_POLICY_COND_XID == callbackData.condition), \
        ("XID error callback was not for a XID error, got: %s" % callbackData.condition)
    assert (16 == callbackData.val.xid.errnum), (
        'Expected XID error 16 but got %s' % callbackData.val.xid.errnum)
# NOTE(review): this chunk is cut at BOTH edges of the visible source — it
# opens mid format-expression (the tail of an assert message whose start is
# outside this view) and closes on a 'for device in validDevices:' header
# whose body is also outside this view. It additionally uses Python-2 print
# statements. Left byte-identical; only this comment added — reconstructing
# it would require guessing at the missing surrounding code.
% (x, config_values[x].mEccMode, expected_ecc) pass ret = dcgm_agent.dcgmStatusDestroy(status_handle) assert (ret == dcgm_structs.DCGM_ST_OK ), "Failed to remove status handler, error: %s" % ret dcgm_structs._LoadDcgmLibrary() handle = dcgm_agent.dcgmInit() devices = dcgm_agent.dcgmGetAllDevices(handle) validDevices = list() for x in devices: fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields( handle, x, [ dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, ]) if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED): validDevices.append(x) if (len(validDevices) == 0): print "Can only run if at least one GPU with ECC is present" sys.exit(1) print "Number of valid devices: %d" % len(validDevices) groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY, "test1") statusHandle = dcgm_agent.dcgmStatusCreate() for device in validDevices:
# Script-level setup: build a max-pages-retired policy, find an ECC-capable
# GPU, and add it to a fresh group.
newPolicy = dcgm_structs.c_dcgmPolicy_v1()
handle = dcgm_agent.dcgmInit()

# Policy fires when retired-page count exceeds 5; reaction is a GPU reset
# validated with a short system-validation run. parms[2] is the slot
# associated with the max-pages-retired condition here.
newPolicy.version = dcgm_structs.dcgmPolicy_version1
newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED
newPolicy.action = dcgm_structs.DCGM_POLICY_ACTION_GPURESET
newPolicy.validation = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
newPolicy.parms[2].tag = 1
newPolicy.parms[2].val.llval = 5

# find a GPU that supports retired pages (otherwise internal test will ignore it)
devices = dcgm_agent.dcgmGetAllDevices(handle)
validDevice = -1
for x in devices:
    fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, x, [
            dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        ])
    if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
        validDevice = x
        break

if (validDevice == -1):
    print("Can only run if at least one GPU with ECC is present")
    sys.exit(1)

groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                     "test1")
statusHandle = dcgm_agent.dcgmStatusCreate()

ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
assert (ret == dcgm_structs.DCGM_ST_OK)
def test_dcgm_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode on one ECC-capable GPU via dcgmConfigSet and
    verify the new value is reported back (or warn if a reboot is pending)."""
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")
    validDevice = -1
    # Find a GPU whose current-ECC field is populated; skip the test otherwise.
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [
                dcgm_fields.DCGM_FI_DEV_ECC_CURRENT,
            ])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to add a device to the group %d. Return %d" % (
                groupId.value, ret)
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    #Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    eccmodeOnGroupExisting = config_values[0].mEccMode
    # Flip 0 -> 1 or nonzero -> 0 so we always request a change.
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    #print eccmodeOnGroupExisting
    #print eccmodeOnGroupToSet

    ## Toggle the ECC mode on the group; every other config knob is left
    ## BLANK so only ECC is changed.
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = eccmodeOnGroupToSet
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    # dcgmConfigSet may raise; per-field failures are collected in the status
    # handle and inspected below instead.
    try:
        ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values,
                                       status_handle)
    except dcgm_structs.DCGMError as e:
        pass

    errors = helper_get_status_list(status_handle)
    if len(errors) > 0:
        # Unable to change ECC on this box — skip rather than fail.
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    #Sleep after reset
    time.sleep(2)

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Get the current configuration
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # ECC changes need a reboot to take effect; only assert when pending and
    # current ECC already agree.
    fvs = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, validDevice, [
            dcgm_fields.DCGM_FI_DEV_ECC_PENDING,
            dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
        ])
    if fvs[0].value.i64 != fvs[1].value.i64:
        logger.warning(
            "Pending ECC %d != Current ECC %d for gpuId %d. Box probably needs a reboot"
            % (fvs[0].value.i64, fvs[1].value.i64, validDevice))
    else:
        assert config_values[0].mEccMode == (eccmodeOnGroupToSet), "ECC mode %d different from the set value %d" % \
            (config_values[0].mEccMode, eccmodeOnGroupToSet)
def test_dcgm_vgpu_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode on one ECC-capable GPU through the internal vGPU
    config API and verify the new value is reported back."""
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")
    validDevice = -1
    # Find a GPU whose retired-pages field is populated (ECC-capable proxy);
    # skip the test otherwise.
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [dcgm_fields.DCGM_FI_DEV_RETIRED_DBE])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (
        ret == dcgm_structs.DCGM_ST_OK
    ), "Failed to add a device to the group %d. Return %d" % (groupId, ret)
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    #Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    vgpu_config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        vgpu_config_values) > 0, "Failed to work with NULL status handle"

    eccmodeOnGroupExisting = vgpu_config_values[0].mEccMode
    # Flip 0 -> 1 or nonzero -> 0 so we always request a change.
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    #print eccmodeOnGroupExisting
    #print eccmodeOnGroupToSet

    ## Toggle the ECC mode on the group; other knobs stay BLANK so only ECC
    ## is changed.
    vgpu_config_values = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpu_config_values.mEccMode = eccmodeOnGroupToSet
    vgpu_config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    # dcgmVgpuConfigSet may raise; per-field failures are collected in the
    # status handle and inspected below instead.
    try:
        ret = dcgm_agent_internal.dcgmVgpuConfigSet(handle, groupId,
                                                    vgpu_config_values,
                                                    status_handle)
    except dcgm_structs.DCGMError as e:
        pass

    errors = helper_get_status_list(status_handle)
    if len(errors) > 0:
        # Unable to change ECC on this box — skip rather than fail.
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    #Sleep after reset and then apply update for it to occur
    time.sleep(2)
    dcgm_agent.dcgmUpdateAllFields(handle, 1)

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Get the current configuration
    config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(config_values
               ) > 0, "Failed to get configuration using dcgmiVgpuConfigGet"

    assert config_values[0].mEccMode == (
        eccmodeOnGroupToSet), "ECC mode different from the set value"
def test_dcgm_injection_agent(handle, gpuIds):
    """ Verifies that injection works with the agent host engine """
    gpuId = gpuIds[0]

    #Make a base value that is good for starters
    fvGood = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    fvGood.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
    fvGood.status = 0
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    fvGood.ts = get_usec_since_1970()
    fvGood.value.i64 = 1

    # Sample count before injection, used to prove the injection landed.
    fieldInfoBefore = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countBefore = fieldInfoBefore.numSamples

    #This will throw an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvGood)

    fieldInfoAfter = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countAfter = fieldInfoAfter.numSamples

    assert countAfter > countBefore, "Expected countAfter %d > countBefore %d after injection" % (
        countAfter, countBefore)

    #Fetch the value we just inserted and verify its attributes are the same
    fvFetched = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, gpuId, [
            fvGood.fieldId,
        ])[0]
    helper_verify_fv_equal(fvFetched, fvGood)

    #Should be able to insert a null timestamp. The agent will just use "now"
    # NOTE(review): these assignments alias (they do not copy) — mutating
    # fvAlsoGood/fvBad below also mutates fvGood, which is why fieldType is
    # restored explicitly after the bad-type check.
    fvAlsoGood = fvGood
    fvAlsoGood.ts = 0
    #This will thrown an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvAlsoGood)

    #Now make some attributes bad and expect an error
    fvBad = fvGood
    fvBad.fieldType = ord(dcgm_fields.DCGM_FT_DOUBLE)
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    """ TODO: DCGM-2130 - Restore this test when protobuf is removed
    #Now make some attributes bad and expect an error
    fvBad = fvGood
    fvBad.version = 0
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)
    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    """
    # An out-of-range fieldId must also be rejected with BADPARAM.
    fvBad = fvGood
    fvBad.fieldId = dcgm_fields.DCGM_FI_MAX_FIELDS + 100
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)