def helper_test_inject_instance_fields(handle, gpuIds):
    """Verify a value injected at GPU scope is visible only on the GPU entity.

    Ensures one GPU instance and one compute instance exist on gpuIds[0],
    watches DCGM_FI_DEV_ECC_DBE_VOL_TOTAL on the GPU, the instance, and the
    compute instance, injects a value at GPU scope, then asserts the GPU
    reports the injected value while the instance and compute instance report
    DCGM_ST_NO_DATA.

    :param handle: DCGM handle
    :param gpuIds: list of GPU ids; only gpuIds[0] is used
    """
    instances, cis = ensure_instance_ids(handle, gpuIds[0], 1, 1)
    # Bug fix: in Python 3, dict.keys() returns a non-subscriptable view, so
    # keys()[0] raised TypeError. Grab an arbitrary first key via an iterator.
    firstInstanceId = next(iter(instances))
    lastCIId = next(iter(cis))

    # Set up the watches on these groups
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY, 'tien')
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
        handle, [dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL], 'kal')
    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU, gpuIds[0])
    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU_I, firstInstanceId)
    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU_CI, lastCIId)
    dcgm_agent.dcgmWatchFields(handle, groupId, fieldGroupId, 1, 100, 100)

    # Inject value 2 at GPU scope (second positional arg 5 is presumably an
    # offset/timestamp parameter of inject_value — confirm against its definition).
    dcgm_internal_helpers.inject_value(
        handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 2, 5,
        isInt=True, verifyInsertion=True, entityType=dcgm_fields.DCGM_FE_GPU)

    # Read the values to make sure they were stored properly
    entities = [
        dcgm_structs.c_dcgmGroupEntityPair_t(),
        dcgm_structs.c_dcgmGroupEntityPair_t(),
        dcgm_structs.c_dcgmGroupEntityPair_t()
    ]
    entities[0].entityGroupId = dcgm_fields.DCGM_FE_GPU_I
    entities[0].entityId = firstInstanceId
    entities[1].entityGroupId = dcgm_fields.DCGM_FE_GPU_CI
    entities[1].entityId = lastCIId
    entities[2].entityGroupId = dcgm_fields.DCGM_FE_GPU
    entities[2].entityId = gpuIds[0]

    fieldIds = [dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL]

    values = dcgm_agent.dcgmEntitiesGetLatestValues(handle, entities, fieldIds, 0)
    for v in values:
        if v.entityGroupId == dcgm_fields.DCGM_FE_GPU:
            assert v.value.i64 == 2, "Failed to inject value 2 for entity %u from group %u" % (
                v.entityId, v.entityGroupId)
        else:
            # The GPU-scoped injection must NOT leak to instance / compute
            # instance entities. (Uses the module-level dcgm_structs import
            # instead of the previous branch-local re-import.)
            assert (
                v.status == dcgm_structs.DCGM_ST_NO_DATA
            ), "Injected meaningless value %u for entity %u from group %u" % (
                v.value.i64, v.entityId, v.entityGroupId)
def test_dcgm_prof_all_supported_fields_watchable(handle, gpuId):
    '''
    Verify that every profiling field reported as supported can be watched and
    that a value is returned for it.

    NOTE(review): a later function in this module appears to share this exact
    name (taking gpuIds); if both live in the same module, the later definition
    shadows this one — confirm which is intended to be collected.
    '''
    dcgm_handle = pydcgm.DcgmHandle(handle=handle)
    dcgm_system = dcgm_handle.GetSystem()
    group = dcgm_system.GetGroupWithGpuIds('mygroup', [gpuId])

    helper_check_profiling_environment(group)

    supported_field_ids = helper_get_supported_field_ids(group)
    assert supported_field_ids is not None

    watch_freq_usec = 1000  # 1 ms
    max_keep_age_sec = 60.0
    max_keep_samples = 0
    max_age_usec = int(max_keep_age_sec) * watch_freq_usec  # computed but unused, kept for parity

    entity_pairs = [dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU, gpuId)]

    for field_id in supported_field_ids:
        # If there are only unsupported SKUs in the group, WatchFields returns an
        # error; if at least one GPU in the group is supported, it succeeds. This
        # check skips unsupported or fake SKUs.
        rc = group.profiling.WatchFields([field_id, ], watch_freq_usec,
                                         max_keep_age_sec, max_keep_samples)
        if rc == dcgm_structs.DCGM_ST_PROFILING_NOT_SUPPORTED:
            test_utils.skip_test_supported("DCP")

        # Any request to the profiling manager guarantees an update cycle has
        # happened since the last request.
        group.profiling.GetSupportedMetricGroups()

        # Validate watch frequency, quota, and watched flags in the cache manager.
        cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gpuId, field_id)
        assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) != 0, \
            "gpuId %u, fieldId %u not watched" % (gpuId, field_id)
        assert cmfi.numSamples > 0
        assert cmfi.numWatchers == 1, "numWatchers %d" % cmfi.numWatchers
        assert cmfi.monitorFrequencyUsec == watch_freq_usec, \
            "monitorFrequencyUsec %u != watchFreq %u" % (cmfi.monitorFrequencyUsec, watch_freq_usec)
        assert cmfi.lastStatus == dcgm_structs.DCGM_ST_OK, \
            "lastStatus %u != DCGM_ST_OK" % (cmfi.lastStatus)

        latest = dcgm_agent.dcgmEntitiesGetLatestValues(handle, entity_pairs, [field_id, ], 0)
        for idx, field_value in enumerate(latest):
            logger.debug(str(field_value))
            assert (field_value.status == dcgm_structs.DCGM_ST_OK), \
                "idx %d status was %d" % (idx, field_value.status)
            assert (field_value.ts != 0), "idx %d timestamp was 0" % (idx)

        group.profiling.UnwatchFields()

        # Validate watch flags after unwatch.
        cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gpuId, field_id)
        assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) == 0, \
            "gpuId %u, fieldId %u still watched. flags x%X" % (gpuId, field_id, cmfi.flags)
        assert cmfi.numWatchers == 0, "numWatchers %d" % cmfi.numWatchers
def verify_profile_names_exist(handle, migEntityList, isGpuInstance):
    """Assert each MIG entity in migEntityList reports a non-blank profile name.

    :param handle: DCGM handle
    :param migEntityList: iterable of GPU-instance or compute-instance entity ids
    :param isGpuInstance: True to treat ids as GPU instances, False for compute instances
    """
    entity_group = dcgm_fields.DCGM_FE_GPU_I if isGpuInstance else dcgm_fields.DCGM_FE_GPU_CI

    pairs = []
    for entity_id in migEntityList:
        pair = dcgm_structs.c_dcgmGroupEntityPair_t()
        pair.entityGroupId = entity_group
        pair.entityId = entity_id
        pairs.append(pair)

    # LIVE_DATA: read straight from the driver rather than the cache.
    results = dcgm_agent.dcgmEntitiesGetLatestValues(
        handle, pairs, [dcgm_fields.DCGM_FI_DEV_NAME],
        dcgm_structs.DCGM_FV_FLAG_LIVE_DATA)

    for result in results:
        assert len(result.value.str) and result.value.str != dcgmvalue.DCGM_STR_BLANK, \
            "Expected a non-empty profile name, but found '%s'" % (result.value.str)
def verify_fake_profile_names(handle, fakeEntities, isGpuInstance):
    """Assert each fake MIG entity reports the expected placeholder profile name.

    :param handle: DCGM handle
    :param fakeEntities: iterable of fake GPU-instance or compute-instance entity ids
    :param isGpuInstance: True for GPU instances ('1fg.4gb'), False for compute
        instances ('1fc.1g.4gb')
    """
    entity_group = dcgm_fields.DCGM_FE_GPU_I if isGpuInstance else dcgm_fields.DCGM_FE_GPU_CI
    expected_name = "1fg.4gb" if isGpuInstance else "1fc.1g.4gb"

    pairs = []
    for entity_id in fakeEntities:
        pair = dcgm_structs.c_dcgmGroupEntityPair_t()
        pair.entityGroupId = entity_group
        pair.entityId = entity_id
        pairs.append(pair)

    # LIVE_DATA: bypass the cache and query current values.
    results = dcgm_agent.dcgmEntitiesGetLatestValues(
        handle, pairs, [dcgm_fields.DCGM_FI_DEV_NAME],
        dcgm_structs.DCGM_FV_FLAG_LIVE_DATA)

    for result in results:
        assert result.value.str == expected_name, \
            "Fake profile name appears to be wrong. Expected '%s', found '%s'" % (
                expected_name, result.value.str)
def test_dcgm_prof_all_supported_fields_watchable(handle, gpuIds):
    '''
    Verify that every profiling field reported as supported can be watched on
    all GPUs in the group and that values are returned for each of them.
    '''
    dcgm_handle = pydcgm.DcgmHandle(handle=handle)
    group = dcgm_handle.GetSystem().GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(group)

    supported_field_ids = helper_get_supported_field_ids(group)
    assert supported_field_ids is not None

    watch_freq_usec = 1000  # 1 ms
    max_keep_age_sec = 60.0
    max_keep_samples = 0
    max_age_usec = int(max_keep_age_sec) * watch_freq_usec  # computed but unused, kept for parity

    entity_pairs = [dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU, gid)
                    for gid in gpuIds]

    for field_id in supported_field_ids:
        group.profiling.WatchFields([field_id, ], watch_freq_usec,
                                    max_keep_age_sec, max_keep_samples)

        # Any request to the profiling manager guarantees an update cycle has
        # happened since the last request.
        group.profiling.GetSupportedMetricGroups()

        # Validate watch frequency, quota, and watched flags for every GPU.
        for gid in gpuIds:
            cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gid, field_id)
            assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) != 0, \
                "gpuId %u, fieldId %u not watched" % (gid, field_id)
            assert cmfi.numSamples > 0
            assert cmfi.numWatchers == 1, "numWatchers %d" % cmfi.numWatchers
            assert cmfi.monitorFrequencyUsec == watch_freq_usec, \
                "monitorFrequencyUsec %u != watchFreq %u" % (cmfi.monitorFrequencyUsec, watch_freq_usec)
            assert cmfi.lastStatus == dcgm_structs.DCGM_ST_OK, \
                "lastStatus %u != DCGM_ST_OK" % (cmfi.lastStatus)

        latest = dcgm_agent.dcgmEntitiesGetLatestValues(handle, entity_pairs, [field_id, ], 0)
        for idx, field_value in enumerate(latest):
            logger.debug(str(field_value))
            assert (field_value.status == dcgm_structs.DCGM_ST_OK), \
                "idx %d status was %d" % (idx, field_value.status)
            assert (field_value.ts != 0), "idx %d timestamp was 0" % (idx)

        group.profiling.UnwatchFields()

        # Validate watch flags after unwatch for every GPU.
        for gid in gpuIds:
            cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gid, field_id)
            assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) == 0, \
                "gpuId %u, fieldId %u still watched. flags x%X" % (gid, field_id, cmfi.flags)
            assert cmfi.numWatchers == 0, "numWatchers %d" % cmfi.numWatchers
def helper_test_mig_value_reporting(handle, gpuIds):
    """Verify per-field value reporting across GPUs, GPU instances, and compute instances.

    Creates MIG entities, reads a mix of fields live, and checks that fields in
    sameValueFieldIds report identical values at every entity level while fields
    in differentValueFieldIds do not. Cleanup failures are logged as warnings;
    the accumulated mismatch message is asserted at the end.

    :param handle: DCGM handle
    :param gpuIds: list of GPU ids to create MIG entities on
    """
    # These fields should report the same value for GPUs, instances, and compute instances
    sameValueFieldIds = [
        dcgm_fields.DCGM_FI_DEV_COMPUTE_MODE,
        dcgm_fields.DCGM_FI_DEV_MIG_MODE,
        dcgm_fields.DCGM_FI_DEV_SHUTDOWN_TEMP,
    ]
    # Fields expected to differ between the GPU and its MIG children
    # (e.g. framebuffer total is partitioned).
    differentValueFieldIds = [
        dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
    ]

    newGpuInstances, newComputeInstances = create_mig_entities_and_verify(
        handle, gpuIds, 3, 1)

    # Make sure we get the same values for these fields on the GPU, instances, and compute instances
    # Build the entity list
    entities = []
    for gpuId in gpuIds:
        entities.append(
            dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU, gpuId))
    for instanceId in newGpuInstances:
        entities.append(
            dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU_I, instanceId))
    for ciId in newComputeInstances:
        entities.append(
            dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU_CI, ciId))

    fieldIds = []
    fieldIds.extend(sameValueFieldIds)
    fieldIds.extend(differentValueFieldIds)

    # LIVE_DATA flag: read current values rather than cached samples.
    values = dcgm_agent.dcgmEntitiesGetLatestValues(
        handle, entities, fieldIds, dcgm_structs.DCGM_FV_FLAG_LIVE_DATA)

    gpuValues = {}
    # Make a map of a map the values reported by the GPUs: gpuId -> fieldId -> value
    # (first value seen per (gpuId, fieldId) wins; later duplicates are ignored)
    for value in values:
        if value.entityGroupId == dcgm_fields.DCGM_FE_GPU:
            if value.entityId not in gpuValues:
                gpuValues[value.entityId] = {}
                gpuValues[value.entityId][value.fieldId] = value.value.i64
            elif value.fieldId not in gpuValues[value.entityId]:
                gpuValues[value.entityId][value.fieldId] = value.value.i64

    errMsg = ''
    for value in values:
        if value.entityGroupId == dcgm_fields.DCGM_FE_GPU_I:
            # NOTE(review): assumes instance entity ids encode the parent GPU id
            # modulo DCGM_MAX_INSTANCES_PER_GPU — confirm against the id scheme.
            gpuId = value.entityId % dcgm_structs.DCGM_MAX_INSTANCES_PER_GPU
            same = gpuValues[gpuId][value.fieldId] == value.value.i64
            if not same and value.fieldId in sameValueFieldIds:
                errMsg = errMsg + "\nExpected %d but found %d for field %d GPU instance %d on GPU %d" \
                    % (gpuValues[gpuId][value.fieldId], value.value.i64, value.fieldId, value.entityId, gpuId)
            elif same and value.fieldId in differentValueFieldIds:
                errMsg = errMsg + "\nExpected different values but found %d for field %d for GPU instance %d on GPU %d" \
                    % (value.value.i64, value.fieldId, value.entityId, gpuId)

        if value.entityGroupId == dcgm_fields.DCGM_FE_GPU_CI:
            # Same parent-GPU derivation for compute instances, using the
            # compute-instance modulus.
            gpuId = value.entityId % dcgm_structs.DCGM_MAX_COMPUTE_INSTANCES_PER_GPU
            same = gpuValues[gpuId][value.fieldId] == value.value.i64
            if not same and value.fieldId in sameValueFieldIds:
                errMsg = errMsg + "\nExpected %d but found %d for field %d compute instance %d on GPU %d" \
                    % (gpuValues[gpuId][value.fieldId], value.value.i64, value.fieldId, value.entityId, gpuId)
            elif same and value.fieldId in differentValueFieldIds:
                errMsg = errMsg + "\nExpected different values but found %d for field %d for compute instance %d on GPU %d" \
                    % (value.value.i64, value.fieldId, value.entityId, gpuId)

    # Clean up MIG entities before asserting so failures don't leak them;
    # cleanup problems are warnings, not test failures.
    ciFailMsg = delete_compute_instances_and_verify(handle, newComputeInstances)
    instanceFailMsg = delete_gpu_instances_and_verify(handle, newGpuInstances)

    if ciFailMsg != '':
        logger.warning("The compute instances didn't clean up correctly: %s" % ciFailMsg)
    if instanceFailMsg != '':
        logger.warning("The GPU instances didn't clean up correctly: %s" % instanceFailMsg)

    assert errMsg == '', errMsg