def main():
    """
    Run a short tour of DcgmReader usage styles, printing a banner before each.

    The tour covers, in order: a default DcgmReader, the dictionary helper
    (DcgmReaderDictionary) with a custom field list, and two reader
    subclasses (DataHandlerReader and FieldHandlerReader) which, per the
    banner text, customize handling by overriding reader methods.
    """
    print('Some examples of different DcgmReader usages')

    # Stock reader, no customization
    print('\n\nThe default interaction')
    DcgmReader().Process()

    # Dictionary interface restricted to a hand-picked set of fields
    print('\n\nUsing custom fields through the dictionary interface...')
    DcgmReaderDictionary(
        field_ids=[
            dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL,
            dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
            dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
        ]
    )

    # Subclass-based customizations
    print('\n\nProcessing in field order by overriding the CustomerDataHandler() method')
    DataHandlerReader().Process()

    print('\n\nPrinting a little differently by overriding the CustomFieldHandler() method')
    FieldHandlerReader().Process()
def test_reading_pid_fields(handle, gpuIds, cudaApp):
    """
    Verifies that we can decode PID structs

    Polls the compute-PIDs field up to 10 times, sleeping 0.2 seconds after
    each poll (about 2 seconds total), and collects every PID reported for
    any GPU. The test passes if cudaApp's PID was among them.
    """
    pidFieldId = dcgm_fields.DCGM_FI_DEV_COMPUTE_PIDS
    seenPids = []

    reader = DcgmReader(fieldIds=[pidFieldId], updateFrequency=100000)
    logger.debug("Trying for 2 seconds")

    foundOurPid = False
    for _ in range(10):
        latest = reader.GetLatestGpuValuesAsFieldIdDict()
        assert len(latest) > 0

        for gpuId in latest:
            perGpu = latest[gpuId]
            if pidFieldId not in perGpu:
                continue

            reportedPid = perGpu[pidFieldId].pid
            seenPids.append(reportedPid)
            if reportedPid == cudaApp.getpid():
                # Found our PID. Stop polling once this pass is finished
                foundOurPid = True

        time.sleep(0.2)
        if foundOurPid:
            break

    logger.debug("PIDs: %s. cudaApp PID: %d" % (str(seenPids), cudaApp.getpid()))
    assert cudaApp.getpid() in seenPids, "could not find cudaApp PID"
def test_dcgm_reader_default(handle):
    """
    Verifies that a DcgmReader built with default arguments returns sane data

    Checks both dictionary views of the latest values:
      - by field name: keys must be strings
      - by field id: keys must be ints that dcgm_fields.DcgmFieldGetById()
        can resolve
    In both views a GPU may report fewer entries than defaultFieldIds, never more.
    """
    # pylint: disable=undefined-variable
    dr = DcgmReader()
    dr.SetHandle(handle)

    latest = dr.GetLatestGpuValuesAsFieldNameDict()
    for gpuId in latest:
        # latest data might be less than the list, because blank values aren't included
        # defaultFieldIds is defined in DcgmReader
        # pylint: disable=undefined-variable
        assert len(latest[gpuId]) <= len(defaultFieldIds), \
            "gpuId %s returned more fields than were requested" % str(gpuId)

        # Make sure we get strings
        for key in latest[gpuId]:
            assert isinstance(key, str), "field name key %r is not a str" % (key,)

    sample = dr.GetLatestGpuValuesAsFieldIdDict()
    for gpuId in sample:
        # defaultFieldIds is defined in DcgmReader
        # pylint: disable=undefined-variable
        assert len(sample[gpuId]) <= len(defaultFieldIds), \
            "gpuId %s returned more fields than were requested" % str(gpuId)

        # Make sure we get valid integer field ids
        for fieldId in sample[gpuId]:
            assert isinstance(fieldId, int), "field id key %r is not an int" % (fieldId,)
            # Identity check: comparing to None with != can be hijacked by a
            # custom __ne__/__eq__ on the returned object
            assert dcgm_fields.DcgmFieldGetById(fieldId) is not None, \
                "field id %d is not known to dcgm_fields" % fieldId
def test_dcgm_reader_specific_fields(handle):
    """
    Verifies that a DcgmReader limited to an explicit field list never
    returns more fields per GPU than were requested
    """
    requestedFieldIds = [
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
        dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
    ]

    reader = DcgmReader(fieldIds=requestedFieldIds)
    reader.SetHandle(handle)

    valuesByGpu = reader.GetLatestGpuValuesAsFieldNameDict()
    maxFieldCount = len(requestedFieldIds)
    for gpuId in valuesByGpu:
        # Fewer entries than requested is acceptable; more is not
        assert len(valuesByGpu[gpuId]) <= maxFieldCount
def test_dcgm_prof_with_dcgmreader(handle, gpuIds):
    """
    Verifies that we can access profiling data with DcgmReader, which is the
    base class for dcgm exporters

    Reads the latest values 10 times, sleeping twice the update interval
    between reads, and requires every GPU in gpuIds to report a value for
    every requested field on every read.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)

    updateFrequencyUsec = 10000
    # Sleep 2x the update freq so we get new values each time
    sleepTime = 2 * (updateFrequencyUsec / 1000000.0)

    dr = DcgmReader.DcgmReader(fieldIds=fieldIds, updateFrequency=updateFrequencyUsec, maxKeepAge=30.0, gpuIds=gpuIds)
    dr.SetHandle(handle)

    for i in range(10):
        time.sleep(sleepTime)

        latest = dr.GetLatestGpuValuesAsFieldIdDict()
        logger.info(str(latest))

        for gpuId in gpuIds:
            # The message must describe the GPU being checked. Indexing gpuIds
            # with the iteration counter i reported the wrong GPU, or raised
            # while building the message and hid the real failure.
            assert len(latest[gpuId]) == len(fieldIds), "i=%d, gpuId %d, len %d != %d" % (i, gpuId, len(latest[gpuId]), len(fieldIds))
def test_reading_specific_data(handle, gpuIds):
    """
    Verifies that we can inject specific data and get that same data back

    One int64 value is injected per field on the first GPU in gpuIds, then
    read back through DcgmReader and compared field by field.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()

    specificFieldIds = [
        dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
        dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
    ]
    # Values to inject, positionally paired with specificFieldIds
    fieldValues = [
        1,
        1000,
        9000,
    ]

    targetGpuId = gpuIds[0]

    for injectedFieldId, injectedValue in zip(specificFieldIds, fieldValues):
        injection = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        injection.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        injection.fieldId = injectedFieldId
        injection.status = 0
        injection.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
        # set the injected data into the future
        injection.ts = int((time.time() + 10) * 1000000.0)
        injection.value.i64 = injectedValue

        status = dcgm_agent_internal.dcgmInjectFieldValue(handle, targetGpuId, injection)
        assert (status == dcgm_structs.DCGM_ST_OK)

    # pylint: disable=undefined-variable
    reader = DcgmReader(fieldIds=specificFieldIds)
    reader.SetHandle(handle)

    readBack = reader.GetLatestGpuValuesAsFieldIdDict()

    # Every injected field must come back, each with the value we injected
    assert len(readBack[targetGpuId]) == len(specificFieldIds)
    for injectedFieldId, injectedValue in zip(specificFieldIds, fieldValues):
        assert readBack[targetGpuId][injectedFieldId] == injectedValue
def util_dcgm_reader_all_since_last_call(handle, flag, repeat):
    """
    Helper to ensure GetAllGpuValuesAsDictSinceLastCall behaves. It was first
    used for collectd integration to ensure it does not crash, and it also
    checks that no unexpected fields are returned.

    Arguments:
        handle: DCGM handle
        flag: argument for GetAllGpuValuesAsDictSinceLastCall. When it
              compares equal to False the returned keys are checked against
              field tags, otherwise against field ids.
        repeat: whether to make the GetAllGpuValuesAsDictSinceLastCall call
                a second time and validate that second result instead
    """
    requestedFieldIds = [
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
        dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
    ]

    # pylint: disable=undefined-variable
    reader = DcgmReader(fieldIds=requestedFieldIds)
    reader.SetHandle(handle)

    valuesByGpu = reader.GetAllGpuValuesAsDictSinceLastCall(flag)
    if repeat:
        valuesByGpu = reader.GetAllGpuValuesAsDictSinceLastCall(flag)

    # Work out which keys are acceptable in the result
    if flag == False:
        dcgmHandle = pydcgm.DcgmHandle(handle)
        dcgmSystem = dcgmHandle.GetSystem()
        allowedKeys = [
            dcgmSystem.fields.GetFieldById(fieldId).tag
            for fieldId in requestedFieldIds
        ]
    else:
        allowedKeys = requestedFieldIds

    for gpuId in valuesByGpu:
        perGpu = valuesByGpu[gpuId]

        # Latest data might be less than the list, because blank values aren't
        # included. We basically try to ensure there is no crash and we don't
        # return something absurd.
        assert len(perGpu) <= len(requestedFieldIds)

        for key in perGpu.keys():
            assert key in allowedKeys
def DcgmReaderDictionary(field_ids=defaultFieldIds, update_frequency=10000000, keep_time=3600.0, ignores=None, field_groups='dcgm_fieldgroupdata'):
    """
    Build a DcgmReader from the given settings and print its latest values.

    Arguments:
        field_ids: field ids to read; forwarded as DcgmReader's fieldIds
        update_frequency: forwarded as DcgmReader's updateFrequency
        keep_time: forwarded as DcgmReader's maxKeepAge
        ignores: forwarded as DcgmReader's ignoreList. None (the default)
                 means an empty list.
        field_groups: forwarded as DcgmReader's fieldGroupName

    Prints one line per (gpu, field) pair and returns nothing.
    """
    # Use a fresh list per call: a literal [] default is created once and
    # shared by every call, so any mutation by the reader would leak into
    # later calls.
    if ignores is None:
        ignores = []

    # Instantiate a DcgmReader object
    dr = DcgmReader(fieldIds=field_ids, updateFrequency=update_frequency, maxKeepAge=keep_time, ignoreList=ignores, fieldGroupName=field_groups)

    # Get the latest values as a dictionary of dictionaries:
    # gpuId -> field name -> field value
    data = dr.GetLatestGpuValuesAsFieldNameDict()

    # Print the dictionary
    for gpuId in data:
        for fieldName in data[gpuId]:
            print("For gpu %s field %s=%s" % (str(gpuId), fieldName, data[gpuId][fieldName]))