def test_dcgm_engine_unwatch_field_value(handle):
    """
    Verifies that the cache manager can unwatch a field value
    """
    # Watch field so we can fetch it
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    gpuId = 0

    ret = dcgm_agent_internal.dcgmWatchFieldValue(handle, gpuId, fieldId,
                                                  10000000, 86400.0, 0)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fieldId)
    numWatchersBefore = fieldInfo.numWatchers

    # Unwatch field
    clearCache = 1
    ret = dcgm_agent_internal.dcgmUnwatchFieldValue(handle, gpuId, fieldId,
                                                    clearCache)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fieldId)
    numWatchersAfter = fieldInfo.numWatchers

    assert numWatchersAfter == numWatchersBefore - 1, "Expected 1 fewer watcher. Before %d. After %d" % (
        numWatchersBefore, numWatchersAfter)
    assert (numWatchersAfter > 0) or (0 == (
        fieldInfo.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED
    )), "Expected no watch. got flags %08X" % fieldInfo.flags

def test_dcgm_engine_watch_field_values(handle):
    """
    Verifies that the cache manager can watch a field value
    """
    # Watch field so we can fetch it
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    gpuId = 0

    try:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handle, gpuId, fieldId)
        numWatchersBefore = fieldInfo.numWatchers
    except dcgm_structs.dcgmExceptionClass(
            dcgm_structs.DCGM_ST_NOT_WATCHED) as e:
        numWatchersBefore = 0

    ret = dcgm_agent_internal.dcgmWatchFieldValue(handle, gpuId, fieldId,
                                                  10000000, 86400.0, 0)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fieldId)
    assert fieldInfo.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED, \
        "Expected watch. got flags %08X" % fieldInfo.flags

    numWatchersAfter = fieldInfo.numWatchers
    assert numWatchersAfter == numWatchersBefore + 1, "Expected 1 extra watcher. Before %d. After %d" % (
        numWatchersBefore, numWatchersAfter)

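# For readability, the raw watch parameters used in the two tests above can be
# given names. This is only an illustrative sketch (these constants are not
# part of the original suite); the values match the literals passed to
# dcgmWatchFieldValue above:

WATCH_UPDATE_FREQ_USEC = 10000000  # sample the field every 10 seconds
WATCH_MAX_KEEP_AGE_SEC = 86400.0   # keep cached samples for one day
WATCH_MAX_KEEP_SAMPLES = 0         # 0 = no explicit cap on sample count

# e.g.:
# ret = dcgm_agent_internal.dcgmWatchFieldValue(handle, gpuId, fieldId,
#                                               WATCH_UPDATE_FREQ_USEC,
#                                               WATCH_MAX_KEEP_AGE_SEC,
#                                               WATCH_MAX_KEEP_SAMPLES)
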
def helper_unwatch_field_values_public(handle, gpuIds):
    """
    Verifies that dcgm can unwatch a field value
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    fieldIds = [
        fieldId,
    ]

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", fieldIds)
    updateFreq = 10000000
    maxKeepAge = 86400
    maxKeepSamples = 0

    #These are all gpuId -> watcher count
    numWatchersBefore = {}
    numWatchersWithWatch = {}
    numWatchersAfter = {}

    #Get watch info before our test begins
    for gpuId in gpuIds:
        try:
            fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handleObj.handle, gpuId, fieldId)
            numWatchersBefore[gpuId] = fieldInfo.numWatchers
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_WATCHED) as e:
            numWatchersBefore[gpuId] = 0

    #Now watch the fields
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)

    #Get watcher info after our watch and check it against before our watch
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersWithWatch[gpuId] = fieldInfo.numWatchers
        assert numWatchersWithWatch[gpuId] == numWatchersBefore[gpuId] + 1,\
            "Watcher mismatch at gpuId %d, numWatchersWithWatch[gpuId] %d != numWatchersBefore[gpuId] %d + 1" %\
            (gpuId, numWatchersWithWatch[gpuId], numWatchersBefore[gpuId])

    #Unwatch fields
    groupObj.samples.UnwatchFields(fieldGroup)

    #Get watcher count after our unwatch. This should match our original watch count
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersAfter[gpuId] = fieldInfo.numWatchers

    assert numWatchersBefore == numWatchersAfter, "Expected numWatchersBefore (%s) to match numWatchersAfter %s" %\
        (str(numWatchersBefore), str(numWatchersAfter))

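# The helper_*_public helpers in this file are meant to be driven by test
# entry points. A hypothetical wrapper is sketched below; the decorator names
# follow patterns used elsewhere in this suite and are an assumption here:
#
# @test_utils.run_with_embedded_host_engine()
# @test_utils.run_only_with_live_gpus()
# def test_dcgm_unwatch_field_values_public_embedded(handle, gpuIds):
#     helper_unwatch_field_values_public(handle, gpuIds)
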
def test_dcgm_prof_all_supported_fields_watchable(handle, gpuId):
    '''
    Verify that all fields that are reported as supported are watchable and
    that values can be returned for them
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', [gpuId])

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_supported_field_ids(dcgmGroup)
    assert fieldIds is not None

    watchFreq = 1000  #1 ms
    maxKeepAge = 60.0
    maxKeepSamples = 0
    maxAgeUsec = int(maxKeepAge) * watchFreq

    entityPairList = [
        dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU, gpuId)
    ]

    for fieldId in fieldIds:
        # If the group contains only unsupported SKUs, WatchFields returns an
        # error. If at least one GPU in the group is supported, WatchFields
        # succeeds. This check skips unsupported or fake SKUs.
        if dcgmGroup.profiling.WatchFields(
                [fieldId, ], watchFreq, maxKeepAge, maxKeepSamples
        ) == dcgm_structs.DCGM_ST_PROFILING_NOT_SUPPORTED:
            test_utils.skip_test_supported("DCP")

        # Sending a request to the profiling manager guarantees that an update
        # cycle has happened since the last request
        dcgmGroup.profiling.GetSupportedMetricGroups()

        # validate watch freq, quota, and watched flags
        cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handle, gpuId, fieldId)
        assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) != 0, \
            "gpuId %u, fieldId %u not watched" % (gpuId, fieldId)
        assert cmfi.numSamples > 0
        assert cmfi.numWatchers == 1, "numWatchers %d" % cmfi.numWatchers
        assert cmfi.monitorFrequencyUsec == watchFreq, \
            "monitorFrequencyUsec %u != watchFreq %u" % (
                cmfi.monitorFrequencyUsec, watchFreq)
        assert cmfi.lastStatus == dcgm_structs.DCGM_ST_OK, \
            "lastStatus %u != DCGM_ST_OK" % (cmfi.lastStatus)

        fieldValues = dcgm_agent.dcgmEntitiesGetLatestValues(
            handle, entityPairList, [fieldId, ], 0)

        for i, fieldValue in enumerate(fieldValues):
            logger.debug(str(fieldValue))
            assert (fieldValue.status == dcgm_structs.DCGM_ST_OK), \
                "idx %d status was %d" % (i, fieldValue.status)
            assert (fieldValue.ts != 0), "idx %d timestamp was 0" % (i)

        dcgmGroup.profiling.UnwatchFields()

        #Validate watch flags after unwatch
        cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handle, gpuId, fieldId)
        assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) == 0, \
            "gpuId %u, fieldId %u still watched. flags x%X" % (
                gpuId, fieldId, cmfi.flags)
        assert cmfi.numWatchers == 0, "numWatchers %d" % cmfi.numWatchers

def helper_promote_field_values_watch_public(handle, gpuIds):
    """
    Verifies that dcgm can update a field value watch
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    fieldIds = [
        fieldId,
    ]

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", fieldIds)
    updateFreq = 100000  #100 msec
    maxKeepAge = 3600
    maxKeepSamples = 0

    #Track the number of watchers to make sure our watch promotion doesn't
    #create another sub-watch but rather updates the existing one
    numWatchersWithWatch = {}
    numWatchersAfter = {}

    #Watch the fields
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)

    #Get watcher info after our watch and verify that the updateFrequency matches
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersWithWatch[gpuId] = fieldInfo.numWatchers
        assert fieldInfo.monitorFrequencyUsec == updateFreq, "after watch: fieldInfo.monitorFrequencyUsec %d != updateFreq %d" % \
            (fieldInfo.monitorFrequencyUsec, updateFreq)

    #Update the watch with a faster update frequency
    updateFreq = 50000  #50 msec
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)

    #Get watcher info after our second watch and verify that the updateFrequency matches
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersAfter[gpuId] = fieldInfo.numWatchers
        assert fieldInfo.monitorFrequencyUsec == updateFreq, "after watch: fieldInfo.monitorFrequencyUsec %d != updateFreq %d" % \
            (fieldInfo.monitorFrequencyUsec, updateFreq)

    assert numWatchersWithWatch == numWatchersAfter, "numWatchersWithWatch (%s) != numWatchersAfter (%s)" % \
        (str(numWatchersWithWatch), str(numWatchersAfter))

totalSampleCount = 0
cycleCount = 0

while True:
    cycleCount += 1
    print("Cycle %d. Fields that updated in the last 60 seconds" % cycleCount)

    driverTimeByFieldId = {}
    watchIntervalByFieldId = {}

    for gpuId in gpuIds:
        for fieldId in fieldIds:
            watchInfo = None
            try:
                watchInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                    dcgmHandle.handle, gpuId, fieldId)
            except:
                pass

            if watchInfo is None:
                continue

            nowTs = int(time.time() * 1000000)
            oneMinuteAgoTs = nowTs - 60000000

            if watchInfo.newestTimestamp < oneMinuteAgoTs:
                continue

            perUpdate = 0
            if watchInfo.fetchCount > 0:
                perUpdate = watchInfo.execTimeUsec / watchInfo.fetchCount

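# The monitoring loop above is truncated in this excerpt. As a self-contained
# sketch (not the original continuation), the per-update cost computed there
# could be reported with a helper like the following; the attribute names
# match the cache-manager field-info struct used above:

def report_per_update_cost(fieldId, watchInfo):
    # Hypothetical helper: print the average driver time per fetch for one
    # watched field, guarding against a zero fetch count.
    perUpdate = 0.0
    if watchInfo.fetchCount > 0:
        perUpdate = float(watchInfo.execTimeUsec) / watchInfo.fetchCount
    print("fieldId %d: fetchCount %d, %.1f usec/update" %
          (fieldId, watchInfo.fetchCount, perUpdate))
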
def test_dcgm_injection_agent(handle, gpuIds):
    """
    Verifies that injection works with the agent host engine
    """
    gpuId = gpuIds[0]

    #Make a base value that is good for starters
    fvGood = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    fvGood.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
    fvGood.status = 0
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    fvGood.ts = get_usec_since_1970()
    fvGood.value.i64 = 1

    fieldInfoBefore = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countBefore = fieldInfoBefore.numSamples

    #This will throw an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvGood)

    fieldInfoAfter = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countAfter = fieldInfoAfter.numSamples

    assert countAfter > countBefore, "Expected countAfter %d > countBefore %d after injection" % (
        countAfter, countBefore)

    #Fetch the value we just inserted and verify its attributes are the same
    fvFetched = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, gpuId, [
            fvGood.fieldId,
        ])[0]
    helper_verify_fv_equal(fvFetched, fvGood)

    #Should be able to insert a null timestamp. The agent will just use "now"
    #(note that this assignment aliases fvGood rather than copying it)
    fvAlsoGood = fvGood
    fvAlsoGood.ts = 0
    #This will throw an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvAlsoGood)

    #Now make some attributes bad and expect an error
    fvBad = fvGood
    fvBad.fieldType = ord(dcgm_fields.DCGM_FT_DOUBLE)
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    """ TODO: DCGM-2130 - Restore this test when protobuf is removed
    #Now make some attributes bad and expect an error
    fvBad = fvGood
    fvBad.version = 0
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)
    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    """

    fvBad = fvGood
    fvBad.fieldId = dcgm_fields.DCGM_FI_MAX_FIELDS + 100
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)

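# get_usec_since_1970() and helper_verify_fv_equal() are referenced above but
# not defined in this section. Below are minimal sketches of plausible
# implementations, based only on how they are called here; the real versions
# in the test suite may differ.

import time

def get_usec_since_1970():
    # Microseconds since the Unix epoch, matching the usec timestamps DCGM
    # uses for field values.
    return int(time.time() * 1000000)

def helper_verify_fv_equal(fvFetched, fvInjected):
    # Assumes an int64 field, as in the test above; compares the attributes
    # that injection should preserve.
    assert fvFetched.fieldId == fvInjected.fieldId, "fieldId mismatch"
    assert fvFetched.ts == fvInjected.ts, "timestamp mismatch"
    assert fvFetched.value.i64 == fvInjected.value.i64, "value mismatch"
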
def test_dcgm_prof_all_supported_fields_watchable(handle, gpuIds):
    '''
    Verify that all fields that are reported as supported are watchable and
    that values can be returned for them
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_supported_field_ids(dcgmGroup)
    assert fieldIds is not None

    watchFreq = 1000  #1 ms
    maxKeepAge = 60.0
    maxKeepSamples = 0
    maxAgeUsec = int(maxKeepAge) * watchFreq

    entityPairList = []
    for gpuId in gpuIds:
        entityPairList.append(
            dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU,
                                                 gpuId))

    for fieldId in fieldIds:
        dcgmGroup.profiling.WatchFields([
            fieldId,
        ], watchFreq, maxKeepAge, maxKeepSamples)

        # Sending a request to the profiling manager guarantees that an update
        # cycle has happened since the last request
        dcgmGroup.profiling.GetSupportedMetricGroups()

        # validate watch freq, quota, and watched flags
        for gpuId in gpuIds:
            cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handle, gpuId, fieldId)

            assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) != 0, \
                "gpuId %u, fieldId %u not watched" % (gpuId, fieldId)
            assert cmfi.numSamples > 0
            assert cmfi.numWatchers == 1, "numWatchers %d" % cmfi.numWatchers
            assert cmfi.monitorFrequencyUsec == watchFreq, \
                "monitorFrequencyUsec %u != watchFreq %u" % (
                    cmfi.monitorFrequencyUsec, watchFreq)
            assert cmfi.lastStatus == dcgm_structs.DCGM_ST_OK, \
                "lastStatus %u != DCGM_ST_OK" % (cmfi.lastStatus)

        fieldValues = dcgm_agent.dcgmEntitiesGetLatestValues(
            handle, entityPairList, [
                fieldId,
            ], 0)

        for i, fieldValue in enumerate(fieldValues):
            logger.debug(str(fieldValue))
            assert (fieldValue.status == dcgm_structs.DCGM_ST_OK), \
                "idx %d status was %d" % (i, fieldValue.status)
            assert (fieldValue.ts != 0), "idx %d timestamp was 0" % (i)

        dcgmGroup.profiling.UnwatchFields()

        #Validate watch flags after unwatch
        for gpuId in gpuIds:
            cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handle, gpuId, fieldId)
            assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) == 0, \
                "gpuId %u, fieldId %u still watched. flags x%X" % (
                    gpuId, fieldId, cmfi.flags)
            assert cmfi.numWatchers == 0, "numWatchers %d" % cmfi.numWatchers