def _InitHandles(self):
    """Connect to the DCGM host engine and build the GPU/NvSwitch groups,
    error field groups, and field watchers used for error monitoring."""
    self._dcgmHandle = pydcgm.DcgmHandle(ipAddress=self._hostname)

    # Entity groups: all GPUs and all NvSwitches known to the host engine.
    self._allGpusGroup = pydcgm.DcgmGroup(
        self._dcgmHandle,
        groupName="error_mon_gpus" + self._pidPostfix,
        groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    print("Found %d GPUs" % (len(self._allGpusGroup.GetEntities())))

    self._allNvSwitchesGroup = pydcgm.DcgmGroup(
        self._dcgmHandle,
        groupName="error_mon_nvswitches" + self._pidPostfix,
        groupType=dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES)
    print("Found %d NvSwitches" % len(self._allNvSwitchesGroup.GetEntities()))

    # Field groups holding the error field ids for each entity type.
    self._nvSwitchErrorFieldGroup = pydcgm.DcgmFieldGroup(
        self._dcgmHandle,
        name="error_mon_nvswitches" + self._pidPostfix,
        fieldIds=self._nvSwitchErrorFieldIds)
    self._gpuErrorFieldGroup = pydcgm.DcgmFieldGroup(
        self._dcgmHandle,
        name="error_mon_gpus" + self._pidPostfix,
        fieldIds=self._gpuErrorFieldIds)

    # Sample at half the configured interval; DCGM expects microseconds.
    updateFreqUsec = int(self._updateIntervalSecs / 2.0) * 1000000
    keepAgeSecs = 3600.0  # 1 hour
    keepSamples = 0  # no sample-count quota; keepAgeSecs enforces retention

    self._nvSwitchWatcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
        self._dcgmHandle.handle, self._allNvSwitchesGroup.GetId(),
        self._nvSwitchErrorFieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
        updateFreqUsec, keepAgeSecs, keepSamples, 0)
    self._gpuWatcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
        self._dcgmHandle.handle, self._allGpusGroup.GetId(),
        self._gpuErrorFieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
        updateFreqUsec, keepAgeSecs, keepSamples, 0)
def GetFieldMetadata(self):
    """Create our field group on the host engine and cache field metadata.

    Any pre-existing field group with our name is deleted first. After this
    call, self.m_fieldGroup is the created group and self.m_fieldIdToInfo
    maps each published field id to its dcgm_fields metadata.

    Raises:
        dcgm_structs.DCGMError: DCGM_ST_UNKNOWN_FIELD if metadata cannot be
            found for any published field id.
    """
    self.m_fieldIdToInfo = {}

    # Remove our field group if it exists already
    findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(
        self.m_fieldGroupName)
    if findByNameId is not None:
        delFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle=self.m_dcgmHandle,
                                              fieldGroupId=findByNameId)
        delFieldGroup.Delete()
        del delFieldGroup

    self.m_fieldGroup = pydcgm.DcgmFieldGroup(self.m_dcgmHandle,
                                              self.m_fieldGroupName,
                                              self.m_publishFieldIds)

    for fieldId in self.m_fieldGroup.fieldIds:
        # GetFieldById returns a falsy value (0/None) for unknown field ids.
        fieldInfo = self.m_dcgmSystem.fields.GetFieldById(fieldId)
        self.m_fieldIdToInfo[fieldId] = fieldInfo
        if fieldInfo == 0 or fieldInfo is None:
            self.LogError(
                "Cannot get field tag for field id %d. Please check dcgm_fields to see if it is valid."
                % (fieldId))
            raise dcgm_structs.DCGMError(dcgm_structs.DCGM_ST_UNKNOWN_FIELD)
def GetFieldMetadata(self):
    """Create per-interval field groups plus one group of ALL fields, and
    cache metadata for every published field id.

    self.m_publishFields maps a sampling interval to the field ids published
    at that interval; each interval gets its own field group named
    "<base>_<index>". self.m_fieldGroup is the combined group of all fields.

    Raises:
        dcgm_structs.DCGMError: DCGM_ST_UNKNOWN_FIELD if metadata cannot be
            found for any published field id.
    """
    self.m_fieldIdToInfo = {}
    self.m_fieldGroups = {}
    self.m_fieldGroup = None
    allFieldIds = []

    # Initialize groups for all field intervals.
    self.LogInfo("GetFieldMetaData:\n")

    intervalIndex = 0
    for interval, fieldIds in self.m_publishFields.items():
        self.LogInfo("sampling interval = " + str(interval) + ":\n")
        for fieldId in fieldIds:
            self.LogInfo(" fieldId: " + str(fieldId) + "\n")

        intervalIndex += 1
        fieldGroupName = self.m_fieldGroupName + "_" + str(intervalIndex)
        findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(fieldGroupName)
        self.LogInfo("fieldGroupName: " + fieldGroupName + "\n")

        # Remove our field group if it exists already
        if findByNameId is not None:
            # BUG FIX: findByNameId is not a str (it carries a .value), so
            # concatenating it directly raised TypeError; convert explicitly.
            self.LogInfo("fieldGroupId: " + str(findByNameId) + "\n")
            delFieldGroup = pydcgm.DcgmFieldGroup(
                dcgmHandle=self.m_dcgmHandle, fieldGroupId=findByNameId)
            delFieldGroup.Delete()
            del delFieldGroup

        self.m_fieldGroups[interval] = pydcgm.DcgmFieldGroup(
            self.m_dcgmHandle, fieldGroupName, fieldIds)

        for fieldId in fieldIds:
            if fieldId not in allFieldIds:
                allFieldIds.append(fieldId)
            # GetFieldById returns a falsy value (0/None) for unknown ids.
            fieldInfo = self.m_dcgmSystem.fields.GetFieldById(fieldId)
            self.m_fieldIdToInfo[fieldId] = fieldInfo
            if fieldInfo == 0 or fieldInfo is None:
                self.LogError(
                    "Cannot get field tag for field id %d. Please check dcgm_fields to see if it is valid."
                    % (fieldId))
                raise dcgm_structs.DCGMError(
                    dcgm_structs.DCGM_ST_UNKNOWN_FIELD)

    # Initialize a field group of ALL fields.
    fieldGroupName = self.m_fieldGroupName
    findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(fieldGroupName)

    # Remove our field group if it exists already
    if findByNameId is not None:
        delFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle=self.m_dcgmHandle,
                                              fieldGroupId=findByNameId)
        delFieldGroup.Delete()
        del delFieldGroup

    self.m_fieldGroup = pydcgm.DcgmFieldGroup(self.m_dcgmHandle,
                                              fieldGroupName, allFieldIds)
def test_dcgm_field_group_get_by_name(handle):
    """A created field group must be findable by name, and the id found by
    name must match the id of the object we created."""
    fieldIds = [dcgm_fields.DCGM_FI_DRIVER_VERSION,
                dcgm_fields.DCGM_FI_DEV_NAME,
                dcgm_fields.DCGM_FI_DEV_BRAND]
    handle = pydcgm.DcgmHandle(handle)
    fieldGroupName = "mygroup"
    # Use the variable rather than repeating the literal so creation and the
    # by-name lookup below can never drift apart.
    fieldGroupObj = pydcgm.DcgmFieldGroup(handle, fieldGroupName, fieldIds)

    findByNameId = handle.GetSystem().GetFieldGroupIdByName(fieldGroupName)
    assert findByNameId is not None, "Expected field group ID. Got None"
    assert int(findByNameId.value) == int(fieldGroupObj.fieldGroupId.value), \
        "Got field group ID handle mismatch %s != %s" % (findByNameId, fieldGroupObj.fieldGroupId)

    #Make sure we can create an object from our found id and delete it
    fieldGroupObj2 = pydcgm.DcgmFieldGroup(dcgmHandle=handle,
                                           fieldGroupId=findByNameId)
    fieldGroupObj2.Delete()
def test_dcgm_embedded_metadata_exectime_get_field_group_sane(handle):
    """
    Sanity test for API that gets execution time of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    systemObj = pydcgm.DcgmSystem(handle)
    groupObj = pydcgm.DcgmGroup(handle, groupName="metadata-test",
                                groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    watchedFieldIds = [
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group",
                                       watchedFieldIds)

    # Watch the group, force an introspection pass, then query exec time.
    updateFreqUsec = 1000
    _watch_field_group_basic(fieldGroup, handle.handle, groupObj.GetId(),
                             updateFreq=updateFreqUsec)
    systemObj.introspect.UpdateAll()

    execTime = systemObj.introspect.execTime.GetForFieldGroup(
        fieldGroup).aggregateInfo

    # test that all struct fields in the API response have reasonable values
    assert (100 < execTime.totalEverUpdateUsec < 100 * 1000), execTime.totalEverUpdateUsec
    assert (100 < execTime.recentUpdateUsec < 100 * 1000), execTime.recentUpdateUsec
    assert (updateFreqUsec == execTime.meanUpdateFreqUsec), execTime.meanUpdateFreqUsec
def test_dcgm_embedded_metadata_memory_get_field_sane(handle):
    '''
    Sanity test for API that gets memory usage of a single field
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handleObj = pydcgm.DcgmHandle(handle=handle)

    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]

    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)
    # Reuse the one handle wrapper; the original wrapped the same raw handle
    # in two additional redundant DcgmHandle objects.
    group = pydcgm.DcgmGroup(handleObj, groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    system = pydcgm.DcgmSystem(handleObj)

    _watch_field_group_basic(fieldGroup, handle, group.GetId())
    system.introspect.UpdateAll()

    memoryInfo = dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage(
        handle, fieldIds[0])

    logger.debug("field %s using %.2f KB" %
                 (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed / 1024.))

    # 0+ to 200 KB
    assert(0 < memoryInfo.aggregateInfo.bytesUsed < 1024*200), \
        'bytes used to store field was unreasonable for ID %s, bytes: %s' \
        % (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed)
def test_dcgm_embedded_metadata_memory_get_field_group_sane(handle):
    '''
    Sanity test for API that gets memory usage of a single field group
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handleObj = pydcgm.DcgmHandle(handle)
    groupObj = pydcgm.DcgmGroup(handleObj, groupName='test-metadata',
                                groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    systemObj = pydcgm.DcgmSystem(handleObj)

    watchedFieldIds = [dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
                       dcgm_fields.DCGM_FI_DEV_POWER_USAGE]
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group",
                                       watchedFieldIds)

    # ensure that the field group is watched
    _watch_field_group_basic(fieldGroup, handleObj.handle, groupObj.GetId())
    systemObj.introspect.UpdateAll()

    memInfo = systemObj.introspect.memory.GetForFieldGroup(fieldGroup)

    logger.debug("field group %s is using %.2f KB" %
                 (fieldGroup.name, memInfo.aggregateInfo.bytesUsed / 1024.))

    # 0+ to 20 MB
    assert(0 < memInfo.aggregateInfo.bytesUsed < 1024*1024*20), \
        'bytes used to store field was unreasonable for field group %s, bytes: %s' \
        % (fieldGroup.name, memInfo.aggregateInfo.bytesUsed)
def helper_unwatch_field_values_public(handle, gpuIds):
    """
    Verifies that dcgm can unwatch a field value
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    fieldIds = [fieldId]

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", fieldIds)

    updateFreq = 10000000
    maxKeepAge = 86400
    maxKeepSamples = 0

    def readWatcherCount(gpuId):
        # Current number of watchers registered for fieldId on this GPU.
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        return fieldInfo.numWatchers

    # Baseline watcher counts before our test begins. An unwatched field
    # raises DCGM_ST_NOT_WATCHED, which we treat as zero watchers.
    numWatchersBefore = {}
    for gpuId in gpuIds:
        try:
            numWatchersBefore[gpuId] = readWatcherCount(gpuId)
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_WATCHED) as e:
            numWatchersBefore[gpuId] = 0

    #Now watch the fields
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)

    # Watching must add exactly one watcher per GPU.
    numWatchersWithWatch = {gpuId: readWatcherCount(gpuId)
                            for gpuId in gpuIds}
    for gpuId in gpuIds:
        assert numWatchersWithWatch[gpuId] == numWatchersBefore[gpuId] + 1,\
            "Watcher mismatch at gpuId %d, numWatchersWithWatch[gpuId] %d != numWatchersBefore[gpuId] %d + 1" %\
            (gpuId, numWatchersWithWatch[gpuId], numWatchersBefore[gpuId])

    #Unwatch fields
    groupObj.samples.UnwatchFields(fieldGroup)

    # Unwatching must restore the original per-GPU watcher counts.
    numWatchersAfter = {gpuId: readWatcherCount(gpuId) for gpuId in gpuIds}

    assert numWatchersBefore == numWatchersAfter, "Expected numWatchersBefore (%s) to match numWatchersAfter %s" %\
        (str(numWatchersBefore), str(numWatchersAfter))
def test_dcgm_field_group_info(handle):
    """Metadata returned by dcgmFieldGroupGetInfo must match the group we
    just created: version, id, name, and the exact field id list."""
    fieldIds = [dcgm_fields.DCGM_FI_DRIVER_VERSION,
                dcgm_fields.DCGM_FI_DEV_NAME,
                dcgm_fields.DCGM_FI_DEV_BRAND]
    handle = pydcgm.DcgmHandle(handle)
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "mygroup", fieldIds)

    #Get the field group we just added to verify it was added and the metadata is correct
    fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(handle.handle,
                                                      fieldGroup.fieldGroupId)

    assert fieldGroupInfo.version == dcgm_structs.dcgmFieldGroupInfo_version1, fieldGroupInfo.version
    assert fieldGroupInfo.fieldGroupId == int(fieldGroup.fieldGroupId.value), \
        "%s != %s" % (str(fieldGroupInfo.fieldGroupId), str(fieldGroup.fieldGroupId))
    assert fieldGroupInfo.fieldGroupName == fieldGroup.name, str(fieldGroupInfo.name)
    assert fieldGroupInfo.numFieldIds == len(fieldIds), fieldGroupInfo.numFieldIds

    for idx, expectedId in enumerate(fieldIds):
        assert fieldGroupInfo.fieldIds[idx] == expectedId, \
            "i = %d, %d != %d" % (idx, fieldGroupInfo.fieldIds[idx], expectedId)
def helper_promote_field_values_watch_public(handle, gpuIds):
    """
    Verifies that dcgm can update a field value watch
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    fieldIds = [fieldId]

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", fieldIds)

    updateFreq = 100000  #100 msec
    maxKeepAge = 3600
    maxKeepSamples = 0

    def verifyFreqAndCountWatchers(expectedFreqUsec):
        # Asserts the cache manager's monitor frequency matches the last
        # requested value; returns gpuId -> numWatchers.
        counts = {}
        for gpuId in gpuIds:
            fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handleObj.handle, gpuId, fieldId)
            counts[gpuId] = fieldInfo.numWatchers
            assert fieldInfo.monitorFrequencyUsec == expectedFreqUsec, "after watch: fieldInfo.monitorFrequencyUsec %d != updateFreq %d" % \
                (fieldInfo.monitorFrequencyUsec, expectedFreqUsec)
        return counts

    #Watch the fields
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)
    numWatchersWithWatch = verifyFreqAndCountWatchers(updateFreq)

    #Update the watch with a faster update frequency
    updateFreq = 50000  #50 msec
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)
    numWatchersAfter = verifyFreqAndCountWatchers(updateFreq)

    # Promotion must update the existing sub-watch, not create another one.
    assert numWatchersWithWatch == numWatchersAfter, "numWatchersWithWatch (%s) != numWatchersAfter (%s)" % \
        (str(numWatchersWithWatch), str(numWatchersAfter))
def main():
    """Embedded-hostengine sample: watch SM and memory clock fields on all
    GPUs with two watcher flavors and print newly arrived values forever."""
    operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
    timeStep = 1.0  # seconds between polls

    # Initialization order matters: library init, then agent init, then the
    # embedded host engine can be started.
    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit()  #Will throw an exception on error
    handle = dcgm_agent.dcgmStartEmbedded(operationMode)
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS
    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK, dcgm_fields.DCGM_FI_DEV_MEM_CLOCK
    ]

    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    updateFreq = int(timeStep * 1000000.0)  # DCGM expects microseconds
    maxKeepAge = 3600.0  #1 hour
    maxKeepSamples = 0  #unlimited. maxKeepAge will enforce quota
    startTimestamp = 0  #beginning of time

    # Two views of the same fields: per-GPU and per-entity watchers.
    dfcw = DcgmFieldGroupWatcher(handle, groupId, fieldGroup, operationMode,
                                 updateFreq, maxKeepAge, maxKeepSamples,
                                 startTimestamp)
    dfcw2 = DcgmFieldGroupEntityWatcher(handle, groupId, fieldGroup,
                                        operationMode, updateFreq, maxKeepAge,
                                        maxKeepSamples, startTimestamp)

    while (True):
        newUpdateCount = dfcw.GetAllSinceLastCall()
        newUpdateCount2 = dfcw2.GetAllSinceLastCall()
        print("Got %d and %d new field value updates" %
              (newUpdateCount, newUpdateCount2))
        # dfcw.values is keyed gpuId -> fieldId -> [samples]
        for gpuId in list(dfcw.values.keys()):
            print("gpuId %d" % gpuId)
            for fieldId in list(dfcw.values[gpuId].keys()):
                print(" fieldId %d: %d values. latest timestamp %d" % \
                      (fieldId, len(dfcw.values[gpuId][fieldId]),
                       dfcw.values[gpuId][fieldId][-1].ts))
        # dfcw2.values is keyed entityGroupId -> entityId -> fieldId -> [samples]
        for entityGroupId in list(dfcw2.values.keys()):
            print("entityGroupId %d" % entityGroupId)
            for entityId in list(dfcw2.values[entityGroupId].keys()):
                print(" entityId %d" % entityId)
                for fieldId in list(
                        dfcw2.values[entityGroupId][entityId].keys()):
                    print(" fieldId %d: %d values. latest timestamp %d" % \
                          (fieldId,
                           len(dfcw2.values[entityGroupId][entityId][fieldId]),
                           dfcw2.values[entityGroupId][entityId][fieldId][-1].ts))
        time.sleep(timeStep)
def test_dcgm_field_group_add_remove(handle):
    """Dropping the last reference to a field group object must delete the
    group on the host engine: a later info query returns DCGM_ST_NO_DATA."""
    fieldIds = [dcgm_fields.DCGM_FI_DRIVER_VERSION,
                dcgm_fields.DCGM_FI_DEV_NAME,
                dcgm_fields.DCGM_FI_DEV_BRAND]
    handle = pydcgm.DcgmHandle(handle)
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "mygroup", fieldIds)

    #Save this ID before we mess with the object
    fieldGroupId = fieldGroup.fieldGroupId

    #This will assert on error
    fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(handle.handle,
                                                      fieldGroupId)

    #Delete the field group and make sure it's gone from the host engine
    del fieldGroup
    fieldGroup = None

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(handle.handle,
                                                          fieldGroupId)
def test_dcgm_embedded_metadata_mean_update_frequency(handle):
    """
    Ensure that mean update frequency is being calculated properly
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # these frequencies must have a perfect integer mean or the last assertion will fail
    updateFreqs = {
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE: 10000,
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP: 20000,
    }
    meanUpdateFreq = stats.mean(updateFreqs.values())

    gpuId = group.GetGpuIds()[0]

    # Watch every field on one GPU, each at its own frequency.
    for fieldId, freqUsec in updateFreqs.items():
        dcgm_agent_internal.dcgmWatchFieldValue(handle.handle, gpuId, fieldId,
                                                freqUsec, 100000, 10)
    fieldIds = list(updateFreqs.keys())

    system.UpdateAllFields(True)
    system.introspect.UpdateAll()

    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group", fieldIds)
    execTime = system.introspect.execTime.GetForFieldGroup(fieldGroup)

    # Locate the response slot corresponding to our GPU.
    resultGpuIndex = -1
    for i in range(execTime.gpuInfoCount):
        if execTime.gpuIdsForGpuInfo[i] == gpuId:
            resultGpuIndex = i
            break
    assert (resultGpuIndex >= 0), "no results returned for the watched GPU"

    actualMeanUpdateFreq = execTime.gpuInfo[resultGpuIndex].meanUpdateFreqUsec
    assert (actualMeanUpdateFreq == meanUpdateFreq), "expected %s, got %s" \
        % (meanUpdateFreq, actualMeanUpdateFreq)
def __init__(self, dcgmHandle, gpuIds, fieldIds, watchIntervalSecs):
    """Build the GPU group and field group this object will watch.

    Args:
        dcgmHandle: pydcgm.DcgmHandle connected to a host engine.
        gpuIds: GPU ids to watch, or None to use the system default group.
        fieldIds: field ids to place in the created field group.
        watchIntervalSecs: sampling interval in seconds.
    """
    global nameIncrement
    self._dcgmHandle = dcgmHandle
    self._dcgmSystem = dcgmHandle.GetSystem()
    # Names must be unique per process; the module-level counter
    # disambiguates multiple instances created by the same process.
    gpuGroupName = "%d_%d" % (os.getpid(), nameIncrement)
    nameIncrement += 1
    if gpuIds is None:
        self._dcgmGroup = self._dcgmSystem.GetDefaultGroup()
    else:
        self._dcgmGroup = self._dcgmSystem.GetGroupWithGpuIds(
            gpuGroupName, gpuIds)
    self._watchIntervalSecs = watchIntervalSecs
    # The counter is bumped again so the field group gets its own name.
    fieldGroupName = "%d_%d" % (os.getpid(), nameIncrement)
    nameIncrement += 1
    self._dcgmFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle, fieldGroupName,
                                                 fieldIds, None)
def test_dcgm_field_group_get_info_validate(handle):
    """
    Validates structure version
    """
    fieldIds = [
        dcgm_fields.DCGM_FI_DRIVER_VERSION,
        dcgm_fields.DCGM_FI_DEV_NAME,
        dcgm_fields.DCGM_FI_DEV_BRAND,
    ]
    handle = pydcgm.DcgmHandle(handle)
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "mygroup", fieldIds)

    # Every bogus struct version must be rejected with DCGM_ST_VER_MISMATCH:
    # 0 is an invalid version, 50 is a random number version.
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            ret = vtDcgmFieldGroupGetInfo(handle.handle,
                                          fieldGroup.fieldGroupId,
                                          versionTest)
def test_dcgm_field_group_duplicate_name(handle):
    """Creating a second field group with an already-used name must fail
    with DCGM_ST_DUPLICATE_KEY."""
    fieldIds = [dcgm_fields.DCGM_FI_DRIVER_VERSION]
    handle = pydcgm.DcgmHandle(handle)
    firstGroup = pydcgm.DcgmFieldGroup(handle, "dupeme", fieldIds)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_DUPLICATE_KEY)):
        secondGroup = pydcgm.DcgmFieldGroup(handle, "dupeme", fieldIds)
def test_nvswitch_traffic_p2p(handle, switchIds):
    """
    Verifies that fabric can pass p2p read and write traffic successfully
    """
    test_utils.skip_test("Bandwidth field not being updated yet")

    # TX_0 and RX_0 on port 0
    nvSwitchBandwidth0FieldIds = list(
        range(dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00,
              dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 + 1))
    # TX_1 and RX_1 on port 0
    nvSwitchBandwidth1FieldIds = list(
        range(dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00,
              dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 + 1))

    dcgmHandle = pydcgm.DcgmHandle(ipAddress="127.0.0.1")

    allNvSwitchesGroup = pydcgm.DcgmGroup(
        dcgmHandle, groupName="test_nvswitches",
        groupType=dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES)

    nvSwitchBandwidth0FieldGroup = pydcgm.DcgmFieldGroup(
        dcgmHandle, name="test_nvswitches_bandwidth0",
        fieldIds=nvSwitchBandwidth0FieldIds)
    nvSwitchBandwidth1FieldGroup = pydcgm.DcgmFieldGroup(
        dcgmHandle, name="test_nvswitches_bandwidth1",
        fieldIds=nvSwitchBandwidth1FieldIds)

    updateFreq = int(20 / 2.0) * 1000000
    maxKeepAge = 600.0
    maxKeepSamples = 0

    nvSwitchBandwidth0Watcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
        dcgmHandle.handle, allNvSwitchesGroup.GetId(),
        nvSwitchBandwidth0FieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
        updateFreq, maxKeepAge, maxKeepSamples, 0)
    nvSwitchBandwidth1Watcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
        dcgmHandle.handle, allNvSwitchesGroup.GetId(),
        nvSwitchBandwidth1FieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
        updateFreq, maxKeepAge, maxKeepSamples, 0)

    def latestCounters(entityGroupId, entityId):
        # Latest (tx0, rx0, tx1, rx1) port-0 bandwidth counters for a switch.
        # RX field ids immediately follow their TX counterparts.
        fid0 = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00
        fid1 = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00
        vals0 = nvSwitchBandwidth0Watcher.values[entityGroupId][entityId]
        vals1 = nvSwitchBandwidth1Watcher.values[entityGroupId][entityId]
        return (vals0[fid0].values[-1].value,
                vals0[fid0 + 1].values[-1].value,
                vals1[fid1].values[-1].value,
                vals1[fid1 + 1].values[-1].value)

    # wait for FM reports and populates stats
    time.sleep(30)

    # read the counters before sending traffic
    nvSwitchBandwidth0Watcher.GetMore()
    nvSwitchBandwidth1Watcher.GetMore()
    for entityGroupId in nvSwitchBandwidth0Watcher.values.keys():
        for entityId in nvSwitchBandwidth0Watcher.values[entityGroupId]:
            (counter0TxBefore, counter0RxBefore,
             counter1TxBefore, counter1RxBefore) = latestCounters(
                 entityGroupId, entityId)

    # Generate write traffic for the nvswitches
    test_utils.run_p2p_bandwidth_app(
        test_nvswitch_utils.MEMCPY_DTOD_WRITE_CE_BANDWIDTH)

    # Generate read traffic for the nvswitches
    test_utils.run_p2p_bandwidth_app(
        test_nvswitch_utils.MEMCPY_DTOD_READ_CE_BANDWIDTH)

    # read the counters again after sending traffic
    nvSwitchBandwidth0Watcher.GetMore()
    nvSwitchBandwidth1Watcher.GetMore()
    for entityGroupId in nvSwitchBandwidth0Watcher.values.keys():
        for entityId in nvSwitchBandwidth0Watcher.values[entityGroupId]:
            (counter0TxAfter, counter0RxAfter,
             counter1TxAfter, counter1RxAfter) = latestCounters(
                 entityGroupId, entityId)

            assert counter0TxAfter > counter0TxBefore, "Counter0Tx did not increase"
            assert counter0RxAfter > counter0RxBefore, "counter0Rx did not increase"
            assert counter1TxAfter > counter1TxBefore, "Counter1Tx did not increase"
            assert counter1RxAfter > counter1RxBefore, "counter1Rx did not increase"
def _gather_perf_timeseries(handle, watchedFieldIds):
    '''
    Gathers metadata over time and returns a tuple of 4 MetadataTimeseries
    (mem usage, exec time, avg exec time, cpu utilization)
    '''
    system = pydcgm.DcgmSystem(handle)

    memUsageTS = MetadataTimeseries()
    execTimeTS = MetadataTimeseries()
    execTimeAvgTS = MetadataTimeseries()
    cpuUtilTS = CpuTimeseries()

    # Cap how many fields go into each field group.
    numFields = min(len(watchedFieldIds), 50)

    fieldGroups = []

    # Five field groups over the same leading fields so that per-field-group
    # introspection overhead is exercised for several groups at once.
    for i in range(1, 6):
        fieldGroups.append(
            pydcgm.DcgmFieldGroup(handle, "my_field_group_%d" % i,
                                  list(watchedFieldIds)[0:numFields]))

    startTime = datetime.datetime.now()

    # Poll roughly every 50 ms until the bounded test duration elapses.
    while (datetime.datetime.now() -
           startTime).total_seconds() < BOUNDED_TEST_DURATION:

        # poll memory usage
        memUsageTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())

        memUsageTS.processVals.append(
            system.introspect.memory.GetForHostengine().bytesUsed)
        memUsageTS.allFieldsVals.append(
            system.introspect.memory.GetForAllFields().aggregateInfo.bytesUsed)

        for id in watchedFieldIds:  # NOTE: shadows builtin id() in this scope
            memUsageTS.fieldVals[id].append(
                dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage(
                    handle.handle, id).aggregateInfo.bytesUsed)

        for fieldGroup in fieldGroups:
            memUsageTS.fieldGroupVals[int(
                fieldGroup.fieldGroupId.value)].append(
                    system.introspect.memory.GetForFieldGroup(
                        fieldGroup).aggregateInfo.bytesUsed)

        # poll execution time
        execTimeTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())

        execTimeTS.allFieldsVals.append(
            system.introspect.execTime.GetForAllFields(
            ).aggregateInfo.totalEverUpdateUsec)

        for id in watchedFieldIds:
            execTimeTS.fieldVals[id].append(
                dcgm_agent_internal.dcgmIntrospectGetFieldExecTime(
                    handle.handle, id).aggregateInfo.totalEverUpdateUsec)
            #logger.info("fieldId %d: %s" % (id, str(execTimeTS.fieldVals[id][-1])))

        for fieldGroup in fieldGroups:
            execTimeTS.fieldGroupVals[int(
                fieldGroup.fieldGroupId.value)].append(
                    system.introspect.execTime.GetForFieldGroup(
                        fieldGroup).aggregateInfo.totalEverUpdateUsec)

        # poll average execution time
        execTimeAvgTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())

        execTimeAvgTS.allFieldsVals.append(
            system.introspect.execTime.GetForAllFields(
            ).aggregateInfo.recentUpdateUsec)

        for id in watchedFieldIds:
            execTimeAvgTS.fieldVals[id].append(
                dcgm_agent_internal.dcgmIntrospectGetFieldExecTime(
                    handle.handle, id).aggregateInfo.recentUpdateUsec)

        for fieldGroup in fieldGroups:
            execTimeAvgTS.fieldGroupVals[int(
                fieldGroup.fieldGroupId.value)].append(
                    system.introspect.execTime.GetForFieldGroup(
                        fieldGroup).aggregateInfo.recentUpdateUsec)

        # poll cpu utilization
        cpuUtilTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())
        cpuUtilTS.cpuInfo.append(system.introspect.cpuUtil.GetForHostengine())

        time.sleep(0.050)

    return memUsageTS, execTimeTS, execTimeAvgTS, cpuUtilTS