def test_dcgm_reader_default(handle):
    '''
    Verify that a default-constructed DcgmReader returns sane latest values.

    handle: passed straight to DcgmReader.SetHandle(); presumably a DCGM
            handle supplied by the test framework (not visible here).

    Checks, for every GPU in both the field-name-keyed and the
    field-id-keyed result:
      - no more entries are returned than there are default field ids
      - name-keyed results use str keys
      - id-keyed results use int keys that resolve to field metadata
    '''
    # pylint: disable=undefined-variable
    dr = DcgmReader()
    dr.SetHandle(handle)

    latest = dr.GetLatestGpuValuesAsFieldNameDict()
    for gpuId in latest:
        # latest data might be less than the list, because blank values aren't included
        # Defined in DcgmReader
        # pylint: disable=undefined-variable
        assert len(latest[gpuId]) <= len(defaultFieldIds)

        # Make sure we get strings
        for key in latest[gpuId]:
            assert isinstance(key, str)

    sample = dr.GetLatestGpuValuesAsFieldIdDict()
    for gpuId in sample:
        # Defined in DcgmReader
        # pylint: disable=undefined-variable
        assert len(sample[gpuId]) <= len(defaultFieldIds)

        # Make sure we get valid integer field ids
        for fieldId in sample[gpuId]:
            assert isinstance(fieldId, int)
            # Identity check: None is a singleton, so compare with "is not"
            # rather than relying on the metadata object's != behaviour.
            assert dcgm_fields.DcgmFieldGetById(fieldId) is not None
def GetFieldById(self, fieldId):
    '''
    Look up the metadata record of a single field.

    fieldId: dcgm_fields.DCGM_FI_* field ID of the field to look up

    Returns the field's dcgm_fields.c_dcgm_field_meta_t struct on success,
    or None on error.
    '''
    # Pure delegation to the module-level lookup; self is not used.
    fieldMeta = dcgm_fields.DcgmFieldGetById(fieldId)
    return fieldMeta
def helper_field_has_variable_size(fieldId):
    '''
    Returns True if a field has a variable memory size per record. False if it doesn't.

    fieldId: dcgm_fields.DCGM_FI_* field ID to classify

    A field counts as variable-size when it is one of the explicitly listed
    sample/PID fields below, or when its metadata reports the binary field
    type. A field ID with no metadata (DcgmFieldGetById returned None) is
    reported as not variable-size instead of raising AttributeError.
    '''
    # Fields treated as variable-size regardless of their metadata type.
    variableSizeFieldIds = (
        dcgm_fields.DCGM_FI_DEV_GPU_UTIL_SAMPLES,
        dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES,
        dcgm_fields.DCGM_FI_DEV_GRAPHICS_PIDS,
        dcgm_fields.DCGM_FI_DEV_COMPUTE_PIDS,
    )
    if fieldId in variableSizeFieldIds:
        return True

    fieldMeta = dcgm_fields.DcgmFieldGetById(fieldId)
    if fieldMeta is None:
        # Unknown field ID: there is no metadata to inspect.
        return False

    return fieldMeta.fieldType == dcgm_fields.DCGM_FT_BINARY
def _GetLatestGpuErrorSamples(self):
    '''
    Fetch the newest samples from self._gpuWatcher and print any errors.

    Every non-blank value greater than zero is reported as an error on its
    own line, tagged with the current local time, the entity id, the field's
    tag and the value. If nothing is reported, a single "No GPU errors."
    line is printed instead. The watcher's stored values are emptied after
    they have been scanned.

    Returns nothing; output goes to stdout.
    '''
    numErrors = 0
    nowStr = time.strftime("%m/%d/%Y %H:%M:%S")

    self._gpuWatcher.GetMore()
    watchedValues = self._gpuWatcher.values

    for entityGroupId in watchedValues:
        groupValues = watchedValues[entityGroupId]
        for entityId in groupValues:
            entityValues = groupValues[entityId]
            for fieldId in entityValues:
                for value in entityValues[fieldId].values:
                    if value.isBlank or not value.value > 0:
                        continue

                    fieldMeta = dcgm_fields.DcgmFieldGetById(fieldId)
                    # Fall back to the numeric id if the metadata lookup
                    # fails, so the error itself is still reported.
                    fieldTag = fieldMeta.tag if fieldMeta is not None else fieldId
                    # Single-argument print(...) produces the same output
                    # whether print is a statement or a function.
                    print("%s: Got error for GPU %d, field Id %s, value %d" % (nowStr, entityId, fieldTag, int(value.value)))
                    numErrors += 1

    self._gpuWatcher.EmptyValues()

    if numErrors == 0:
        print("%s: No GPU errors." % nowStr)
# See the License for the specific language governing permissions and
# limitations under the License.
import pydcgm
import dcgm_fields
import dcgm_agent_internal
import dcgm_structs
import time

# Create a DCGM handle for 127.0.0.1, then fetch its system object and
# that system's default group.
dcgmHandle = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
dcgmSystem = dcgmHandle.GetSystem()
dcgmGroup = dcgmSystem.GetDefaultGroup()

#Discover which fieldIds are valid
# Probe every id in [1, DCGM_FI_MAX_FIELDS); ids without metadata are skipped.
g_fieldTags = {}
for fieldId in range(1, dcgm_fields.DCGM_FI_MAX_FIELDS):
    fieldMeta = dcgm_fields.DcgmFieldGetById(fieldId)
    if fieldMeta is not None:
        g_fieldTags[fieldId] = fieldMeta.tag

#print("Found field tags: " + str(g_fieldTags))

# Valid field ids in ascending order, and the GPU ids of the default group.
fieldIds = sorted(g_fieldTags)
gpuIds = dcgmGroup.GetGpuIds()

# Counters start at zero; presumably updated by code later in the script.
totalSampleCount, cycleCount = 0, 0