def Set(self, config):
    """
    Apply ``config`` to this group through dcgm_agent.dcgmConfigSet.

    Errors are surfaced in order of specificity:
      1. anything recorded in the DcgmStatus object
         (status.ThrowExceptionOnErrors()),
      2. a DCGMError raised by dcgmConfigSet itself,
      3. a non-OK return code, via dcgm_structs._dcgmCheckReturn.

    Previously a DCGMError raised by dcgmConfigSet was swallowed, so the call
    appeared to succeed whenever the status object held no specific errors.
    """
    status = pydcgm.DcgmStatus()
    ret = dcgm_structs.DCGM_ST_OK
    pendingError = None

    try:
        ret = dcgm_agent.dcgmConfigSet(self._dcgmHandle.handle, self._groupId, config, status.handle)
    except dcgm_structs.DCGMError as e:
        #Hold on to the failure so the more specific status errors get the
        #first chance to be raised. A separate name is used because Python 3
        #unbinds "e" when the except block ends.
        pendingError = e

    #Throw specific errors before return error
    status.ThrowExceptionOnErrors()

    #No specific error was recorded: re-raise the original failure instead of
    #dropping it.
    if pendingError is not None:
        raise pendingError

    #Throw an appropriate exception on error
    dcgm_structs._dcgmCheckReturn(ret)
def vtDcgmActionValidate_v2(dcgm_handle, runDiagInfo, versionTest):
    """
    Version-test helper around the "dcgmActionValidate_v2" entry point.

    Builds a v7 diag response and a v7 run-diag struct, logs their natural
    versions, then overwrites both version fields with ``versionTest`` before
    making the call. Returns the response struct after the return code has
    been passed to dcgm_structs._dcgmCheckReturn.

    NOTE(review): the ``runDiagInfo`` argument is never read; a freshly
    constructed c_dcgmRunDiag_v7 is sent instead - confirm this is intended.
    """
    diagResponse = dcgm_structs.c_dcgmDiagResponse_v7()
    diagResponse.version = dcgm_structs.make_dcgm_version(diagResponse, 7)
    logger.debug("Structure version: %d" % diagResponse.version)

    #A new struct replaces whatever the caller handed in.
    freshRunDiag = dcgm_structs.c_dcgmRunDiag_v7()
    freshRunDiag.version = dcgm_structs.dcgmRunDiag_version7
    logger.debug("Structure version: %d" % freshRunDiag.version)

    #Force the version under test onto both structures.
    freshRunDiag.version = versionTest
    diagResponse.version = versionTest

    validate = dcgmFP("dcgmActionValidate_v2")
    rc = validate(dcgm_handle, byref(freshRunDiag), byref(diagResponse))
    dcgm_structs._dcgmCheckReturn(rc)
    return diagResponse
def dcgmProfWatchFields(dcgmHandle, fieldIds, groupId, updateFreq, maxKeepAge, maxKeepSamples):
    """
    Fill a c_dcgmProfWatchFields_v1 request and send it to the
    "dcgmProfWatchFields" entry point.

    fieldIds is copied element by element into the request's fieldIds array
    and its length is stored in numFieldIds. The return code is passed to
    dcgm_structs._dcgmCheckReturn and the request struct is returned.
    """
    request = dcgm_structs.c_dcgmProfWatchFields_v1()
    request.version = dcgm_structs.dcgmProfWatchFields_version1
    request.groupId = groupId
    request.updateFreq = updateFreq
    request.maxKeepAge = maxKeepAge
    request.maxKeepSamples = maxKeepSamples
    request.numFieldIds = c_uint32(len(fieldIds))

    #Copy the requested field IDs into the struct's array slot by slot.
    slot = 0
    for fieldId in fieldIds:
        request.fieldIds[slot] = fieldId
        slot += 1

    watchFields = dcgmFP("dcgmProfWatchFields")
    rc = watchFields(dcgmHandle, byref(request))
    dcgm_structs._dcgmCheckReturn(rc)
    return request
def vtDcgmVgpuConfigGet(dcgm_handle, group_id, reqCfgType, count, status_handle, versionTest):
    """
    Version-test helper: call the vGPU config entry point with ``count``
    c_dcgmDeviceVgpuConfig_v1 structs whose version fields are all set to
    ``versionTest``.

    The return code is passed to dcgm_structs._dcgmCheckReturn. On success the
    first ``count`` structs are returned as a list.

    NOTE(review): this "Get" helper resolves the "dcgmVgpuConfigSet" symbol
    and never uses ``reqCfgType`` - confirm that is intentional.
    """
    fn = dcgm_structs._dcgmGetFunctionPointer("dcgmVgpuConfigSet")

    vgpu_config_values_array = count * dcgm_structs.c_dcgmDeviceVgpuConfig_v1
    c_config_values = vgpu_config_values_array()

    #Standalone struct built only to log the natural structure version.
    vgpuConfig = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpuConfig.version = dcgm_structs.make_dcgm_version(vgpuConfig, 1)
    logger.debug("Structure version: %d" % vgpuConfig.version)

    for index in range(0, count):
        c_config_values[index].version = versionTest

    ret = fn(dcgm_handle, group_id, c_config_values, status_handle)
    dcgm_structs._dcgmCheckReturn(ret)

    #map(None, seq) is a Python 2-only idiom; on Python 3 it produces a lazy
    #map object that raises TypeError when iterated. list() gives the same
    #result on both interpreters.
    return list(c_config_values[0:count])
def dcgmGetDeviceAttributes(
        dcgm_handle, gpuId,
        version=dcgm_structs.dcgmDeviceAttributes_version3):
    """
    Query the "dcgmGetDeviceAttributes" entry point for ``gpuId``.

    ``version`` selects which attributes struct is allocated: version 3
    (the default) or version 2. Any other value passes DCGM_ST_VER_MISMATCH
    to dcgm_structs._dcgmCheckReturn (expected to raise).

    Returns the populated attributes struct once the call's return code has
    been checked.
    """
    getAttributes = dcgmFP("dcgmGetDeviceAttributes")

    wantsV3 = version == dcgm_structs.dcgmDeviceAttributes_version3
    if wantsV3:
        device_values = dcgm_structs.c_dcgmDeviceAttributes_v3()
        device_values.version = dcgm_structs.dcgmDeviceAttributes_version3
    elif version == dcgm_structs.dcgmDeviceAttributes_version2:
        device_values = dcgm_structs.c_dcgmDeviceAttributes_v2()
        device_values.version = dcgm_structs.dcgmDeviceAttributes_version2
    else:
        #Unsupported struct version: report it through the usual checker.
        dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH)

    rc = getAttributes(dcgm_handle, c_int(gpuId), byref(device_values))
    dcgm_structs._dcgmCheckReturn(rc)
    return device_values
def vtDcgmIntrospectGetFieldsMemoryUsage(dcgm_handle, introspectContext, versionTest, waitIfNoData=True):
    """
    Version-test helper around "dcgmIntrospectGetFieldsMemoryUsage".

    Allocates a c_dcgmIntrospectFullMemory_v1 result, logs its natural
    version, then sets both the result's and the context's version fields to
    ``versionTest`` before calling. Returns the result struct after the
    return code has been passed to dcgm_structs._dcgmCheckReturn.

    NOTE(review): the ``introspectContext`` argument is never read; a new
    c_dcgmIntrospectContext_v1 is sent instead - confirm this is intended.
    """
    getUsage = dcgmFP("dcgmIntrospectGetFieldsMemoryUsage")

    fullMemory = dcgm_structs.c_dcgmIntrospectFullMemory_v1()
    fullMemory.version = dcgm_structs.make_dcgm_version(fullMemory, 1)
    logger.debug("Structure version: %d" % fullMemory.version)
    fullMemory.version = versionTest

    #A new context struct replaces whatever the caller handed in.
    freshContext = dcgm_structs.c_dcgmIntrospectContext_v1()
    freshContext.version = versionTest

    rc = getUsage(dcgm_handle, byref(freshContext), byref(fullMemory), waitIfNoData)
    dcgm_structs._dcgmCheckReturn(rc)
    return fullMemory
def vtDcgmPolicyGet(dcgm_handle, group_id, count, status_handle, versionTest):
    """
    Version-test helper around the "dcgmPolicyGet" entry point.

    Allocates ``count`` c_dcgmPolicy_v1 structs, sets every version field to
    ``versionTest`` and makes the call. Returns the first ``count`` structs
    (a slice of the ctypes array) after the return code has been passed to
    dcgm_structs._dcgmCheckReturn.
    """
    policyGet = dcgmFP("dcgmPolicyGet")

    PolicyArrayType = count * dcgm_structs.c_dcgmPolicy_v1
    policies = PolicyArrayType()

    #The two standalone structs below exist only so their natural versions
    #can be logged; they are not passed to the call.
    samplePolicy = dcgm_structs.c_dcgmPolicy_v1()
    samplePolicy.version = dcgm_structs.make_dcgm_version(samplePolicy, 1)
    logger.debug("Structure version: %d" % samplePolicy.version)

    sampleCallback = dcgm_structs.c_dcgmPolicyCallbackResponse_v1()
    sampleCallback.version = dcgm_structs.make_dcgm_version(sampleCallback, 1)
    logger.debug("Structure version: %d" % sampleCallback.version)

    position = 0
    while position < count:
        policies[position].version = versionTest
        position += 1

    rc = policyGet(dcgm_handle, group_id, count, policies, status_handle)
    dcgm_structs._dcgmCheckReturn(rc)
    return policies[0:count]
def Delete(self):
    """
    Drop this group's helper objects and destroy the underlying group.

    Each helper attribute is deleted and then rebound to None, in a fixed
    order. If a group ID is set and self._IsGroupIdStatic() is false, the
    group is destroyed through dcgm_agent.dcgmGroupDestroy and the return code
    is passed to dcgm_structs._dcgmCheckReturn. The stored group ID is then
    cleared.
    """
    helperNames = ("config", "samples", "health", "policy",
                   "discovery", "stats", "action", "profiling")
    for helperName in helperNames:
        #Same as "del self.<name>" followed by "self.<name> = None".
        delattr(self, helperName)
        setattr(self, helperName, None)

    #Delete the group we created if we're not using the special all-GPU group
    ownsGroup = self._groupId is not None and not self._IsGroupIdStatic()
    if ownsGroup:
        rc = dcgm_agent.dcgmGroupDestroy(self._dcgmHandle.handle, self._groupId)
        dcgm_structs._dcgmCheckReturn(rc)

    #NOTE(review): the ID is cleared unconditionally here - confirm it was not
    #meant to be cleared only after a successful destroy.
    self._groupId = None
def StartJobStats(self, jobId):
    """
    Call dcgm_agent.dcgmJobStartStats for this group with ``jobId``.

    The return code is passed to dcgm_structs._dcgmCheckReturn; nothing is
    returned.
    """
    rc = dcgm_agent.dcgmJobStartStats(
        self._dcgmHandle.handle,
        self._groupId,
        jobId,
    )
    dcgm_structs._dcgmCheckReturn(rc)
def Set(self, systems, updateInterval = None, maxKeepAge = None):
    """
    Set health watches for ``systems`` on this group.

    When both ``updateInterval`` and ``maxKeepAge`` are supplied,
    dcgm_agent.dcgmHealthSet_v2 is called with them; if either one is None,
    dcgm_agent.dcgmHealthSet is called and both are ignored. The return code
    is passed to dcgm_structs._dcgmCheckReturn.
    """
    haveTiming = updateInterval is not None and maxKeepAge is not None
    if haveTiming:
        rc = dcgm_agent.dcgmHealthSet_v2(
            self._dcgmHandle.handle, self._groupId, systems,
            updateInterval, maxKeepAge)
    else:
        rc = dcgm_agent.dcgmHealthSet(
            self._dcgmHandle.handle, self._groupId, systems)

    dcgm_structs._dcgmCheckReturn(rc)
def WatchJobFields(self, updateFreq, maxKeepAge, maxKeepSamples):
    """
    Call dcgm_agent.dcgmWatchJobFields for this group with the given update
    frequency and retention limits.

    The return code is passed to dcgm_structs._dcgmCheckReturn; nothing is
    returned.
    """
    rc = dcgm_agent.dcgmWatchJobFields(
        self._dcgmHandle.handle,
        self._groupId,
        updateFreq,
        maxKeepAge,
        maxKeepSamples,
    )
    dcgm_structs._dcgmCheckReturn(rc)
def dcgmProfResume(dcgmHandle):
    """
    Call the "dcgmProfResume" entry point with ``dcgmHandle``.

    The return code is passed to dcgm_structs._dcgmCheckReturn and then
    returned to the caller.
    """
    rc = dcgmFP("dcgmProfResume")(dcgmHandle)
    dcgm_structs._dcgmCheckReturn(rc)
    return rc
def UnwatchFields(self, fieldGroup):
    """
    Call dcgm_agent.dcgmUnwatchFields for this group, using
    ``fieldGroup.fieldGroupId`` as the field group.

    The return code is passed to dcgm_structs._dcgmCheckReturn; nothing is
    returned.
    """
    rc = dcgm_agent.dcgmUnwatchFields(
        self._dcgmHandle.handle,
        self._groupId,
        fieldGroup.fieldGroupId,
    )
    dcgm_structs._dcgmCheckReturn(rc)
def dcgmWatchJobFields(dcgm_handle, groupId, updateFreq, maxKeepAge, maxKeepSamples):
    """
    Call the "dcgmWatchJobFields" entry point for ``groupId``.

    updateFreq, maxKeepAge and maxKeepSamples are converted to c_int64,
    c_double and c_int32 respectively before the call. The return code is
    passed to dcgm_structs._dcgmCheckReturn and then returned.
    """
    watchJobFields = dcgmFP("dcgmWatchJobFields")

    #Marshal the Python values into the C types the call is given.
    cUpdateFreq = c_int64(updateFreq)
    cMaxKeepAge = c_double(maxKeepAge)
    cMaxKeepSamples = c_int32(maxKeepSamples)

    rc = watchJobFields(dcgm_handle, groupId, cUpdateFreq, cMaxKeepAge, cMaxKeepSamples)
    dcgm_structs._dcgmCheckReturn(rc)
    return rc
def dcgmDisconnect(dcgm_handle):
    """
    Call the "dcgmDisconnect" entry point with ``dcgm_handle``.

    The return code is passed to dcgm_structs._dcgmCheckReturn and then
    returned to the caller.
    """
    rc = dcgmFP("dcgmDisconnect")(dcgm_handle)
    dcgm_structs._dcgmCheckReturn(rc)
    return rc
def dcgmGetDeviceTopology(dcgm_handle, gpuId):
    """
    Call the "dcgmGetDeviceTopology" entry point for ``gpuId``.

    A c_dcgmDeviceTopology_v1 struct is passed by reference to receive the
    result; it is returned once the return code has been passed to
    dcgm_structs._dcgmCheckReturn.
    """
    topology = dcgm_structs.c_dcgmDeviceTopology_v1()
    getTopology = dcgmFP("dcgmGetDeviceTopology")

    rc = getTopology(dcgm_handle, gpuId, byref(topology))
    dcgm_structs._dcgmCheckReturn(rc)
    return topology
def dcgmSelectGpusByTopology(dcgmHandle, inputGpuIds, numGpus, hintFlags):
    """
    Call the "dcgmSelectGpusByTopology" entry point.

    inputGpuIds and hintFlags are passed as c_uint64 and numGpus as c_uint32.
    The selection is written into a c_int64 passed by reference, and that
    ctypes object (not a plain int) is returned after the return code has
    been passed to dcgm_structs._dcgmCheckReturn.
    """
    selectGpus = dcgmFP("dcgmSelectGpusByTopology")
    selected = c_int64()

    cInputGpuIds = c_uint64(inputGpuIds)
    cNumGpus = c_uint32(numGpus)
    selectedRef = byref(selected)
    cHintFlags = c_uint64(hintFlags)

    rc = selectGpus(dcgmHandle, cInputGpuIds, cNumGpus, selectedRef, cHintFlags)
    dcgm_structs._dcgmCheckReturn(rc)
    return selected
def AddGpu(self, gpuId):
    """
    Add ``gpuId`` to this group via dcgm_agent.dcgmGroupAddDevice.

    Raises pydcgm.DcgmException when self._IsGroupIdStatic() is true.
    Otherwise the return code of the add call is passed to
    dcgm_structs._dcgmCheckReturn.
    """
    groupIsStatic = self._IsGroupIdStatic()
    if groupIsStatic:
        raise pydcgm.DcgmException("Can't add a GPU to a static group")

    rc = dcgm_agent.dcgmGroupAddDevice(
        self._dcgmHandle.handle,
        self._groupId,
        gpuId,
    )
    dcgm_structs._dcgmCheckReturn(rc)
def RemoveGpu(self, gpuId):
    """
    Remove ``gpuId`` from this group via dcgm_agent.dcgmGroupRemoveDevice.

    Raises pydcgm.DcgmException when self._IsGroupIdStatic() is true.
    Otherwise the return code of the remove call is passed to
    dcgm_structs._dcgmCheckReturn.
    """
    groupIsStatic = self._IsGroupIdStatic()
    if groupIsStatic:
        raise pydcgm.DcgmException("Can't remove a GPU from a static group")

    rc = dcgm_agent.dcgmGroupRemoveDevice(
        self._dcgmHandle.handle,
        self._groupId,
        gpuId,
    )
    dcgm_structs._dcgmCheckReturn(rc)
def dcgmJobRemove(dcgm_handle, jobid):
    """
    Call the "dcgmJobRemove" entry point for ``jobid``.

    The return code is passed to dcgm_structs._dcgmCheckReturn and then
    returned to the caller.
    """
    rc = dcgmFP("dcgmJobRemove")(dcgm_handle, jobid)
    dcgm_structs._dcgmCheckReturn(rc)
    return rc
def dcgmConnect(ip_address):
    """
    Call the "dcgmConnect" entry point with ``ip_address``.

    A c_void_p is passed by reference to receive the handle. It is returned
    once the return code has been passed to dcgm_structs._dcgmCheckReturn.
    """
    handle = c_void_p()
    connect = dcgmFP("dcgmConnect")

    rc = connect(ip_address, byref(handle))
    dcgm_structs._dcgmCheckReturn(rc)
    return handle
def dcgmJobStopStats(dcgm_handle, jobid):
    """
    Call the "dcgmJobStopStats" entry point for ``jobid``.

    The return code is passed to dcgm_structs._dcgmCheckReturn and then
    returned to the caller.
    """
    rc = dcgmFP("dcgmJobStopStats")(dcgm_handle, jobid)
    dcgm_structs._dcgmCheckReturn(rc)
    return rc
def StopJobStats(self, jobId):
    """
    Call dcgm_agent.dcgmJobStopStats with this object's handle and ``jobId``.

    The group ID is not part of this call. The return code is passed to
    dcgm_structs._dcgmCheckReturn; nothing is returned.
    """
    rc = dcgm_agent.dcgmJobStopStats(
        self._dcgmHandle.handle,
        jobId,
    )
    dcgm_structs._dcgmCheckReturn(rc)
def dcgmModuleBlacklist(dcgmHandle, moduleId):
    """
    Call the "dcgmModuleBlacklist" entry point, passing ``moduleId`` as a
    c_uint32.

    The return code is passed to dcgm_structs._dcgmCheckReturn and then
    returned to the caller.
    """
    blacklist = dcgmFP("dcgmModuleBlacklist")
    cModuleId = c_uint32(moduleId)

    rc = blacklist(dcgmHandle, cModuleId)
    dcgm_structs._dcgmCheckReturn(rc)
    return rc
def dcgmJobRemoveAll(dcgm_handle):
    """
    Call the "dcgmJobRemoveAll" entry point with ``dcgm_handle``.

    The return code is passed to dcgm_structs._dcgmCheckReturn and then
    returned to the caller.
    """
    rc = dcgmFP("dcgmJobRemoveAll")(dcgm_handle)
    dcgm_structs._dcgmCheckReturn(rc)
    return rc
def dcgmGetGroupTopology(dcgm_handle, groupId):
    """
    Call the "dcgmGetGroupTopology" entry point for ``groupId``.

    A c_dcgmGroupTopology_v1 struct is passed by reference to receive the
    result; it is returned once the return code has been passed to
    dcgm_structs._dcgmCheckReturn.
    """
    topology = dcgm_structs.c_dcgmGroupTopology_v1()
    getTopology = dcgmFP("dcgmGetGroupTopology")

    rc = getTopology(dcgm_handle, groupId, byref(topology))
    dcgm_structs._dcgmCheckReturn(rc)
    return topology
def AddEntity(self, entityGroupId, entityId):
    """
    Add the entity (``entityGroupId``, ``entityId``) to this group via
    dcgm_agent.dcgmGroupAddEntity.

    Raises pydcgm.DcgmException when self._IsGroupIdStatic() is true.
    Otherwise the return code of the add call is passed to
    dcgm_structs._dcgmCheckReturn.
    """
    groupIsStatic = self._IsGroupIdStatic()
    if groupIsStatic:
        raise pydcgm.DcgmException("Can't add an entity to a static group")

    rc = dcgm_agent.dcgmGroupAddEntity(
        self._dcgmHandle.handle,
        self._groupId,
        entityGroupId,
        entityId,
    )
    dcgm_structs._dcgmCheckReturn(rc)
def dcgmIntrospectToggleState(dcgm_handle, enabledState):
    """
    Call the "dcgmIntrospectToggleState" entry point with ``enabledState``.

    The return code is passed to dcgm_structs._dcgmCheckReturn and then
    returned to the caller.
    """
    rc = dcgmFP("dcgmIntrospectToggleState")(dcgm_handle, enabledState)
    dcgm_structs._dcgmCheckReturn(rc)
    return rc
def RemoveEntity(self, entityGroupId, entityId):
    """
    Remove the entity (``entityGroupId``, ``entityId``) from this group via
    dcgm_agent.dcgmGroupRemoveEntity.

    Raises pydcgm.DcgmException when self._IsGroupIdStatic() is true.
    Otherwise the return code of the remove call is passed to
    dcgm_structs._dcgmCheckReturn.
    """
    groupIsStatic = self._IsGroupIdStatic()
    if groupIsStatic:
        raise pydcgm.DcgmException("Can't remove an entity from a static group")

    rc = dcgm_agent.dcgmGroupRemoveEntity(
        self._dcgmHandle.handle,
        self._groupId,
        entityGroupId,
        entityId,
    )
    dcgm_structs._dcgmCheckReturn(rc)
def dcgmIntrospectUpdateAll(dcgmHandle, waitForUpdate):
    """
    Call the "dcgmIntrospectUpdateAll" entry point, passing
    ``waitForUpdate`` as a c_int.

    The return code is passed to dcgm_structs._dcgmCheckReturn. This wrapper
    does not return the code.
    """
    updateAll = dcgmFP("dcgmIntrospectUpdateAll")
    cWaitForUpdate = c_int(waitForUpdate)

    rc = updateAll(dcgmHandle, cWaitForUpdate)
    dcgm_structs._dcgmCheckReturn(rc)