示例#1
0
文件: DcgmGroup.py 项目: omertuc/DCGM
    def Set(self, config):
        """Apply ``config`` to this group via ``dcgm_agent.dcgmConfigSet``.

        A fresh ``pydcgm.DcgmStatus`` is passed to the call so that errors
        recorded on it can be raised in preference to the generic return
        code.

        Raises:
            Whatever ``status.ThrowExceptionOnErrors()`` raises when the
            status object holds errors; otherwise the ``DCGMError`` raised
            by ``dcgmConfigSet`` itself, or the exception produced by
            ``dcgm_structs._dcgmCheckReturn`` for a non-OK return code.
        """
        status = pydcgm.DcgmStatus()
        ret = dcgm_structs.DCGM_ST_OK
        pendingError = None

        try:
            ret = dcgm_agent.dcgmConfigSet(self._dcgmHandle.handle,
                                           self._groupId, config,
                                           status.handle)
        except dcgm_structs.DCGMError as e:
            # Keep the generic failure instead of dropping it: ``ret`` is
            # still DCGM_ST_OK at this point, so without this the error
            # would vanish whenever the status object holds no errors.
            pendingError = e

        #Throw specific errors before return error
        status.ThrowExceptionOnErrors()
        #Throw an appropriate exception on error
        if pendingError is not None:
            raise pendingError
        dcgm_structs._dcgmCheckReturn(ret)
def vtDcgmActionValidate_v2(dcgm_handle, runDiagInfo, versionTest):
    """Invoke ``dcgmActionValidate_v2`` with both struct versions set to ``versionTest``.

    The incoming ``runDiagInfo`` argument is not used: it is replaced by a
    newly built ``c_dcgmRunDiag_v7``. The natural version of each struct is
    logged at debug level before being overwritten with ``versionTest``.

    Returns the ``c_dcgmDiagResponse_v7`` passed to the call; the return
    code is checked by ``dcgm_structs._dcgmCheckReturn``.
    """
    diag_response = dcgm_structs.c_dcgmDiagResponse_v7()
    diag_response.version = dcgm_structs.make_dcgm_version(diag_response, 7)
    logger.debug("Structure version: %d" % diag_response.version)

    # NOTE(review): the parameter is shadowed here -- presumably deliberate
    # for a version test; confirm against callers.
    runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
    runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
    logger.debug("Structure version: %d" % runDiagInfo.version)

    # Overwrite both versions with the value under test.
    runDiagInfo.version = versionTest
    diag_response.version = versionTest

    validate_fn = dcgmFP("dcgmActionValidate_v2")
    status_code = validate_fn(dcgm_handle, byref(runDiagInfo),
                              byref(diag_response))
    dcgm_structs._dcgmCheckReturn(status_code)
    return diag_response
示例#3
0
文件: dcgm_agent.py 项目: NVIDIA/DCGM
def dcgmProfWatchFields(dcgmHandle, fieldIds, groupId, updateFreq, maxKeepAge,
                        maxKeepSamples):
    """Fill a ``c_dcgmProfWatchFields_v1`` message and pass it to ``dcgmProfWatchFields``.

    The message carries the group id, update frequency, retention settings
    and the given field ids (copied element by element into
    ``msg.fieldIds``).

    Returns the populated message struct after the return code has been
    checked by ``dcgm_structs._dcgmCheckReturn``.
    """
    watch_msg = dcgm_structs.c_dcgmProfWatchFields_v1()
    watch_msg.version = dcgm_structs.dcgmProfWatchFields_version1
    watch_msg.groupId = groupId
    watch_msg.updateFreq = updateFreq
    watch_msg.maxKeepAge = maxKeepAge
    watch_msg.maxKeepSamples = maxKeepSamples
    watch_msg.numFieldIds = c_uint32(len(fieldIds))

    # Copy the requested field ids into the struct's array, slot by slot.
    for slot, field_id in enumerate(fieldIds):
        watch_msg.fieldIds[slot] = field_id

    watch_fn = dcgmFP("dcgmProfWatchFields")
    status_code = watch_fn(dcgmHandle, byref(watch_msg))
    dcgm_structs._dcgmCheckReturn(status_code)
    return watch_msg
示例#4
0
def vtDcgmVgpuConfigGet(dcgm_handle, group_id, reqCfgType, count,
                        status_handle, versionTest):
    """Call the vGPU config entry point with every element's version set to ``versionTest``.

    Builds an array of ``count`` ``c_dcgmDeviceVgpuConfig_v1`` structs,
    logs the natural struct version at debug level, stamps ``versionTest``
    on each array element and invokes the library function.

    ``reqCfgType`` is accepted but not used by this function.

    Returns:
        A list with the first ``count`` array elements, after the return
        code has been checked by ``dcgm_structs._dcgmCheckReturn``.
    """
    # NOTE(review): this "Get" wrapper resolves "dcgmVgpuConfigSet" --
    # possibly a copy/paste slip; confirm the intended symbol and its
    # argument list before changing it.
    fn = dcgm_structs._dcgmGetFunctionPointer("dcgmVgpuConfigSet")

    vgpu_config_values_array = count * dcgm_structs.c_dcgmDeviceVgpuConfig_v1
    c_config_values = vgpu_config_values_array()

    # Scratch instance used only to log the struct's natural version.
    vgpuConfig = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpuConfig.version = dcgm_structs.make_dcgm_version(vgpuConfig, 1)
    logger.debug("Structure version: %d" % vgpuConfig.version)

    for index in range(count):
        c_config_values[index].version = versionTest

    ret = fn(dcgm_handle, group_id, c_config_values, status_handle)
    dcgm_structs._dcgmCheckReturn(ret)
    # ``map(None, seq)`` only works on Python 2 (where it yields a list);
    # ``list(...)`` gives the same result there and also works on Python 3.
    return list(c_config_values[0:count])
示例#5
0
文件: dcgm_agent.py 项目: NVIDIA/DCGM
def dcgmGetDeviceAttributes(
        dcgm_handle,
        gpuId,
        version=dcgm_structs.dcgmDeviceAttributes_version3):
    """Fetch device attributes for ``gpuId`` into a struct matching ``version``.

    Supported versions are ``dcgmDeviceAttributes_version3`` (default) and
    ``dcgmDeviceAttributes_version2``; any other value is reported through
    ``dcgm_structs._dcgmCheckReturn(DCGM_ST_VER_MISMATCH)``.

    Returns the populated attributes struct once the return code has been
    checked by ``dcgm_structs._dcgmCheckReturn``.
    """
    get_attributes = dcgmFP("dcgmGetDeviceAttributes")

    # Choose the struct type that corresponds to the requested version.
    if version == dcgm_structs.dcgmDeviceAttributes_version3:
        attrs = dcgm_structs.c_dcgmDeviceAttributes_v3()
        attrs.version = dcgm_structs.dcgmDeviceAttributes_version3
    elif version == dcgm_structs.dcgmDeviceAttributes_version2:
        attrs = dcgm_structs.c_dcgmDeviceAttributes_v2()
        attrs.version = dcgm_structs.dcgmDeviceAttributes_version2
    else:
        dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH)

    status_code = get_attributes(dcgm_handle, c_int(gpuId), byref(attrs))
    dcgm_structs._dcgmCheckReturn(status_code)
    return attrs
示例#6
0
def vtDcgmIntrospectGetFieldsMemoryUsage(dcgm_handle,
                                         introspectContext,
                                         versionTest,
                                         waitIfNoData=True):
    """Call ``dcgmIntrospectGetFieldsMemoryUsage`` with struct versions set to ``versionTest``.

    The incoming ``introspectContext`` argument is not used: it is replaced
    by a newly built ``c_dcgmIntrospectContext_v1``. The natural version of
    the memory-info struct is logged at debug level before being
    overwritten.

    Returns the ``c_dcgmIntrospectFullMemory_v1`` passed to the call; the
    return code is checked by ``dcgm_structs._dcgmCheckReturn``.
    """
    usage_fn = dcgmFP("dcgmIntrospectGetFieldsMemoryUsage")

    mem_usage = dcgm_structs.c_dcgmIntrospectFullMemory_v1()
    mem_usage.version = dcgm_structs.make_dcgm_version(mem_usage, 1)
    logger.debug("Structure version: %d" % mem_usage.version)

    # Replace the natural version with the value under test.
    mem_usage.version = versionTest

    # NOTE(review): the parameter is shadowed here -- presumably deliberate
    # for a version test; confirm against callers.
    introspectContext = dcgm_structs.c_dcgmIntrospectContext_v1()
    introspectContext.version = versionTest

    status_code = usage_fn(dcgm_handle, byref(introspectContext),
                           byref(mem_usage), waitIfNoData)
    dcgm_structs._dcgmCheckReturn(status_code)
    return mem_usage
示例#7
0
def vtDcgmPolicyGet(dcgm_handle, group_id, count, status_handle, versionTest):
    """Call ``dcgmPolicyGet`` with every policy element's version set to ``versionTest``.

    Allocates an array of ``count`` ``c_dcgmPolicy_v1`` structs, logs the
    natural versions of the policy and policy-callback structs at debug
    level, then stamps ``versionTest`` on each array element before the
    call.

    Returns the first ``count`` array elements after the return code has
    been checked by ``dcgm_structs._dcgmCheckReturn``.
    """
    policy_get = dcgmFP("dcgmPolicyGet")
    PolicyArrayType = count * dcgm_structs.c_dcgmPolicy_v1

    policies = PolicyArrayType()

    # Scratch instances used only to log each struct's natural version.
    sample_policy = dcgm_structs.c_dcgmPolicy_v1()
    sample_policy.version = dcgm_structs.make_dcgm_version(sample_policy, 1)
    logger.debug("Structure version: %d" % sample_policy.version)

    sample_callback = dcgm_structs.c_dcgmPolicyCallbackResponse_v1()
    sample_callback.version = dcgm_structs.make_dcgm_version(sample_callback, 1)
    logger.debug("Structure version: %d" % sample_callback.version)

    for slot in range(count):
        policies[slot].version = versionTest

    status_code = policy_get(dcgm_handle, group_id, count, policies,
                             status_handle)
    dcgm_structs._dcgmCheckReturn(status_code)
    return policies[:count]
示例#8
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
    def Delete(self):
        """Release the group's helper objects and destroy the group if it is not static.

        Each helper attribute is deleted and then re-bound to ``None``.
        When a group id is set and ``_IsGroupIdStatic()`` is false, the
        group is destroyed through ``dcgm_agent.dcgmGroupDestroy`` and the
        return code is checked. ``_groupId`` is ``None`` afterwards.
        """
        helper_names = ("config", "samples", "health", "policy",
                        "discovery", "stats", "action", "profiling")
        for helper_name in helper_names:
            # Drop the current object first, then leave a None placeholder.
            delattr(self, helper_name)
            setattr(self, helper_name, None)

        #Delete the group we created if we're not using the special all-GPU group
        owns_group = self._groupId is not None and not self._IsGroupIdStatic()
        if owns_group:
            status_code = dcgm_agent.dcgmGroupDestroy(self._dcgmHandle.handle,
                                                      self._groupId)
            dcgm_structs._dcgmCheckReturn(status_code)

        self._groupId = None
示例#9
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
 def StartJobStats(self, jobId):
     """Forward to ``dcgm_agent.dcgmJobStartStats`` for this group and ``jobId``.

     The return code is checked by ``dcgm_structs._dcgmCheckReturn``.
     """
     status_code = dcgm_agent.dcgmJobStartStats(
         self._dcgmHandle.handle,
         self._groupId,
         jobId,
     )
     dcgm_structs._dcgmCheckReturn(status_code)
示例#10
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
 def Set(self, systems, updateInterval = None, maxKeepAge = None):
     """Set the health ``systems`` for this group.

     Uses ``dcgm_agent.dcgmHealthSet_v2`` only when both ``updateInterval``
     and ``maxKeepAge`` are given; otherwise falls back to
     ``dcgm_agent.dcgmHealthSet``. The return code is checked by
     ``dcgm_structs._dcgmCheckReturn``.
     """
     has_watch_settings = updateInterval is not None and maxKeepAge is not None
     if has_watch_settings:
         status_code = dcgm_agent.dcgmHealthSet_v2(
             self._dcgmHandle.handle, self._groupId, systems,
             updateInterval, maxKeepAge)
     else:
         status_code = dcgm_agent.dcgmHealthSet(
             self._dcgmHandle.handle, self._groupId, systems)
     dcgm_structs._dcgmCheckReturn(status_code)
示例#11
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
 def WatchJobFields(self, updateFreq, maxKeepAge, maxKeepSamples):
     """Forward to ``dcgm_agent.dcgmWatchJobFields`` for this group.

     ``updateFreq``, ``maxKeepAge`` and ``maxKeepSamples`` are passed
     through unchanged; the return code is checked by
     ``dcgm_structs._dcgmCheckReturn``.
     """
     status_code = dcgm_agent.dcgmWatchJobFields(
         self._dcgmHandle.handle,
         self._groupId,
         updateFreq,
         maxKeepAge,
         maxKeepSamples,
     )
     dcgm_structs._dcgmCheckReturn(status_code)
示例#12
0
def dcgmProfResume(dcgmHandle):
    """Call the library's ``dcgmProfResume`` with ``dcgmHandle``.

    Returns the return code after it has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    resume_fn = dcgmFP("dcgmProfResume")
    status_code = resume_fn(dcgmHandle)
    dcgm_structs._dcgmCheckReturn(status_code)
    return status_code
示例#13
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
 def UnwatchFields(self, fieldGroup):
     """Forward to ``dcgm_agent.dcgmUnwatchFields`` for this group.

     The field group is identified by ``fieldGroup.fieldGroupId``; the
     return code is checked by ``dcgm_structs._dcgmCheckReturn``.
     """
     status_code = dcgm_agent.dcgmUnwatchFields(
         self._dcgmHandle.handle,
         self._groupId,
         fieldGroup.fieldGroupId,
     )
     dcgm_structs._dcgmCheckReturn(status_code)
示例#14
0
def dcgmWatchJobFields(dcgm_handle, groupId, updateFreq, maxKeepAge, maxKeepSamples):
    """Call the library's ``dcgmWatchJobFields`` for ``groupId``.

    ``updateFreq`` is passed as ``c_int64``, ``maxKeepAge`` as ``c_double``
    and ``maxKeepSamples`` as ``c_int32``.

    Returns the return code after it has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    watch_fn = dcgmFP("dcgmWatchJobFields")
    status_code = watch_fn(
        dcgm_handle,
        groupId,
        c_int64(updateFreq),
        c_double(maxKeepAge),
        c_int32(maxKeepSamples),
    )
    dcgm_structs._dcgmCheckReturn(status_code)
    return status_code
示例#15
0
def dcgmDisconnect(dcgm_handle):
    """Call the library's ``dcgmDisconnect`` with ``dcgm_handle``.

    Returns the return code after it has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    disconnect_fn = dcgmFP("dcgmDisconnect")
    status_code = disconnect_fn(dcgm_handle)
    dcgm_structs._dcgmCheckReturn(status_code)
    return status_code
示例#16
0
def dcgmGetDeviceTopology(dcgm_handle, gpuId):
    """Fill a ``c_dcgmDeviceTopology_v1`` for ``gpuId`` via ``dcgmGetDeviceTopology``.

    Returns the struct after the return code has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    topology = dcgm_structs.c_dcgmDeviceTopology_v1()
    topology_fn = dcgmFP("dcgmGetDeviceTopology")
    status_code = topology_fn(dcgm_handle, gpuId, byref(topology))
    dcgm_structs._dcgmCheckReturn(status_code)
    return topology
示例#17
0
def dcgmSelectGpusByTopology(dcgmHandle, inputGpuIds, numGpus, hintFlags):
    """Call the library's ``dcgmSelectGpusByTopology`` and return its output value.

    ``inputGpuIds`` and ``hintFlags`` are passed as ``c_uint64`` and
    ``numGpus`` as ``c_uint32``. The result is written into a ``c_int64``
    that is returned as a ctypes object (not unwrapped to a Python int).
    """
    select_fn = dcgmFP("dcgmSelectGpusByTopology")
    selected = c_int64()
    status_code = select_fn(
        dcgmHandle,
        c_uint64(inputGpuIds),
        c_uint32(numGpus),
        byref(selected),
        c_uint64(hintFlags),
    )
    dcgm_structs._dcgmCheckReturn(status_code)
    return selected
示例#18
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
    def AddGpu(self, gpuId):
        """Add ``gpuId`` to this group via ``dcgm_agent.dcgmGroupAddDevice``.

        Raises ``pydcgm.DcgmException`` when ``_IsGroupIdStatic()`` is
        true; otherwise the return code is checked by
        ``dcgm_structs._dcgmCheckReturn``.
        """
        group_is_static = self._IsGroupIdStatic()
        if group_is_static:
            raise pydcgm.DcgmException("Can't add a GPU to a static group")

        status_code = dcgm_agent.dcgmGroupAddDevice(
            self._dcgmHandle.handle,
            self._groupId,
            gpuId,
        )
        dcgm_structs._dcgmCheckReturn(status_code)
示例#19
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
    def RemoveGpu(self, gpuId):
        """Remove ``gpuId`` from this group via ``dcgm_agent.dcgmGroupRemoveDevice``.

        Raises ``pydcgm.DcgmException`` when ``_IsGroupIdStatic()`` is
        true; otherwise the return code is checked by
        ``dcgm_structs._dcgmCheckReturn``.
        """
        group_is_static = self._IsGroupIdStatic()
        if group_is_static:
            raise pydcgm.DcgmException("Can't remove a GPU from a static group")

        status_code = dcgm_agent.dcgmGroupRemoveDevice(
            self._dcgmHandle.handle,
            self._groupId,
            gpuId,
        )
        dcgm_structs._dcgmCheckReturn(status_code)
示例#20
0
def dcgmJobRemove(dcgm_handle, jobid):
    """Call the library's ``dcgmJobRemove`` for ``jobid``.

    Returns the return code after it has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    remove_fn = dcgmFP("dcgmJobRemove")
    status_code = remove_fn(dcgm_handle, jobid)
    dcgm_structs._dcgmCheckReturn(status_code)
    return status_code
示例#21
0
def dcgmConnect(ip_address):
    """Call the library's ``dcgmConnect`` for ``ip_address`` and return the handle.

    The handle is a ``c_void_p`` filled in by the call; the return code is
    checked by ``dcgm_structs._dcgmCheckReturn`` before it is returned.
    """
    handle = c_void_p()
    connect_fn = dcgmFP("dcgmConnect")
    status_code = connect_fn(ip_address, byref(handle))
    dcgm_structs._dcgmCheckReturn(status_code)
    return handle
示例#22
0
def dcgmJobStopStats(dcgm_handle, jobid):
    """Call the library's ``dcgmJobStopStats`` for ``jobid``.

    Returns the return code after it has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    stop_fn = dcgmFP("dcgmJobStopStats")
    status_code = stop_fn(dcgm_handle, jobid)
    dcgm_structs._dcgmCheckReturn(status_code)
    return status_code
示例#23
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
 def StopJobStats(self, jobId):
     """Forward to ``dcgm_agent.dcgmJobStopStats`` for ``jobId``.

     Only the connection handle and the job id are passed (no group id);
     the return code is checked by ``dcgm_structs._dcgmCheckReturn``.
     """
     status_code = dcgm_agent.dcgmJobStopStats(
         self._dcgmHandle.handle,
         jobId,
     )
     dcgm_structs._dcgmCheckReturn(status_code)
示例#24
0
def dcgmModuleBlacklist(dcgmHandle, moduleId):
    """Call the library's ``dcgmModuleBlacklist`` with ``moduleId`` as ``c_uint32``.

    Returns the return code after it has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    blacklist_fn = dcgmFP("dcgmModuleBlacklist")
    status_code = blacklist_fn(dcgmHandle, c_uint32(moduleId))
    dcgm_structs._dcgmCheckReturn(status_code)
    return status_code
示例#25
0
def dcgmJobRemoveAll(dcgm_handle):
    """Call the library's ``dcgmJobRemoveAll`` with ``dcgm_handle``.

    Returns the return code after it has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    remove_all_fn = dcgmFP("dcgmJobRemoveAll")
    status_code = remove_all_fn(dcgm_handle)
    dcgm_structs._dcgmCheckReturn(status_code)
    return status_code
示例#26
0
def dcgmGetGroupTopology(dcgm_handle, groupId):
    """Fill a ``c_dcgmGroupTopology_v1`` for ``groupId`` via ``dcgmGetGroupTopology``.

    Returns the struct after the return code has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    topology = dcgm_structs.c_dcgmGroupTopology_v1()
    topology_fn = dcgmFP("dcgmGetGroupTopology")
    status_code = topology_fn(dcgm_handle, groupId, byref(topology))
    dcgm_structs._dcgmCheckReturn(status_code)
    return topology
示例#27
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
    def AddEntity(self, entityGroupId, entityId):
        """Add an entity to this group via ``dcgm_agent.dcgmGroupAddEntity``.

        Raises ``pydcgm.DcgmException`` when ``_IsGroupIdStatic()`` is
        true; otherwise the return code is checked by
        ``dcgm_structs._dcgmCheckReturn``.
        """
        group_is_static = self._IsGroupIdStatic()
        if group_is_static:
            raise pydcgm.DcgmException("Can't add an entity to a static group")

        status_code = dcgm_agent.dcgmGroupAddEntity(
            self._dcgmHandle.handle,
            self._groupId,
            entityGroupId,
            entityId,
        )
        dcgm_structs._dcgmCheckReturn(status_code)
示例#28
0
def dcgmIntrospectToggleState(dcgm_handle, enabledState):
    """Call the library's ``dcgmIntrospectToggleState`` with ``enabledState``.

    Returns the return code after it has been checked by
    ``dcgm_structs._dcgmCheckReturn``.
    """
    toggle_fn = dcgmFP("dcgmIntrospectToggleState")
    status_code = toggle_fn(dcgm_handle, enabledState)
    dcgm_structs._dcgmCheckReturn(status_code)
    return status_code
示例#29
0
文件: DcgmGroup.py 项目: NVIDIA/DCGM
    def RemoveEntity(self, entityGroupId, entityId):
        """Remove an entity from this group via ``dcgm_agent.dcgmGroupRemoveEntity``.

        Raises ``pydcgm.DcgmException`` when ``_IsGroupIdStatic()`` is
        true; otherwise the return code is checked by
        ``dcgm_structs._dcgmCheckReturn``.
        """
        group_is_static = self._IsGroupIdStatic()
        if group_is_static:
            raise pydcgm.DcgmException("Can't remove an entity from a static group")

        status_code = dcgm_agent.dcgmGroupRemoveEntity(
            self._dcgmHandle.handle,
            self._groupId,
            entityGroupId,
            entityId,
        )
        dcgm_structs._dcgmCheckReturn(status_code)
示例#30
0
def dcgmIntrospectUpdateAll(dcgmHandle, waitForUpdate):
    """Call the library's ``dcgmIntrospectUpdateAll`` with ``waitForUpdate`` as ``c_int``.

    The return code is checked by ``dcgm_structs._dcgmCheckReturn``.
    """
    fn = dcgmFP("dcgmIntrospectUpdateAll")
    ret = fn(dcgmHandle, c_int(waitForUpdate))
    dcgm_structs._dcgmCheckReturn(ret)