def test_dcgm_policy_inject_retiredpages_standalone(handle, gpuIds):
    """
    Inject retired-page counts on the first GPU and check that the
    MAX_PAGES_RETIRED policy callback reports the injected DBE page count.

    handle -- DCGM handle, given to pydcgm.DcgmHandle and to the injection API
    gpuIds -- GPU ids placed in the policy group; gpuIds[0] receives the
              injected values
    """
    condition = dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED
    injectedPages = 10

    # Describe the policy under test.
    retiredPolicy = dcgm_structs.c_dcgmPolicy_v1()
    retiredPolicy.version = dcgm_structs.dcgmPolicy_version1
    retiredPolicy.condition = condition
    # NOTE(review): parms[2] appears to be the slot for this condition, with
    # tag 1 selecting the llval member -- confirm against dcgm_structs.
    retiredParm = retiredPolicy.parms[2]
    retiredParm.tag = 1
    retiredParm.val.llval = 5

    # Put every supplied GPU into one group and apply the policy to it.
    handleObj = pydcgm.DcgmHandle(handle)
    systemObj = handleObj.GetSystem()
    gpuGroup = systemObj.GetGroupWithGpuIds("test1", gpuIds)
    gpuGroup.policy.Set(retiredPolicy)

    # The callback built by create_c_callback is handed this queue; the test
    # later blocks on it to learn that the policy fired.
    events = Queue.Queue()
    onViolation = create_c_callback(events)
    gpuGroup.policy.Register(condition, finishCallback=onViolation)

    # Prepare one injection record and reuse it for both retired-page fields.
    injection = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    injection.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    injection.status = 0
    injection.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    # Timestamp is 60 seconds ahead of now, expressed in microseconds.
    injection.ts = int((time.time() + 60) * 1000000.0)
    injection.value.i64 = injectedPages

    # DBE first, then SBE; the original author notes the SBE value is needed
    # so that the health check code gets past its internal checks.
    for retiredFieldId in (dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
                           dcgm_fields.DCGM_FI_DEV_RETIRED_SBE):
        injection.fieldId = retiredFieldId
        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], injection)
        assert (ret == dcgm_structs.DCGM_ST_OK)

    # Block until the policy manager calls back, or fail on timeout.
    try:
        violation = events.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # The callback must describe the condition and page count we injected.
    assert condition == violation.condition, \
        "error callback was not for a retired pages, got: %s" % violation.condition
    reportedPages = violation.val.mpr.dbepages
    assert injectedPages == reportedPages, \
        'Expected %s errors but got %s' % (injectedPages, reportedPages)
def test_dcgm_policy_inject_nvlinkerror_standalone(handle, gpuIds):
    """
    Verifies that we can inject an NVLINK error and receive a callback.

    handle -- DCGM handle, given to pydcgm.DcgmHandle and to the injection API
    gpuIds -- GPU ids placed in the policy group; gpuIds[0] receives the
              injected value

    Fails with an AssertionError if the injection is rejected, if no callback
    arrives within POLICY_CALLBACK_TIMEOUT_SECS, or if the callback payload
    does not match the injected condition, field id, or counter.
    """
    expectedFieldId = dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL
    expectedCounter = 1

    newPolicy = dcgm_structs.c_dcgmPolicy_v1()
    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_NVLINK
    # NOTE(review): parms[5] appears to be the slot for the NVLINK condition,
    # with tag 0 selecting the boolean member -- confirm against dcgm_structs.
    newPolicy.parms[5].tag = 0
    newPolicy.parms[5].val.boolean = True

    # Group all supplied GPUs and apply the policy to the group.
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    group = dcgmSystem.GetGroupWithGpuIds('test1', gpuIds)
    group.policy.Set(newPolicy)

    # The callback built by create_c_callback is handed this queue; the test
    # blocks on it below to learn that the policy fired.
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_NVLINK, finishCallback=c_callback)

    # Inject a single NVLINK CRC flit error on the first GPU.
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = expectedFieldId
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    # Set the injected data 60 seconds into the future (microseconds).
    field.ts = int((time.time() + 60) * 1000000.0)
    field.value.i64 = expectedCounter

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # Wait for the policy manager to call back.
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # Check that the callback occurred with the correct arguments.
    assert (dcgm_structs.DCGM_POLICY_COND_NVLINK == callbackData.condition), \
        ("NVLINK error callback was not for a NVLINK error, got: %s" % callbackData.condition)
    # Report the expected id from the constant rather than a hard-coded
    # number, so the message stays right if the field id ever changes.
    assert (expectedFieldId == callbackData.val.nvlink.fieldId), \
        ("Expected fieldId %s but got %s" % (expectedFieldId, callbackData.val.nvlink.fieldId))
    # This message previously said "PCI error", copied from the PCI test.
    assert (expectedCounter == callbackData.val.nvlink.counter), \
        'Expected %s NVLINK error but got %s' % (expectedCounter, callbackData.val.nvlink.counter)
def helper_dcgm_policy_inject_eccerror(handle, gpuIds):
    """
    Inject a volatile device DBE count on the first GPU and check that the
    DBE policy callback reports one error at the DEVICE location.

    handle -- DCGM handle, given to pydcgm.DcgmHandle and to the injection API
    gpuIds -- GPU ids placed in the policy group; gpuIds[0] receives the
              injected value
    """
    condition = dcgm_structs.DCGM_POLICY_COND_DBE

    dbePolicy = dcgm_structs.c_dcgmPolicy_v1()
    dbePolicy.version = dcgm_structs.dcgmPolicy_version1
    dbePolicy.condition = condition
    # NOTE(review): parms[0] appears to be the slot for the DBE condition,
    # with tag 0 selecting the boolean member -- confirm against dcgm_structs.
    dbeParm = dbePolicy.parms[0]
    dbeParm.tag = 0
    dbeParm.val.boolean = True

    handleObj = pydcgm.DcgmHandle(handle)
    systemObj = handleObj.GetSystem()
    gpuGroup = systemObj.GetGroupWithGpuIds("test1", gpuIds)
    gpuGroup.policy.Set(dbePolicy)

    # Unlike the sibling tests, the callback is passed positionally (second
    # argument, with None third). The original author noted that the order of
    # the callbacks will change once implementation is complete, so the
    # positional form is preserved as-is.
    events = Queue.Queue()
    onViolation = create_c_callback(events)
    gpuGroup.policy.Register(condition, onViolation, None)

    # Build the injection record for a single double-bit ECC error.
    injection = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    injection.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    injection.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_DEV
    injection.status = 0
    injection.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    # Timestamp is 60 seconds ahead of now, expressed in microseconds.
    injection.ts = int((time.time() + 60) * 1000000.0)
    injection.value.i64 = 1

    logger.debug("injecting %s for gpuId %d" % (str(injection), gpuIds[0]))
    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], injection)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # Block until the policy manager calls back, or fail on timeout.
    try:
        violation = events.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # The callback must carry the DBE condition, one error, DEVICE location.
    assert condition == violation.condition, \
        "error callback was not for a DBE error, got: %s" % violation.condition
    dbeInfo = violation.val.dbe
    assert 1 == dbeInfo.numerrors, \
        'Expected 1 DBE error but got %s' % dbeInfo.numerrors
    deviceLocation = dcgm_structs.c_dcgmPolicyConditionDbe_t.LOCATIONS['DEVICE']
    assert deviceLocation == dbeInfo.location, \
        'got: %s' % dbeInfo.location
def helper_dcgm_policy_inject_pcierror(handle, gpuIds):
    """
    Inject a PCIe replay counter value on the first GPU and check that the
    PCI policy callback reports a counter of 1.

    handle -- DCGM handle, given to pydcgm.DcgmHandle and to the injection API
    gpuIds -- candidate GPU ids; only gpuIds[0] is added to the group and
              receives the injected value
    """
    condition = dcgm_structs.DCGM_POLICY_COND_PCI

    pciPolicy = dcgm_structs.c_dcgmPolicy_v1()
    pciPolicy.version = dcgm_structs.dcgmPolicy_version1
    pciPolicy.condition = condition
    # NOTE(review): parms[1] appears to be the slot for the PCI condition,
    # with tag 1 selecting the llval member -- confirm against dcgm_structs.
    pciParm = pciPolicy.parms[1]
    pciParm.tag = 1
    pciParm.val.llval = 0

    # Start from an empty group and add just the one GPU under test.
    targetGpu = gpuIds[0]
    handleObj = pydcgm.DcgmHandle(handle)
    gpuGroup = pydcgm.DcgmGroup(handleObj, groupName="test1",
                                groupType=dcgm_structs.DCGM_GROUP_EMPTY)
    gpuGroup.AddGpu(targetGpu)
    gpuGroup.policy.Set(pciPolicy)

    # The callback built by create_c_callback is handed this queue; the test
    # later blocks on it to learn that the policy fired.
    events = Queue.Queue()
    onViolation = create_c_callback(events)
    gpuGroup.policy.Register(condition, finishCallback=onViolation)

    # Build the injection record for one PCIe replay.
    injection = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    injection.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    injection.fieldId = dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
    injection.status = 0
    injection.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    # Timestamp is 60 seconds ahead of now, expressed in microseconds.
    injection.ts = int((time.time() + 60) * 1000000.0)
    injection.value.i64 = 1

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, targetGpu, injection)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # Block until the policy manager calls back, or fail on timeout.
    try:
        violation = events.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # The callback must carry the PCI condition and the injected counter.
    assert condition == violation.condition, \
        "PCI error callback was not for a PCI error, got: %s" % violation.condition
    reportedCounter = violation.val.pci.counter
    assert 1 == reportedCounter, \
        'Expected 1 PCI error but got %s' % reportedCounter
def test_dcgm_policy_set_get_violation_policy_standalone(handle, gpuIds):
    """
    Set a DBE policy on a group and check that reading the policy back
    returns an equal policy as its first entry.

    handle -- DCGM handle, given to pydcgm.DcgmHandle
    gpuIds -- GPU ids placed in the policy group
    """
    handleObj = pydcgm.DcgmHandle(handle)
    systemObj = handleObj.GetSystem()
    gpuGroup = systemObj.GetGroupWithGpuIds("test1", gpuIds)

    expectedPolicy = dcgm_structs.c_dcgmPolicy_v1()
    expectedPolicy.version = dcgm_structs.dcgmPolicy_version1
    expectedPolicy.condition = dcgm_structs.DCGM_POLICY_COND_DBE
    # NOTE(review): parms[0] appears to be the slot for the DBE condition,
    # with tag 0 selecting the boolean member -- confirm against dcgm_structs.
    dbeParm = expectedPolicy.parms[0]
    dbeParm.tag = 0
    dbeParm.val.boolean = True

    policyApi = gpuGroup.policy
    policyApi.Set(expectedPolicy)

    # Only the first returned policy is compared against what was set.
    readBack = policyApi.Get()
    _assert_policies_equal(readBack[0], expectedPolicy)
def vtDcgmPolicyGet(dcgm_handle, group_id, count, status_handle, versionTest):
    """
    Call dcgmPolicyGet with a caller-chosen struct version written into every
    output entry.

    dcgm_handle   -- handle passed through to dcgmPolicyGet
    group_id      -- group id passed through to dcgmPolicyGet
    count         -- number of c_dcgmPolicy_v1 entries to allocate and request
    status_handle -- status handle passed through to dcgmPolicyGet
    versionTest   -- value written to the .version member of every entry
                     before the call

    Returns the first `count` entries, taken as a slice of the ctypes array.
    The return code is handed to dcgm_structs._dcgmCheckReturn (presumably
    raising on a non-success code -- confirm in dcgm_structs).
    """
    policyGetFn = dcgmFP("dcgmPolicyGet")

    # Allocate `count` contiguous policy structs for the library to fill in.
    outPolicies = (count * dcgm_structs.c_dcgmPolicy_v1)()

    # These two structs are created only so that the versions computed for
    # them by make_dcgm_version can be logged; they are not passed on.
    refPolicy = dcgm_structs.c_dcgmPolicy_v1()
    refPolicy.version = dcgm_structs.make_dcgm_version(refPolicy, 1)
    logger.debug("Structure version: %d" % refPolicy.version)

    refCallback = dcgm_structs.c_dcgmPolicyCallbackResponse_v1()
    refCallback.version = dcgm_structs.make_dcgm_version(refCallback, 1)
    logger.debug("Structure version: %d" % refCallback.version)

    # Stamp the version under test on every entry handed to the library.
    for slot in range(count):
        outPolicies[slot].version = versionTest

    ret = policyGetFn(dcgm_handle, group_id, count, outPolicies, status_handle)
    dcgm_structs._dcgmCheckReturn(ret)
    return outPolicies[0:count]
def helper_test_dcgm_policy_inject_xiderror(handle, gpuIds):
    """
    Inject XID error 16 on the first GPU whose XID field is supported and
    check that the XID policy callback reports that error number.

    handle -- DCGM handle, given to pydcgm.DcgmHandle and to the
              field-query and injection APIs
    gpuIds -- candidate GPU ids, probed in order

    Calls test_utils.skip_test when no candidate GPU reports the XID field
    as supported.
    """
    condition = dcgm_structs.DCGM_POLICY_COND_XID
    injectedXid = 16

    xidPolicy = dcgm_structs.c_dcgmPolicy_v1()
    xidPolicy.version = dcgm_structs.dcgmPolicy_version1
    xidPolicy.condition = condition
    # NOTE(review): parms[6] appears to be the slot for the XID condition,
    # with tag 0 selecting the boolean member -- confirm against dcgm_structs.
    xidParm = xidPolicy.parms[6]
    xidParm.tag = 0
    xidParm.val.boolean = True

    handleObj = pydcgm.DcgmHandle(handle)

    # Pick the first GPU whose latest XID value is not the NOT_SUPPORTED
    # sentinel; -1 means none was found.
    targetGpu = -1
    for candidate in gpuIds:
        latest = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, candidate, [dcgm_fields.DCGM_FI_DEV_XID_ERRORS])
        if latest[0].value.i64 == dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            continue
        targetGpu = candidate
        break

    if targetGpu == -1:
        test_utils.skip_test(
            "Can only run if at least one GPU that supports XID errors is present"
        )

    # Start from an empty group and add just the selected GPU.
    gpuGroup = pydcgm.DcgmGroup(handleObj, groupName="test1",
                                groupType=dcgm_structs.DCGM_GROUP_EMPTY)
    gpuGroup.AddGpu(targetGpu)
    gpuGroup.policy.Set(xidPolicy)

    # The callback built by create_c_callback is handed this queue; the test
    # later blocks on it to learn that the policy fired.
    events = Queue.Queue()
    onViolation = create_c_callback(events)
    gpuGroup.policy.Register(condition, finishCallback=onViolation)

    # Build the injection record for the XID error.
    injection = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    injection.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    injection.fieldId = dcgm_fields.DCGM_FI_DEV_XID_ERRORS
    injection.status = 0
    injection.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    # Timestamp is 60 seconds ahead of now, expressed in microseconds.
    injection.ts = int((time.time() + 60) * 1000000.0)
    injection.value.i64 = injectedXid

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, targetGpu, injection)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # Block until the policy manager calls back, or fail on timeout.
    try:
        violation = events.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # The callback must carry the XID condition and the injected error number.
    assert condition == violation.condition, \
        "XID error callback was not for a XID error, got: %s" % violation.condition
    reportedXid = violation.val.xid.errnum
    assert injectedXid == reportedXid, \
        'Expected XID error 16 but got %s' % reportedXid
ret = dcgm_agent.dcgmStatusDestroy(status_handle) assert (ret == dcgm_structs.DCGM_ST_OK ), "Failed to remove status handler, error: %s" % ret def callback_function(data): global callbackCalled callbackCalled = True c_callback = C_FUNC(callback_function) dcgm_structs._LoadDcgmLibrary() newPolicy = dcgm_structs.c_dcgmPolicy_v1() handle = dcgm_agent.dcgmInit() newPolicy.version = dcgm_structs.dcgmPolicy_version1 newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED newPolicy.action = dcgm_structs.DCGM_POLICY_ACTION_GPURESET newPolicy.validation = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT newPolicy.parms[2].tag = 1 newPolicy.parms[2].val.llval = 5 # find a GPU that supports retired pages (otherwise internal test will ignore it) devices = dcgm_agent.dcgmGetAllDevices(handle) validDevice = -1 for x in devices: fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields( handle, x, [