def test_dcgm_action_run_diag_bad_validation(handle, gpuIds):
    '''
    Check that dcgmActionValidate_v2 raises DCGM_ST_BADPARAM when the
    requested validation level is one past DCGM_POLICY_VALID_SV_LONG.
    '''
    # Comma-separated GPU ID list, e.g. "0,1,2"
    gpuIdCsv = ",".join(str(gpuId) for gpuId in gpuIds)

    runDiag = dcgm_structs.c_dcgmRunDiag_t()
    runDiag.version = dcgm_structs.dcgmRunDiag_version
    # Deliberately use an invalid validation value
    runDiag.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG + 1
    # Set explicitly to 0 in case the constructor above doesn't
    runDiag.groupId = 0
    runDiag.gpuList = gpuIdCsv

    expectedError = dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)
    with test_utils.assert_raises(expectedError):
        dcgm_agent.dcgmActionValidate_v2(handle, runDiag, dcgm_structs.dcgmRunDiag_version)
def helper_dcgm_action_run_diag_gpu_list(handle, gpuIds):
    '''
    Run the short DCGM diagnostic addressed by an explicit GPU ID list
    instead of a groupId, and let any failure propagate as an exception.
    '''
    # Comma-separated GPU ID list, e.g. "0,1,2"
    gpuIdCsv = ",".join(str(gpuId) for gpuId in gpuIds)

    runDiag = dcgm_structs.c_dcgmRunDiag_t()
    runDiag.version = dcgm_structs.dcgmRunDiag_version
    runDiag.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    # Set explicitly to 0 in case the constructor above doesn't
    runDiag.groupId = 0
    runDiag.gpuList = gpuIdCsv

    # This call throws an exception on error, which is the only check needed
    dcgm_agent.dcgmActionValidate_v2(handle, runDiag, dcgm_structs.dcgmRunDiag_version)
def helper_check_diag_empty_group(handle, gpuIds):
    """
    Check diagnostic behavior against an empty group and then a populated one.

    Running the diagnostic on a freshly created empty group must raise
    DCGM_ST_GROUP_IS_EMPTY; after adding the first GPU from gpuIds to the
    same group, the same request must produce a truthy response.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = handleObj.GetSystem()
    diagGroup = dcgmSystem.GetEmptyGroup("test1")

    diagRequest = dcgm_structs.c_dcgmRunDiag_t()
    diagRequest.version = dcgm_structs.dcgmRunDiag_version
    diagRequest.groupId = diagGroup.GetId()
    diagRequest.validate = 1

    emptyGroupError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_GROUP_IS_EMPTY)
    with test_utils.assert_raises(emptyGroupError):
        test_utils.action_validate_wrapper(diagRequest, handle)

    # Now make sure everything works well once the group has a member
    diagGroup.AddGpu(gpuIds[0])
    diagResponse = test_utils.action_validate_wrapper(diagRequest, handle)
    assert diagResponse, "Should have received a response now that we have a non-empty group"
def __init__(self, gpuIds=None, testNamesStr='', paramsStr='', verbose=True, train=False, forceTrain=False,
             version=dcgm_structs.dcgmRunDiag_version):
    """
    Build the run-diagnostic request structure held in self.runDiagInfo.

    gpuIds       -- optional iterable of GPU IDs; stored comma-separated in
                    runDiagInfo.gpuList when non-empty
    testNamesStr -- '' or '1' selects validate level 1, '2' level 2 and
                    '3' level 3; any other value is split on ',' and each
                    piece is passed to AddTest()
    paramsStr    -- ';'-separated parameters, each passed to AddParameter()
    verbose      -- forwarded to SetVerbose()
    train        -- when equal to True, flags is set to DCGM_RUN_FLAGS_TRAIN
    forceTrain   -- when equal to True on a training run, also sets
                    DCGM_RUN_FLAGS_FORCE_TRAIN
    version      -- dcgmRunDiag structure version; must be a key of
                    DcgmDiag._versionMap

    Raises ValueError for an unknown version, a purely numeric test name
    other than 1-3, too many test names, or too many parameters.
    """
    # Reject structure versions this class does not know about
    if version not in DcgmDiag._versionMap:
        raise ValueError("'%s' is not a valid version for dcgmRunDiag." % version)
    self.version = version

    # Version 7 has its own structure type; everything else uses the default
    useV7 = (self.version == dcgm_structs.dcgmRunDiag_version7)
    self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7() if useV7 else dcgm_structs.c_dcgmRunDiag_t()

    self.numTests = 0
    self.numParams = 0
    self.SetVerbose(verbose)

    if testNamesStr in ('', '1', '2', '3'):
        # A bare level number selects that level; empty defaults to level 1
        self.runDiagInfo.validate = int(testNamesStr) if testNamesStr else 1
    else:
        # Any other all-digit string is not a valid level or test name
        if testNamesStr.isdigit():
            raise ValueError("'%s' is not a valid test name." % testNamesStr)

        # Register each comma-separated test name on the request
        requestedTests = testNamesStr.split(',')
        testCount = len(requestedTests)
        if testCount > dcgm_structs.DCGM_MAX_TEST_NAMES:
            err = 'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %\
                  (testCount, dcgm_structs.DCGM_MAX_TEST_NAMES)
            raise ValueError(err)

        for requestedTest in requestedTests:
            self.AddTest(requestedTest)

    if paramsStr != '':
        requestedParams = paramsStr.split(';')
        paramCount = len(requestedParams)
        # NOTE(review): this limit check uses >= while the test-name check
        # above uses > -- confirm which bound is intended.
        if paramCount >= dcgm_structs.DCGM_MAX_TEST_PARMS:
            err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\
                  (paramCount, dcgm_structs.DCGM_MAX_TEST_PARMS)
            raise ValueError(err)

        for requestedParam in requestedParams:
            self.AddParameter(requestedParam)

    # The comparisons against True are kept as-is so that truthy values
    # that do not compare equal to True are still ignored.
    if train == True:
        # NOTE(review): plain assignment replaces any flags SetVerbose() may
        # have set above -- confirm this is intended.
        self.runDiagInfo.flags = dcgm_structs.DCGM_RUN_FLAGS_TRAIN
        # forceTrain is only consulted for training runs
        if forceTrain == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FORCE_TRAIN

    if gpuIds:
        # Comma-separated GPU ID list, e.g. "0,1,2"
        self.runDiagInfo.gpuList = ",".join(str(gpu) for gpu in gpuIds)
def test_dcgm_run_diagnostic_validate(handle, gpuIds):
    """
    Validates structure version

    Each diagnostic entry point (vtDcgmActionValidate_v2,
    vtDcgmActionValidate and vtDcgmRunDiagnostic) is called with two
    unsupported structure versions and must raise DCGM_ST_VER_MISMATCH.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    # Was ">= 0", which can never fail; the message shows that at least one
    # device is expected.
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    # Results of the next two calls are not used; the calls are kept so any
    # error they raise still fails the test.
    # NOTE(review): the status handle is never used or released here --
    # confirm whether this call is still needed.
    dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    dcgm_agent.dcgmStatusCreate()

    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    drd.groupId = groupId
    # Comma-separated GPU ID list, e.g. "0,1,2"
    drd.gpuList = ",".join(str(gpuId) for gpuId in gpuIds)

    # 0 is the "invalid version" case, 50 is an arbitrary version number
    badVersions = (0, 50)

    for versionTest in badVersions:
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmActionValidate_v2(handle, drd, versionTest)

    for versionTest in badVersions:
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmActionValidate(handle, drd.groupId, drd.validate, versionTest)

    for versionTest in badVersions:
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmRunDiagnostic(handle, drd.groupId, diagLevel, versionTest)