예제 #1
0
파일: DcgmGroup.py 프로젝트: NVIDIA/DCGM
    def Validate(self, validate):
        '''
        Submit a diagnostic request for this group.

        Builds a version-7 run-diag structure carrying the caller's
        ``validate`` value and this object's group id, then hands it to
        ``dcgm_agent.dcgmActionValidate_v2``.

        validate -- value stored in the request's ``validate`` field
        Returns whatever ``dcgmActionValidate_v2`` returns.
        '''
        diagRequest = dcgm_structs.c_dcgmRunDiag_v7()
        diagRequest.version = dcgm_structs.dcgmRunDiag_version7
        diagRequest.validate = validate
        diagRequest.groupId = self._groupId

        agentHandle = self._dcgmHandle.handle
        return dcgm_agent.dcgmActionValidate_v2(agentHandle, diagRequest)
예제 #2
0
파일: DcgmGroup.py 프로젝트: NVIDIA/DCGM
 def RunSpecificTest(self, testName):
     '''
     Submit a diagnostic request that names a single test.

     The elements of ``testName`` are copied one by one into the first
     entry of the request's ``testNames`` array; the request's ``validate``
     field is set to ``DCGM_POLICY_VALID_NONE`` and its group id to this
     object's group id.

     testName -- indexable sequence whose elements are written into
                 ``testNames[0]``
     Returns whatever ``dcgmActionValidate_v2`` returns.
     '''
     diagRequest = dcgm_structs.c_dcgmRunDiag_v7()
     diagRequest.version = dcgm_structs.dcgmRunDiag_version7
     # Element-wise copy into slot 0 of the test-name table.
     for position, element in enumerate(testName):
         diagRequest.testNames[0][position] = element
     diagRequest.groupId = self._groupId
     diagRequest.validate = dcgm_structs.DCGM_POLICY_VALID_NONE
     return dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle, diagRequest)
예제 #3
0
def helper_check_diag_empty_group(handle, gpuIds):
    '''
    Check that a diagnostic on an empty group is rejected and that the same
    request is answered once a GPU has been added to the group.

    handle -- handle wrapped in a pydcgm.DcgmHandle and also passed directly
              to dcgmActionValidate_v2
    gpuIds -- sequence of GPU ids; only the first one is added to the group
    '''
    # Keep the wrapper objects bound to locals for the whole function.
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    emptyGroup = dcgmSystem.GetEmptyGroup("test1")

    diagRequest = dcgm_structs.c_dcgmRunDiag_t()
    diagRequest.version = dcgm_structs.dcgmRunDiag_version
    diagRequest.groupId = emptyGroup.GetId()
    diagRequest.validate = 1

    # The group has no members yet, so the call is expected to raise.
    emptyGroupError = dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_GROUP_IS_EMPTY)
    with test_utils.assert_raises(emptyGroupError):
        dcgm_agent.dcgmActionValidate_v2(handle, diagRequest)

    # Same request again, this time with one GPU in the group.
    emptyGroup.AddGpu(gpuIds[0])
    diagResponse = dcgm_agent.dcgmActionValidate_v2(handle, diagRequest)
    assert diagResponse, "Should have received a response now that we have a non-empty group"
예제 #4
0
def test_dcgm_action_run_diag_bad_validation(handle, gpuIds):
    '''
    Check that a run-diag request whose ``validate`` value is one past
    DCGM_POLICY_VALID_SV_LONG is rejected with DCGM_ST_BADPARAM.

    handle -- handle passed straight to dcgmActionValidate_v2
    gpuIds -- iterable of GPU ids; joined with commas to form the gpuList field
    '''
    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG + 1  # use an invalid value
    drd.groupId = 0  # Initializing to 0 in case the constructor above doesn't
    drd.gpuList = ",".join(str(gpuId) for gpuId in gpuIds)

    with test_utils.assert_raises(dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        # The return value is irrelevant: the call is expected to raise.
        dcgm_agent.dcgmActionValidate_v2(handle, drd, dcgm_structs.dcgmRunDiag_version)
예제 #5
0
def helper_dcgm_action_run_diag_gpu_list(handle, gpuIds):
    '''
    Test that running the DCGM diagnostic works if you provide a GPU ID list rather
    than a groupId.

    handle -- handle passed straight to dcgmActionValidate_v2
    gpuIds -- iterable of GPU ids; joined with commas to form the gpuList field
    '''
    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    drd.groupId = 0  # Initializing to 0 in case the constructor above doesn't
    drd.gpuList = ",".join(str(gpuId) for gpuId in gpuIds)
    # This will throw an exception on error, so the return value is not inspected
    dcgm_agent.dcgmActionValidate_v2(handle, drd, dcgm_structs.dcgmRunDiag_version)
예제 #6
0
def check_gpu_diagnostic(handleObj, settings):
    '''
    Run the diagnostic described by ``settings`` and mark GPUs with a failed
    test as unhealthy.

    The request and the list of active GPU ids both come from
    ``initialize_run_diag_info(settings)``; nothing is run when that list is
    empty. Per-GPU results are only inspected when
    ``check_passive_health_checks`` compares equal to False.

    handleObj -- object whose ``handle`` attribute is passed to
                 dcgmActionValidate_v2
    settings  -- forwarded unchanged to initialize_run_diag_info
    Raises ValueError when the response's system error code is not
    DCGM_FR_OK.
    '''
    runDiagInfo, activeGpuIds = initialize_run_diag_info(settings)
    if len(activeGpuIds) == 0:
        return

    diagResponse = dcgm_agent.dcgmActionValidate_v2(handleObj.handle, runDiagInfo)

    systemError = diagResponse.systemError
    if systemError.code != dcgm_errors.DCGM_FR_OK:
        raise ValueError(systemError)

    # Deliberately an equality test against False rather than a truthiness test.
    passiveChecksFailed = (check_passive_health_checks(diagResponse, activeGpuIds) == False)
    if not passiveChecksFailed:
        return

    for gpuIndex in range(diagResponse.gpuCount):
        for testIndex in range(dcgm_structs.DCGM_PER_GPU_TEST_COUNT):
            gpuResponse = diagResponse.perGpuResponses[gpuIndex]
            testResult = gpuResponse.results[testIndex]
            if testResult.result != dcgm_structs.DCGM_DIAG_RESULT_FAIL:
                continue

            mark_entity_unhealthy(g_gpus, gpuResponse.gpuId,
                                  BR_ST_FAILED_ACTIVE_HEALTH, testResult.warning)

            # NVVS marks all subsequent tests as failed so there's no point in continuing
            # with this GPU; move on to the next one.
            break
예제 #7
0
파일: DcgmDiag.py 프로젝트: NVIDIA/DCGM
 def Execute(self, handle):
     '''
     Submit this object's run-diag request through ``handle``.

     handle -- passed as the first argument to dcgmActionValidate_v2
     Returns whatever ``dcgmActionValidate_v2`` returns when called with
     ``self.runDiagInfo`` and ``self.version``.
     '''
     diagResponse = dcgm_agent.dcgmActionValidate_v2(
         handle, self.runDiagInfo, self.version)
     return diagResponse
예제 #8
0
        ## identify the newly created group.
        runDiagInfo.groupId = dcgm_agent.dcgmGroupCreate(
            handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group")

        ## Invoke method to get information on the newly created group
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, runDiagInfo.groupId)

        ## define the actions and validations for those actions to take place
        runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT

        ## This will go ahead and perform a "prologue" diagnostic
        ## to make sure everything is ready to run
        ## currently this calls an outside diagnostic binary but eventually
        ## that binary will be merged into the DCGM framework
        ## The "response" is a dcgmDiagResponse structure that can be parsed for errors
        response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo)

        ## This will perform an "epilogue" diagnostic that will stress the system
        ## Currently commented out because it takes several minutes to execute
        # runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG
        #response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo)

        ## prime the policy manager to look for ECC, PCIe events
        ## if a callback occurs the function above is called. Currently the data returned
        ## corresponds to the error that occurred (PCI, DBE, etc.) but in the future it will be a
        ## dcgmPolicyViolation_t or similar
        ret = dcgm_agent.dcgmPolicyRegister(
            handle, runDiagInfo.groupId, dcgm_structs.DCGM_POLICY_COND_PCI
            | dcgm_structs.DCGM_POLICY_COND_DBE, None, c_callback)

        ## trigger the policy loop
예제 #9
0
 def test_dcgm_run_diag(drd, version):
     '''
     Fill in and submit the supplied run-diag structure.

     Sets ``drd.validate`` to 1 and ``drd.gpuList`` to ``str(gpuId)``, then
     calls ``dcgm_agent.dcgmActionValidate_v2``. ``gpuId`` and ``handle`` are
     not parameters; they are read from a scope outside this function.

     drd     -- run-diag structure that is modified in place and submitted
     version -- passed as the third argument to dcgmActionValidate_v2
     '''
     drd.validate = 1  # run a short test
     drd.gpuList = str(gpuId)
     # This will throw an exception on error
     ret = dcgm_agent.dcgmActionValidate_v2(handle, drd, version)