Пример #1
0
def helper_check_software_page_retirements_fail_on_pending_retirements(handle, gpuId):
    """
    Check that the page retirement software test fails while page retirements are pending.

    The diag is run three times on gpuId with UseFakeGpus() enabled:
      1. Baseline run; the test is skipped if the page retirement check does not pass.
      2. After injecting a pending page retirement value of 1; the check must fail.
      3. After resetting the pending value to 0; the check must pass again.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId])
    diag.UseFakeGpus()
    swtest = dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT
    pending_field = dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING

    # 1. Baseline: nothing meaningful can be verified if the GPU already fails the check.
    baseline = test_utils.diag_execute_wrapper(diag, handle)
    baseline_ok = check_software_result_pass(baseline, swtest)
    if not baseline_ok:
        test_utils.skip_test("Skipping because GPU %s does not pass software page retirement test. "
                             "Please verify whether the GPU is healthy." % gpuId)

    # 2. A pending page retirement must make the software test fail.
    inject_value(handle, gpuId, pending_field, 1, -30, True)
    with_pending = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_fail(with_pending, swtest), \
        "Expected software test to fail due to pending page retirements in the GPU"

    # 3. Clearing the injected value must let the software test pass again.
    inject_value(handle, gpuId, pending_field, 0, -30, True)
    after_reset = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_pass(after_reset, swtest), \
        "Expected software test to pass"
Пример #2
0
def helper_check_software_page_retirements_fail_total_retirements(handle, gpuId):
    """
    Check that the page retirement software test reacts to the total number of retired pages.

    The diag is run on gpuId with UseFakeGpus() enabled:
      1. Baseline run; the test is skipped if the page retirement check does not pass.
      2. With 33 DBE + 33 SBE retired pages injected (66 total); the check must fail.
      3. With the SBE count lowered to 25 (58 total); the check must pass.
      4. With both counts reset to 0; the check must still pass.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId])
    diag.UseFakeGpus()
    swtest = dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT
    dbe_field = dcgm_fields.DCGM_FI_DEV_RETIRED_DBE
    sbe_field = dcgm_fields.DCGM_FI_DEV_RETIRED_SBE

    # 1. Baseline: skip when the GPU does not pass the check to begin with.
    baseline = test_utils.diag_execute_wrapper(diag, handle)
    if not check_software_result_pass(baseline, swtest):
        test_utils.skip_test("Skipping because GPU %s does not pass software page retirement test. "
                             "Please verify whether the GPU is healthy." % gpuId)

    # 2. 33 + 33 = 66 retired pages, which is expected to be over the limit.
    inject_value(handle, gpuId, dbe_field, 33, -30, True)
    inject_value(handle, gpuId, sbe_field, 33, -30, True)
    over_limit = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_fail(over_limit, swtest), \
           "Expected software test to fail due to 60 total page retirements in the GPU"

    # 3. 33 + 25 = 58 retired pages, which is expected to be under the limit.
    inject_value(handle, gpuId, sbe_field, 25, -30, True)
    under_limit = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_pass(under_limit, swtest), \
           "Expected software test to pass since there are less than 60 total retired pages"

    # 4. Clear both injected counts and confirm the check keeps passing.
    for field in (dbe_field, sbe_field):
        inject_value(handle, gpuId, field, 0, -30, True)
    cleared = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_pass(cleared, swtest), \
           "Expected software test to pass since there are no retired pages"
Пример #3
0
def perform_diag_with_throttle_mask_and_verify(dd, handle, gpuId,
                                               inserted_error, throttle_mask,
                                               shouldPass, failureMsg):
    """
    Inject a clock throttle reason, run the given diag and verify whether throttling was reported.

    Parameters:
        dd             -- diag object to execute; its throttle mask is updated when
                          throttle_mask is not None.
        handle         -- DCGM handle passed to the injection and diag helpers.
        gpuId          -- GPU the values are injected into and whose result is checked.
        inserted_error -- value injected for DCGM_FI_DEV_CLOCK_THROTTLE_REASONS.
        throttle_mask  -- mask passed to dd.SetThrottleMask(), or None to leave dd untouched.
        shouldPass     -- True when no throttling failure is expected, False when one is.
        failureMsg     -- currently unused; kept so existing callers keep working.

    Raises AssertionError when the injected value does not become visible in DCGM or
    when the SM Stress result does not match shouldPass.
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    interval = 0.1
    if throttle_mask is not None:
        dd.SetThrottleMask(throttle_mask)

    inject_value(handle, gpuId, fieldId, inserted_error, injection_offset,
                 True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)
    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, inserted_error, checkInterval=interval, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    # Start the diag
    response = test_utils.diag_execute_wrapper(dd, handle)

    # Check for pass or failure as per the shouldPass parameter.
    # Plain truthiness is used (rather than == True / == False) to match the
    # other helpers that consume find_throttle_failure().
    throttled, errMsg = find_throttle_failure(
        response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    if shouldPass:
        assert not throttled, "Expected to not have a throttling error but found %s" % errMsg
    else:
        assert throttled, "Expected to find a throttling error but did not (%s)" % errMsg
Пример #4
0
def helper_test_thermal_violations_in_seconds(handle, gpuIds):
    """
    Verify that injected thermal violations are reported in seconds by the diagnostic test.

    A value of 2344122048 is injected for DCGM_FI_DEV_THERMAL_VIOLATION on the first
    GPU of gpuIds, and the diagnostic test's error message for that GPU is expected to
    mention thermal violations "totaling 2.3 seconds".
    NOTE(review): this implies the injected value is in nanoseconds -- confirm.

    Raises AssertionError when the injected value is not visible in DCGM, when the
    error message does not mention thermal violations, or when the reported total
    is not 2.3 seconds.
    """
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                           testNamesStr='diagnostic',
                           paramsStr='diagnostic.test_duration=10')
    dd.UseFakeGpus()
    fieldId = dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION
    injected_value = 2344122048
    inject_value(handle, gpuIds[0], fieldId, injected_value, 10, True)

    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuIds[0], fieldId, injected_value, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    # Start the diag
    response = dd.Execute(handle)

    testIndex = dcgm_structs.DCGM_DIAGNOSTIC_INDEX
    errmsg = response.perGpuResponses[gpuIds[0]].results[testIndex].error.msg
    # Check for hermal instead of thermal because sometimes it's capitalized
    assert "hermal violations" in errmsg, \
        "Thermal violations were injected but not found in error message: '%s'." % errmsg
    assert "totaling 2.3 seconds" in errmsg, \
        "Expected 2.3 seconds of thermal violations but found %s" % errmsg
Пример #5
0
def helper_test_inject_instance_fields(handle, gpuIds):
    """
    Verify that a value injected for a GPU is stored for that GPU only.

    A group containing the first GPU of gpuIds, one of its GPU instances and one
    compute instance is watched for DCGM_FI_DEV_ECC_DBE_VOL_TOTAL. The value 2 is
    injected for the GPU entity, then the latest values of all three entities are
    read back: the GPU must report 2, and the instance and compute instance must
    report DCGM_ST_NO_DATA.

    Raises AssertionError when any entity reports something else.
    """
    instances, cis = ensure_instance_ids(handle, gpuIds[0], 1, 1)
    # dict.keys() returns a view on Python 3, which cannot be indexed directly;
    # list(...) works on both Python 2 and Python 3.
    firstInstanceId = list(instances.keys())[0]
    lastCIId = list(cis.keys())[0]

    # Set up the watches on these groups
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         'tien')
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
        handle, [dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL], 'kal')

    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU,
                                  gpuIds[0])
    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU_I,
                                  firstInstanceId)
    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU_CI,
                                  lastCIId)
    dcgm_agent.dcgmWatchFields(handle, groupId, fieldGroupId, 1, 100, 100)

    # Inject only for the GPU entity; the instance entities get no value.
    dcgm_internal_helpers.inject_value(
        handle,
        gpuIds[0],
        dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
        2,
        5,
        isInt=True,
        verifyInsertion=True,
        entityType=dcgm_fields.DCGM_FE_GPU)

    # Read the values to make sure they were stored properly
    entities = [
        dcgm_structs.c_dcgmGroupEntityPair_t(),
        dcgm_structs.c_dcgmGroupEntityPair_t(),
        dcgm_structs.c_dcgmGroupEntityPair_t()
    ]

    entities[0].entityGroupId = dcgm_fields.DCGM_FE_GPU_I
    entities[0].entityId = firstInstanceId
    entities[1].entityGroupId = dcgm_fields.DCGM_FE_GPU_CI
    entities[1].entityId = lastCIId
    entities[2].entityGroupId = dcgm_fields.DCGM_FE_GPU
    entities[2].entityId = gpuIds[0]

    fieldIds = [dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL]

    values = dcgm_agent.dcgmEntitiesGetLatestValues(handle, entities, fieldIds,
                                                    0)
    for v in values:
        if v.entityGroupId == dcgm_fields.DCGM_FE_GPU:
            assert v.value.i64 == 2, "Failed to inject value 2 for entity %u from group %u" % (
                v.entityId, v.entityGroupId)
        else:
            # Nothing was injected for the instance / compute instance entities.
            assert (
                v.status == dcgm_structs.DCGM_ST_NO_DATA
            ), "Injected meaningless value %u for entity %u from group %u" % (
                v.value.i64, v.entityId, v.entityGroupId)
Пример #6
0
def test_memtest_failures_standalone(handle, gpuIds):
    """
    Verify that the memtest diag does not pass when volatile DBEs are injected.

    A DCGM_FI_DEV_ECC_DBE_VOL_TOTAL value of 1000 is injected for the first GPU of
    gpuIds before the memtest is run; that GPU's memtest result must not be PASS.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                             testNamesStr="memtest",
                             paramsStr="memtest.test_duration=10")
    target_gpu = gpuIds[0]

    inject_value(handle, target_gpu, dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
                 1000, injection_offset, True)

    response = test_utils.diag_execute_wrapper(diag, handle)

    memtest_result = response.perGpuResponses[target_gpu].results[dcgm_structs.DCGM_MEMTEST_INDEX].result
    assert memtest_result != dcgm_structs.DCGM_DIAG_RESULT_PASS, \
        "Should have a failure due to injected DBEs, but got passing result"
Пример #7
0
def helper_test_diagnostic_config_usage(handle, gpuIds):
    """
    Verify that the diagnostic test does not pass when SBEs are injected and a config is supplied.

    The diag is given config file contents that set diagnostic.max_sbe_errors to 1,
    a DCGM_FI_DEV_ECC_SBE_VOL_TOTAL value of 1000 is injected for the first GPU of
    gpuIds, and that GPU's diagnostic result must not be PASS.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                             testNamesStr="diagnostic",
                             paramsStr="diagnostic.test_duration=10")
    config_contents = "%YAML 1.2\n\ncustom:\n- custom:\n    diagnostic:\n      max_sbe_errors: 1"
    diag.SetConfigFileContents(config_contents)
    target_gpu = gpuIds[0]

    inject_value(handle, target_gpu, dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
                 1000, injection_offset, True)

    response = test_utils.diag_execute_wrapper(diag, handle)

    diagnostic_result = response.perGpuResponses[target_gpu].results[dcgm_structs.DCGM_DIAGNOSTIC_INDEX].result
    assert diagnostic_result != dcgm_structs.DCGM_DIAG_RESULT_PASS, \
        "Should have a failure due to injected SBEs, but got passing result"
Пример #8
0
def helper_test_health_check_instances(handle, gpuIds):
    """
    Verify that a DBE injected for a GPU yields exactly one health incident, attributed to that GPU.

    A group containing the first GPU of gpuIds, one of its GPU instances and one
    compute instance is created with the memory health watch enabled. The test is
    skipped when the group already reports incidents. After injecting
    DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 2 for the GPU entity, the health check must
    report a single incident for that GPU with a volatile-DBE error code, the
    memory watch system and a FAIL health result.

    Raises AssertionError when the reported incident does not match.
    """
    instances, cis = ensure_instance_ids(handle, gpuIds[0], 1, 1)
    # dict.keys() returns a view on Python 3, which cannot be indexed directly;
    # list(...) works on both Python 2 and Python 3.
    instanceId = list(instances.keys())[0]
    ciId = list(cis.keys())[0]
    handleObj = DcgmHandle.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU, gpuIds[0])
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU_I, instanceId)
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU_CI, ciId)

    newSystems = dcgm_structs.DCGM_HEALTH_WATCH_MEM
    groupObj.health.Set(newSystems)

    # Verify health prior to testing
    responseV4 = groupObj.health.Check(
        dcgm_structs.dcgmHealthResponse_version4)
    if responseV4.incidentCount != 0:
        test_utils.skip_test("Cannot test on unhealthy systems.")

    # Inject a DBE count for the GPU entity only
    dcgm_internal_helpers.inject_value(
        handle,
        gpuIds[0],
        dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
        2,
        5,
        isInt=True,
        verifyInsertion=True,
        entityType=dcgm_fields.DCGM_FE_GPU)

    responseV4 = groupObj.health.Check(
        dcgm_structs.dcgmHealthResponse_version4)
    assert (
        responseV4.incidentCount == 1
    ), "Should have 1 total incidents but found %d" % responseV4.incidentCount

    # The single incident must point at the GPU entity the value was injected for.
    assert (responseV4.incidents[0].entityInfo.entityId == gpuIds[0])
    assert (responseV4.incidents[0].entityInfo.entityGroupId ==
            dcgm_fields.DCGM_FE_GPU)
    assert (responseV4.incidents[0].error.code ==
            dcgm_errors.DCGM_FR_VOLATILE_DBE_DETECTED)
    assert (
        responseV4.incidents[0].system == dcgm_structs.DCGM_HEALTH_WATCH_MEM)
    assert (
        responseV4.incidents[0].health == dcgm_structs.DCGM_HEALTH_RESULT_FAIL)
Пример #9
0
def helper_per_gpu_responses_api(handle, gpuIds, testDir):
    """
    Check that diag results obtained through the API are reported per GPU.

    A throttling error is injected for the first GPU of gpuIds only, the SM Stress
    test is run on that GPU, and the response is then inspected for every GPU in
    gpuIds: the first one must show a throttling failure and all others must not.
    testDir is passed to the diag as its stats path.
    """
    failGpuId = gpuIds[0]
    # Only called for its verification; the diag object it returns is not needed.
    helper_verify_diag_passing(handle, gpuIds, useFakeGpus=True)

    diag = DcgmDiag.DcgmDiag(gpuIds=[failGpuId],
                             testNamesStr="SM Stress",
                             paramsStr="sm stress.test_duration=15",
                             version=dcgm_structs.dcgmRunDiag_version)
    # A throttle mask of 0 is set explicitly because this test relies on the
    # inserted throttling errors causing a failure.
    diag.SetThrottleMask(0)
    diag.UseFakeGpus()
    diag.SetStatsPath(testDir)
    diag.SetStatsOnFail(1)

    # Inject the throttle reason (and a temperature of 1000) at injection_offset.
    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    poll_interval = 0.1
    inject_value(handle, failGpuId, throttle_field, hw_slowdown, injection_offset,
                 True)
    inject_value(handle, failGpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)
    # The injected throttle reason has to be visible before the diag is started.
    injected_visible = dcgm_internal_helpers.verify_field_value(
        failGpuId, throttle_field, hw_slowdown,
        checkInterval=poll_interval, maxWait=5, numMatches=1)
    assert injected_visible, "Expected inserted values to be visible in DCGM"

    response = test_utils.diag_execute_wrapper(diag, handle)
    logger.info("Started diag")

    # Only the GPU that received the injected error may report throttling.
    for gpuId in gpuIds:
        throttled, errMsg = find_throttle_failure(
            response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
        if gpuId != failGpuId:
            assert not throttled, "Expected not to find a throttling error but found '%s'" % errMsg
            continue
        assert throttled, "Expected throttling error but found none (%s)" % errMsg
Пример #10
0
def helper_throttling_masking_failures(handle, gpuId):
    """
    Check that an injected HW slowdown makes SM Stress fail when the throttle mask is 0.

    A short SM Stress run first confirms the GPU passes (otherwise the test is
    skipped). Then a benign throttle reason value and, at injection_offset, the
    HW slowdown throttle reason are injected, a 15 second SM Stress run is
    executed, and its response must contain a throttling failure.
    """
    def new_stress_diag(duration):
        # Throttle mask 0 is set explicitly because this test relies on the
        # inserted throttling errors causing a failure.
        diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId],
                                 testNamesStr="SM Stress",
                                 paramsStr="sm stress.test_duration=%d" % duration,
                                 version=dcgm_structs.dcgmRunDiag_version)
        diag.SetThrottleMask(0)
        diag.UseFakeGpus()
        return diag

    # Health check: skip unless the GPU passes a short SM Stress run.
    health_diag = new_stress_diag(2)
    health_response = test_utils.diag_execute_wrapper(health_diag, handle)
    healthy = check_diag_result_pass(health_response, gpuId,
                                     dcgm_structs.DCGM_SM_STRESS_INDEX)
    if not healthy:
        test_utils.skip_test(
            "Skipping because GPU %s does not pass SM Perf test. "
            "Please verify whether the GPU is supported and healthy." % gpuId)

    # The diag that is expected to fail because of the injected errors.
    stress_diag = new_stress_diag(15)

    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    poll_interval = 0.1

    logger.info("Injecting benign errors")
    inject_value(handle, gpuId, throttle_field, 3, 1, True)
    # The benign value has to be visible before the diag is started.
    benign_visible = dcgm_internal_helpers.verify_field_value(
        gpuId, throttle_field, 3,
        checkInterval=poll_interval, maxWait=5, numMatches=1)
    assert benign_visible, "Expected inserted values to be visible in DCGM"

    logger.info("Injecting actual errors")
    inject_value(handle, gpuId, throttle_field, hw_slowdown, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)

    logger.info("Started diag")
    response = test_utils.diag_execute_wrapper(stress_diag, handle)
    # After the diag, the HW slowdown value must be visible; wait up to 8 seconds.
    error_visible = dcgm_internal_helpers.verify_field_value(
        gpuId, throttle_field, hw_slowdown,
        checkInterval=poll_interval, numMatches=1, maxWait=8)
    assert error_visible, "Expected inserted errors to be visible in DCGM"

    throttled, errMsg = find_throttle_failure(
        response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    assert throttled, "Expected to find throttling failure, but did not: (%s)" % errMsg
Пример #11
0
def verify_early_fail_checks_for_test(handle, gpuId, test_name, testIndex):
    """
    Helper method for verifying the fail early checks for the specified test.

    The helper runs the diag named test_name on gpuId four times:
      1. A short run without fail early; the test is skipped if it does not pass.
      2. A run with fail early enabled and no injected errors; it must pass and
         must run for at least 90% of the requested duration.
      3. A run with fail early enabled while XID errors are being injected; it
         must finish before the join timeout expires and must fail.
      4. A run with fail early enabled after the XID value has been reset to 0;
         it must pass.

    Parameters:
        handle    -- DCGM handle used for running the diag and injecting values.
        gpuId     -- GPU the diag is run on and the errors are injected into.
        test_name -- name of the diag test; also used to build the parameter
                     string and the debug log file names.
        testIndex -- index of the test's result inside the per GPU response.

    When testIndex is DCGM_TARGETED_POWER_INDEX the helper skips unless developer
    mode is enabled, and uses a duration of 30 for every run.
    """
    if testIndex == dcgm_structs.DCGM_TARGETED_POWER_INDEX and not option_parser.options.developer_mode:
        # Skip this test since Targeted Power always fails when duration is less than 30 seconds
        test_utils.skip_test("Skipping fail early verification for Targeted Power test. Use developer mode "
                             "to run this test.")
    duration = 2 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)

    # One-element list used as an out-parameter for the diag thread's response.
    # NOTE(review): data[0] is never cleared between runs, so if a later diag
    # thread raises before assigning, the previous run's response is still here.
    data = [None]
    def runDiag(dd, data): # Simple helper method to run a diag (used as thread target)
        data[0] = test_utils.diag_execute_wrapper(dd, handle)

    ###
    # First verify that the given test passes for the gpu.
    # If it doesn't pass, skip test and add note to check GPU health
    logger.info("Checking whether %s test passes on GPU %s" % (test_name, gpuId))
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    test_name_no_spaces = test_name.replace(" ", "_")
    # Template for the debug log path; the '%s' is filled with the run number (1-4).
    logname = '/tmp/nv_' + test_name_no_spaces + '%s.log'
    dd.SetDebugLogFile(logname % 1)
    dd.SetDebugLevel(5)
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId, testIndex):
        test_utils.skip_test("Skipping because GPU %s does not pass %s test. "
                             "Please verify whether the GPU is healthy." % (gpuId, test_name))

    ###
    # Next, verify that the given test passes for the gpu when fail early checks are enabled and no errors are inserted
    logger.info("Checking whether %s test passes on GPU %s with fail early enabled" % (test_name, gpuId))
    duration = 15 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                 # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 2)
    dd.SetDebugLevel(5)

    # The diag runs on its own thread so this thread can check the nvvs process
    # and time the run.
    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    result_thread.start()

    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output

    # The measured time starts only after nvvs was seen running, so it excludes
    # the diag's startup time.
    start = time.time()
    result_thread.join()
    end = time.time()

    assert check_diag_result_pass(data[0], gpuId, testIndex), \
        "Expected %s test to pass with fail early enabled and no inserted errors" % test_name
    assert (end - start) >= duration * 0.9, \
        "Expected %s test to run for at least %ss, but it only ran for %ss." % (test_name, duration, end - start)

    ###
    # Verify fail early behavior by inserting an error.
    # Setup test parameters
    duration = 20 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                 # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    response = None
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 3)

    # Setup threads / processes
    xid_inject_val = 2
    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    inject_error = dcgm_internal_helpers.InjectionThread(handle, gpuId,
        dcgm_fields.DCGM_FI_DEV_XID_ERRORS, xid_inject_val, offset=5)

    logger.info("Verifying fail early behavior for %s test by inserting XIDs." % test_name)
    # Start inserting errors
    inject_error.start()
    # Ensure that inserted errors are visible
    assert \
        dcgm_internal_helpers.verify_field_value(gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
                                                 xid_inject_val, checkInterval=0.1, numMatches=5), \
        "Expected inserted value for XIDs to be visible in DCGM"

    # Start test thread
    result_thread.start()
    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output
    start = time.time()
    
    # Give the test time to exit and verify that the test exits early
    # Test should exit within 75% of test duration if it is going to fail early. Ideally, it should exit within
    # 2 failure checks (~ 4 seconds of test start), but we provide bigger buffer to account for delays in starting
    # the test
    # NOTE(review): the timeout below is a fixed 20 seconds, which equals the full
    # duration for non Targeted Power tests rather than 75% of it -- confirm intent.
    result_thread.join(20)
    test_exited_early = not result_thread.is_alive() # Cache thread isAlive value until we verify it
    end = time.time()

    # Stop the injection app
    inject_error.Stop()
    inject_error.join()
    # Verify injection app stopped correctly
    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK, \
        "There was an error inserting values into dcgm. Return code: %s" % inject_error.retCode

    if not test_exited_early:
        # Wait for the launched diag to end
        # (so that the response is available for the failure message below)
        result_thread.join()
        end = time.time()
    
    response = data[0]
    # Check whether test exited early
    assert test_exited_early, \
        "Expected %s test to exit early. Test took %ss to complete.\nGot result: %s (\ninfo: %s,\n warning: %s)" \
            % (test_name, (end - start),
               response.perGpuResponses[gpuId].results[testIndex].result,
               response.perGpuResponses[gpuId].results[testIndex].info,
               response.perGpuResponses[gpuId].results[testIndex].error.msg)

    # Verify the test failed
    # NOTE(review): the message mentions dbes, but this helper injects XID errors.
    assert check_diag_result_fail(response, gpuId, testIndex), \
        "Expected %s test to fail due to injected dbes.\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
            (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
             response.perGpuResponses[gpuId].results[testIndex].info,
             response.perGpuResponses[gpuId].results[testIndex].error.msg)

    ###
    # Rerun the test to verify that the test passes now that there are no inserted errors
    duration = 30
    paramsStr = "%s.test_duration=%s" % (test_name, duration)

    logger.info("Verifying that test passes once xid errors are removed.")
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=3) # enable fail early checks
    dd.SetDebugLogFile(logname % 4)
    # Reset the injected XID error value to 0
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS, 0, 0)
    # Sleep to ensure no pending errors left
    time.sleep(10)

    response = test_utils.diag_execute_wrapper(dd, handle)
    # Verify the test passed
    # NOTE(review): the message mentions dbes, but the value reset above is the XID field.
    assert check_diag_result_pass(response, gpuId, testIndex), \
        "Expected %s test to pass because there are no dbes\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
            (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
             response.perGpuResponses[gpuId].results[testIndex].info,
             response.perGpuResponses[gpuId].results[testIndex].error.msg)
Пример #12
0
def helper_per_gpu_responses_dcgmi(handle, gpuIds, testName, testParams):
    """
    Check that dcgmi reports diag results per GPU, in both its console and its JSON output.

    A throttling error is injected for the first GPU of gpuIds, then
    "dcgmi diag -r testName -p testParams -f <gpuIds>" is run with a throttle mask
    of 0, once with plain output and once with "-j". Both runs must exit with the
    expected failure return code; the console output must show a failure for the
    first GPU followed by a warning line, and the JSON output must show "Fail"
    for the first GPU and "Pass" for the next result entry.
    """
    def get_stdout(app):
        # Every stdout line is followed by one space, including the last one.
        return "".join(line + " " for line in app.stdout_lines)

    def print_output(app):
        logger.info(get_stdout(app))
        for err_line in app.stderr_lines:
            logger.error(err_line)

    def verify_successful_dcgmi_run(app):
        app.start(timeout=40)

        logger.info("Started dcgmi diag with pid %s" % app.getpid())
        retcode = app.wait()

        if test_utils.is_mig_incompatible_failure(get_stdout(app)):
            app.validate()
            test_utils.skip_test(
                "Skipping this test because MIG is configured incompatibly (preventing access to the whole GPU)"
            )

        # The diag is expected to fail here, so dcgmi should exit with
        # DCGM_ST_NVVS_ISOLATE_ERROR converted to an unsigned 8-bit value.
        expected_retcode = c_uint8(
            dcgm_structs.DCGM_ST_NVVS_ISOLATE_ERROR).value
        if retcode != expected_retcode and (app.stderr_lines or app.stdout_lines):
            logger.info("dcgmi output:")
            print_output(app)
        assert retcode == expected_retcode, \
            "Expected dcgmi diag to have retcode %s. Got return code %s" % (expected_retcode, retcode)
        app.validate()  # non-zero exit code must be validated

    # Injection parameters shared by both dcgmi runs.
    poll_interval = 0.1
    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN

    def inject_throttle_error():
        # Inject the throttle reason (and a temperature of 1000) at
        # injection_offset, then make sure it is visible before a diag starts.
        inject_value(handle, gpuIds[0], throttle_field, hw_slowdown, injection_offset,
                     True)
        inject_value(handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                     injection_offset, True)
        assert dcgm_internal_helpers.verify_field_value(gpuIds[0], throttle_field, hw_slowdown, checkInterval=poll_interval, maxWait=5, numMatches=1), \
            "Expected inserted values to be visible in DCGM"

    def json_shows_per_gpu_results(report):
        # Checks are made in order and stop at the first mismatch, so later
        # lookups only happen once the earlier structure has been confirmed.
        categories = report.get("DCGM GPU Diagnostic", {}).get("test_categories", [])
        if len(categories) != 2:
            return False
        stress_category = categories[1]
        if stress_category.get("category", None) != "Stress":
            return False
        first_test = stress_category["tests"][0]
        if first_test["name"] != testName:
            return False
        results = first_test["results"]
        if len(results) < 2:
            return False
        if results[0]["gpu_ids"] != str(gpuIds[0]):
            return False
        return results[0]["status"] == "Fail" and results[1]["status"] == "Pass"

    inject_throttle_error()

    # Console output run
    gpuList = ",".join(str(gpu_id) for gpu_id in gpuIds)
    args = [
        "diag", "-r", testName, "-p", testParams, "-f", gpuList,
        "--throttle-mask", "0"
    ]
    dcgmiApp = DcgmiApp(args=args)

    logger.info("Verifying stdout output")
    verify_successful_dcgmi_run(dcgmiApp)

    # Crude check of the console layout: a line containing "Stress", later a
    # line containing the failing GPU text, and directly after that a line
    # containing "Warning". One shared iterator is used so that each search
    # resumes on the line after the previous match.
    fail_gpu_text = "Fail - GPU: %s" % gpuIds[0]
    remaining_lines = iter(dcgmiApp.stdout_lines)
    stress_header_found = any("Stress" in line for line in remaining_lines)
    fail_gpu_found = stress_header_found and any(
        fail_gpu_text in line for line in remaining_lines)
    warning_found = fail_gpu_found and "Warning" in next(remaining_lines, "")

    if not (stress_header_found and fail_gpu_found and warning_found):
        logger.info("dcgmi output:")
        print_output(dcgmiApp)

    assert stress_header_found, "Expected to see 'Stress' header in output"
    assert fail_gpu_found, "Expected to see %s in output" % fail_gpu_text
    assert warning_found, "Expected to see 'Warning' in output after GPU failure text"

    # Inject again before the second run.
    inject_throttle_error()

    # JSON output run
    logger.info("Verifying JSON output")
    args.append("-j")
    dcgmiApp = DcgmiApp(args=args)
    verify_successful_dcgmi_run(dcgmiApp)

    logger.info("Stopped error injection")

    # Per GPU results in the JSON document
    report = json.loads("\n".join(dcgmiApp.stdout_lines))
    verified = json_shows_per_gpu_results(report)

    if not verified:
        print_output(dcgmiApp)

    assert verified, "dcgmi JSON output did not pass verification"