def helper_check_software_page_retirements_fail_on_pending_retirements(handle, gpuId):
    """
    Verify that the software page-retirement check fails while page retirements
    are pending, and passes again once the pending count is cleared.
    """
    pendingFieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING
    swTestId = dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT

    diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId])
    diag.UseFakeGpus()

    # Baseline run: skip the whole test if this GPU cannot pass cleanly.
    result = test_utils.diag_execute_wrapper(diag, handle)
    if not check_software_result_pass(result, swTestId):
        test_utils.skip_test("Skipping because GPU %s does not pass software page retirement test. "
                             "Please verify whether the GPU is healthy." % gpuId)

    # Fake a pending page retirement and expect the software test to fail.
    inject_value(handle, gpuId, pendingFieldId, 1, -30, True)
    result = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_fail(result, swTestId), \
        "Expected software test to fail due to pending page retirements in the GPU"

    # Clear the injected value and confirm the test passes once more.
    inject_value(handle, gpuId, pendingFieldId, 0, -30, True)
    result = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_pass(result, swTestId), \
        "Expected software test to pass"
def helper_check_software_page_retirements_fail_total_retirements(handle, gpuId): """ Ensure that the software test for page retirements fails when there are mroe than 60 page retirements. """ # First verify that the software test passes for the gpu. If it doesn't pass, skip test and add note to check GPU health dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId]) dd.UseFakeGpus() response = test_utils.diag_execute_wrapper(dd, handle) if not check_software_result_pass(response, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT): test_utils.skip_test("Skipping because GPU %s does not pass software page retirement test. " "Please verify whether the GPU is healthy." % gpuId) # Inject enough page retirements to cause failure inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, 33, -30, True) inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, 33, -30, True) response = test_utils.diag_execute_wrapper(dd, handle) assert check_software_result_fail(response, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT), \ "Expected software test to fail due to 60 total page retirements in the GPU" # Ensure 59 pages pass injected value inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, 25, -30, True) # Ensure diag passes now response = test_utils.diag_execute_wrapper(dd, handle) assert check_software_result_pass(response, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT), \ "Expected software test to pass since there are less than 60 total retired pages" # Reset retired pages count and verify pass inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, 0, -30, True) inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, 0, -30, True) # Ensure diag still passes response = test_utils.diag_execute_wrapper(dd, handle) assert check_software_result_pass(response, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT), \ "Expected software test to pass since there are no retired pages"
def helper_throttling_masking_failures(handle, gpuId):
    """
    Verify that an injected HW_SLOWDOWN throttle reason causes the SM Stress
    diag to report a throttling failure when the throttle mask is cleared (0).
    """
    #####
    # First check whether the GPU is healthy
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr="SM Stress", paramsStr="sm stress.test_duration=2",
                           version=dcgm_structs.dcgmRunDiag_version)
    dd.SetThrottleMask(0)  # We explicitly want to fail for throttle reasons since this test inserts throttling errors
                           # for verification
    dd.UseFakeGpus()
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX):
        test_utils.skip_test("Skipping because GPU %s does not pass SM Perf test. "
                             "Please verify whether the GPU is supported and healthy." % gpuId)

    #####
    # Now run a longer SM Stress pass with injected throttling errors.
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr="SM Stress", paramsStr="sm stress.test_duration=15",
                           version=dcgm_structs.dcgmRunDiag_version)
    dd.SetThrottleMask(0)
    dd.UseFakeGpus()

    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    insertedError = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    interval = 0.1  # polling interval (seconds) for verify_field_value below

    # Benign value (3) first, so we can confirm injection plumbing works before
    # inserting the real error.
    logger.info("Injecting benign errors")
    inject_value(handle, gpuId, fieldId, 3, 1, True)
    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, 3, checkInterval=interval, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    # Real throttle reason plus a high temperature, offset into the future by
    # injection_offset (module-level value — defined outside this view).
    logger.info("Injecting actual errors")
    inject_value(handle, gpuId, fieldId, insertedError, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)

    logger.info("Started diag")
    response = test_utils.diag_execute_wrapper(dd, handle)

    # Verify that the inserted values are visible in DCGM
    # Max wait of 8 is because of 5 second offset + 2 seconds required for 20 matches + 1 second buffer.
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, insertedError, checkInterval=0.1, numMatches=1, maxWait=8), \
        "Expected inserted errors to be visible in DCGM"

    throttled, errMsg = find_throttle_failure(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    assert throttled, "Expected to find throttling failure, but did not: (%s)" % errMsg
def perform_diag_with_throttle_mask_and_verify(dd, handle, gpuId, inserted_error, throttle_mask, shouldPass, failureMsg):
    """
    Inject a throttle reason (plus a high GPU temperature), run the supplied
    diag, and assert that a throttling failure is or is not reported according
    to shouldPass. A throttle_mask of None leaves the diag's mask untouched.
    """
    throttleFieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    pollInterval = 0.1

    if throttle_mask is not None:
        dd.SetThrottleMask(throttle_mask)

    inject_value(handle, gpuId, throttleFieldId, inserted_error, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)

    # The injected values must be observable in DCGM before the diag launches.
    visible = dcgm_internal_helpers.verify_field_value(
        gpuId, throttleFieldId, inserted_error, checkInterval=pollInterval, maxWait=5, numMatches=1)
    assert visible, "Expected inserted values to be visible in DCGM"

    # Start the diag
    result = test_utils.diag_execute_wrapper(dd, handle)

    # Check for pass or failure as per the shouldPass parameter
    throttled, errMsg = find_throttle_failure(result, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    if shouldPass:
        assert throttled == False, "Expected to not have a throttling error but found %s" % errMsg
    else:
        assert throttled == True, "Expected to find a throttling error but did not (%s)" % errMsg
def verify_exit_code_on_signal(signum):
    """
    Launch an SM Stress diag via the dcgmi binary, send it the given signal,
    and assert it exits with a non-zero code and that the nvvs process stops.

    NOTE(review): this references `gpuId`, `handle`, and `dcgmi_path` as free
    variables — it appears to be a copy of the closure nested inside
    helper_check_diag_stop_on_interrupt_signals; confirm where those names are
    bound when this top-level version is called.
    """
    # Ensure that host engine is ready to launch a new diagnostic
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr='1')
    success = False
    start = time.time()
    while not success and (time.time() - start) <= 3:
        try:
            response = test_utils.diag_execute_wrapper(dd, handle)
            success = True
        except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_DIAG_ALREADY_RUNNING):
            # Only acceptable error due to small race condition between the nvvs process exiting and
            # hostengine actually processing the exit. We try for a maximum of 3 seconds since this
            # should be rare and last only for a short amount of time
            time.sleep(1.5)

    diagApp = AppRunner(dcgmi_path, args=["diag", "-r", "SM Stress", "-i", "%s" % gpuId,
                                          "-d", "INFO", "--debugLogFile", "/tmp/nvvs.log"])
    # Start the diag
    diagApp.start(timeout=40)
    logger.info("Launched dcgmi process with pid: %s" % diagApp.getpid())

    # Ensure diag is running before sending interrupt signal
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True, attempts=50)
    assert running, "The nvvs process did not start within 25 seconds: %s" % (debug_output)
    # There is a small race condition here - it is possible that the hostengine sends a SIGTERM before the
    # nvvs process has setup a signal handler, and so the nvvs process does not stop when SIGTERM is sent.
    # We sleep for 1 second to reduce the possibility of this scenario
    time.sleep(1)

    diagApp.signal(signum)
    retCode = diagApp.wait()
    # Check the return code and stdout/stderr output before asserting for better debugging info
    if retCode == 0:
        logger.error("Got retcode '%s' from launched diag." % retCode)
        if diagApp.stderr_lines or diagApp.stdout_lines:
            logger.info("dcgmi output:")
            for line in diagApp.stdout_lines:
                logger.info(line)
            for line in diagApp.stderr_lines:
                logger.error(line)
    assert retCode != 0, "Expected a non-zero exit code, but got 0"
    # Since the app returns a non zero exit code, we call the validate method to prevent false
    # failures from the test framework
    diagApp.validate()

    # Give the launched nvvs process 15 seconds to terminate.
    not_running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=False, attempts=50)
    assert not_running, "The launched nvvs process did not terminate within 25 seconds. pgrep output:\n%s" \
        % debug_output
def test_memtest_failures_standalone(handle, gpuIds):
    """
    Inject volatile double-bit ECC errors on the first GPU and verify that the
    memtest plugin does not report a passing result for it.
    """
    targetGpu = gpuIds[0]
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr="memtest", paramsStr="memtest.test_duration=10")

    inject_value(handle, targetGpu, dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 1000, injection_offset, True)

    result = test_utils.diag_execute_wrapper(diag, handle)
    memtestResult = result.perGpuResponses[targetGpu].results[dcgm_structs.DCGM_MEMTEST_INDEX].result
    assert memtestResult != dcgm_structs.DCGM_DIAG_RESULT_PASS, \
        "Should have a failure due to injected DBEs, but got passing result"
def helper_verify_log_file_creation(handle, gpuIds):
    """
    Run a targeted stress diag with a debug log file configured and verify the
    log file is created, unless the diag was skipped or hit a system error.
    """
    dd = helper_verify_diag_passing(handle, gpuIds, testNames="targeted stress",
                                    testIndex=dcgm_structs.DCGM_TARGETED_STRESS_INDEX,
                                    params="targeted stress.test_duration=10", useFakeGpus=True)
    logname = '/tmp/tmp_test_debug_log'
    dd.SetDebugLogFile(logname)
    dd.SetDebugLevel(5)
    response = test_utils.diag_execute_wrapper(dd, handle)

    # Only evaluate per-GPU results if the run did not hit a system-level error.
    if len(response.systemError.msg) == 0:
        skippedAll = True
        passedCount = 0
        errors = ""
        for gpuId in gpuIds:
            resultType = response.perGpuResponses[gpuId].results[dcgm_structs.DCGM_TARGETED_STRESS_INDEX].result
            if resultType not in [dcgm_structs.DCGM_DIAG_RESULT_SKIP, dcgm_structs.DCGM_DIAG_RESULT_NOT_RUN]:
                skippedAll = False
                if resultType == dcgm_structs.DCGM_DIAG_RESULT_PASS:
                    passedCount = passedCount + 1
                else:
                    # Collect per-GPU failure messages for the assertion text below.
                    warning = response.perGpuResponses[gpuId].results[dcgm_structs.DCGM_TARGETED_STRESS_INDEX].error.msg
                    if len(warning):
                        errors = "%s, GPU %d failed: %s" % (errors, gpuId, warning)
        if skippedAll == False:
            detailedMsg = "passed on %d of %d GPUs" % (passedCount, response.gpuCount)
            if len(errors):
                detailedMsg = "%s and had these errors: %s" % (detailedMsg, errors)
            logger.info(detailedMsg)
            # The actual check: the debug log file must exist after the run.
            assert os.path.isfile(logname), "Logfile '%s' was not created and %s" % (logname, detailedMsg)
        else:
            logger.info("The diagnostic was skipped, so we cannot run this test.")
    else:
        logger.info("The diagnostic had a problem when executing, so we cannot run this test.")
def helper_test_bad_statspath(handle, gpuIds):
    """
    Verify that the diag rejects an invalid stats path: first a nonexistent
    directory, then a path that points at a regular file.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr='diagnostic', paramsStr='diagnostic.test_duration=20')

    # Case 1: a directory that does not exist.
    diag.SetStatsPath('/fake/superfake/notreal/')
    sawError = False
    try:
        test_utils.diag_execute_wrapper(diag, handle)
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NVVS_ERROR) as e:
        sawError = True
        assert str(e).find('cannot access statspath') != -1, \
            "Should have received a statspath error but got %s" % str(e)
    assert sawError, "We must fail when attempting to access a fake dir"

    # Case 2: a path that is a file, not a directory.
    filename = '/tmp/not_a_file'
    if not os.path.isfile(filename):
        # create the file
        with open(filename, 'w') as f:
            f.write('lorem ipsum')

    sawError = False
    diag.SetStatsPath(filename)
    try:
        test_utils.diag_execute_wrapper(diag, handle)
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NVVS_ERROR) as e:
        sawError = True
        assert str(e).find('is not a directory') != -1, \
            "Should have received a statspath error but got %s" % str(e)
    assert sawError, "We must fail when attempting to set statspath to a file"

    # Remove the file to clean up after ourselves
    os.remove(filename)
def helper_test_diagnostic_config_usage(handle, gpuIds):
    """
    Verify that a custom config (max_sbe_errors: 1) is honored by the
    diagnostic plugin: injected SBEs above that limit must fail the test.
    """
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr="diagnostic", paramsStr="diagnostic.test_duration=10")
    # NOTE(review): the YAML indentation inside this string looks collapsed —
    # confirm the nesting under 'diagnostic:' matches the expected config schema.
    dd.SetConfigFileContents("%YAML 1.2\n\ncustom:\n- custom:\n diagnostic:\n max_sbe_errors: 1")
    inject_value(handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, 1000, injection_offset, True)
    response = test_utils.diag_execute_wrapper(dd, handle)
    assert response.perGpuResponses[gpuIds[0]].results[dcgm_structs.DCGM_DIAGNOSTIC_INDEX].result != dcgm_structs.DCGM_DIAG_RESULT_PASS, \
        "Should have a failure due to injected SBEs, but got passing result"
def helper_test_dcgm_short_diagnostic_run(handle, gpuIds):
    """
    Run a 15 second diagnostic and verify every GPU either passes or is
    skipped (skips are acceptable for GPUs that don't support the test).
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr="diagnostic", paramsStr="diagnostic.test_duration=15")
    result = test_utils.diag_execute_wrapper(diag, handle)

    for gpuId in gpuIds:
        gpuResult = result.perGpuResponses[gpuId].results[dcgm_structs.DCGM_DIAGNOSTIC_INDEX].result
        if gpuResult == dcgm_structs.DCGM_DIAG_RESULT_SKIP:
            logger.info("Got status DCGM_DIAG_RESULT_SKIP for gpuId %d. This is expected if this GPU does not support the Diagnostic test." % gpuId)
            continue
        assert gpuResult == dcgm_structs.DCGM_DIAG_RESULT_PASS, \
            "Should have passed the 15 second diagnostic for all GPUs"
def helper_verify_diag_passing(handle, gpuIds, testNames="SM Stress", testIndex=dcgm_structs.DCGM_SM_STRESS_INDEX,
                               params="sm stress.test_duration=15", version=dcgm_structs.dcgmRunDiag_version,
                               useFakeGpus=False):
    """
    Build a DcgmDiag for the given test and verify it passes on every GPU in
    gpuIds, skipping the calling test otherwise. Results are memoized in the
    module-level `checked_gpus` dict so repeat callers avoid re-running the
    diag for GPUs already verified. Returns the configured DcgmDiag object.
    """
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr=testNames, paramsStr=params, version=version)
    dd.SetThrottleMask(0)  # We explicitly want to fail for throttle reasons since this test inserts throttling errors
                           # for verification
    if useFakeGpus:
        dd.UseFakeGpus()

    # If we've already checked this GPU, then use the previous result
    runDiag = False
    for gpuId in gpuIds:
        if gpuId in checked_gpus:
            if checked_gpus[gpuId] == False:
                test_utils.skip_test("Skipping because GPU %s does not pass SM Perf test. "
                                     "Please verify whether the GPU is supported and healthy." % gpuId)
        else:
            # At least one GPU is unverified, so the diag must be run.
            runDiag = True

    if runDiag == False:
        return dd

    response = test_utils.diag_execute_wrapper(dd, handle)
    for gpuId in gpuIds:
        if not check_diag_result_pass(response, gpuId, testIndex):
            # Record the failure before skipping so later callers skip immediately.
            checked_gpus[gpuId] = False
            test_utils.skip_test("Skipping because GPU %s does not pass SM Perf test. "
                                 "Please verify whether the GPU is supported and healthy." % gpuId)
        else:
            checked_gpus[gpuId] = True

    return dd
def helper_per_gpu_responses_api(handle, gpuIds, testDir):
    """ Verify that pass/fail status for diagnostic tests are reported on a per GPU basis via dcgmActionValidate API call """
    failGpuId = gpuIds[0]
    # First verify all GPUs pass (also returns a configured diag we discard below).
    dd = helper_verify_diag_passing(handle, gpuIds, useFakeGpus=True)

    # Re-target the diag at only the GPU we intend to fail.
    dd = DcgmDiag.DcgmDiag(gpuIds=[failGpuId], testNamesStr="SM Stress", paramsStr="sm stress.test_duration=15",
                           version=dcgm_structs.dcgmRunDiag_version)
    dd.SetThrottleMask(0)  # We explicitly want to fail for throttle reasons since this test inserts throttling errors
                           # for verification
    dd.UseFakeGpus()
    dd.SetStatsPath(testDir)
    dd.SetStatsOnFail(1)

    # Setup injection app
    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    insertedError = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    interval = 0.1  # polling interval (seconds) for verify_field_value

    # Use an offset to make these errors start after the benign values
    inject_value(handle, failGpuId, fieldId, insertedError, injection_offset, True)
    inject_value(handle, failGpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)
    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(failGpuId, fieldId, insertedError, checkInterval=interval, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    response = test_utils.diag_execute_wrapper(dd, handle)
    logger.info("Started diag")

    # Verify that responses are reported on a per gpu basis. Ensure the first GPU failed, and all others passed
    for gpuId in gpuIds:
        throttled, errMsg = find_throttle_failure(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
        if gpuId == failGpuId:
            assert throttled, "Expected throttling error but found none (%s)" % errMsg
        else:
            assert not throttled, "Expected not to find a throttling error but found '%s'" % errMsg
def helper_check_diag_high_temp_fail(handle, gpuIds):
    """
    Inject a 120-degree GPU temperature while the diagnostic runs and verify
    the diagnostic fails with a temperature-violation error on that GPU.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr='diagnostic', paramsStr='diagnostic.test_duration=10')

    # kick off a thread to inject the failing value while I run the diag
    injector = threading.Thread(target=injection_wrapper,
                                args=[handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 120, True])
    injector.start()
    result = test_utils.diag_execute_wrapper(diag, handle)
    injector.join()

    expectedGpuCount = len(gpuIds)
    assert result.gpuCount == expectedGpuCount, \
        "Expected %d gpus, but found %d reported" % (expectedGpuCount, result.gpuCount)
    diag_result_assert_fail(result, gpuIds[0], dcgm_structs.DCGM_DIAGNOSTIC_INDEX,
                            "Expected a failure due to 120 degree inserted temp.",
                            dcgm_errors.DCGM_FR_TEMP_VIOLATION)
def helper_check_diag_thermal_violation(handle, gpuIds):
    """
    Inject a thermal-violation counter value while the diagnostic runs and
    verify no "Thermal violations" error is reported for any GPU.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr='diagnostic', paramsStr='diagnostic.test_duration=10')

    # kick off a thread to inject the failing value while I run the diag
    injector = threading.Thread(target=injection_wrapper,
                                args=[handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
                                      9223372036854775792, True])
    injector.start()
    result = test_utils.diag_execute_wrapper(diag, handle)
    injector.join()

    expectedGpuCount = len(gpuIds)
    assert result.gpuCount == expectedGpuCount, \
        "Expected %d gpus, but found %d reported" % (expectedGpuCount, result.gpuCount)

    for gpuIndex in range(result.gpuCount):
        diag_assert_error_not_found(result, gpuIndex, dcgm_structs.DCGM_DIAGNOSTIC_INDEX, "Thermal violations")
def runDiag(dd, data):
    """
    Run the given diag and store the response in data[0].

    NOTE(review): `handle` is a free variable here (not a parameter) — this
    looks like a copy of the closure nested in verify_early_fail_checks_for_test;
    confirm where `handle` is bound when this top-level version is used.
    """
    # Simple helper method to run a diag (used as thread target)
    data[0] = test_utils.diag_execute_wrapper(dd, handle)
def verify_early_fail_checks_for_test(handle, gpuId, test_name, testIndex):
    """
    Helper method for verifying the fail early checks for the specified test.

    Three phases: (1) baseline run to confirm the GPU passes the test at all,
    (2) a run with fail-early enabled and no injected errors, which must pass
    and run for (roughly) its full duration, (3) a run with injected XID errors,
    which must exit early and fail; finally the XIDs are cleared and the test
    must pass again.
    """
    if testIndex == dcgm_structs.DCGM_TARGETED_POWER_INDEX and not option_parser.options.developer_mode:
        # Skip this test since Targeted Power always fails when duration is less than 30 seconds
        test_utils.skip_test("Skipping fail early verification for Targeted Power test. Use developer mode "
                             "to run this test.")

    duration = 2 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30  # Prevent false failures due to min
                                                                                 # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)

    # data[0] receives the diag response from the runDiag thread target below.
    data = [None]

    def runDiag(dd, data):
        # Simple helper method to run a diag (used as thread target)
        data[0] = test_utils.diag_execute_wrapper(dd, handle)

    ###
    # First verify that the given test passes for the gpu.
    # If it doesn't pass, skip test and add note to check GPU health
    logger.info("Checking whether %s test passes on GPU %s" % (test_name, gpuId))
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    test_name_no_spaces = test_name.replace(" ", "_")
    # '%s' placeholder is filled with a run number (1-4) for each phase's log file.
    logname = '/tmp/nv_' + test_name_no_spaces + '%s.log'
    dd.SetDebugLogFile(logname % 1)
    dd.SetDebugLevel(5)
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId, testIndex):
        test_utils.skip_test("Skipping because GPU %s does not pass %s test. "
                             "Please verify whether the GPU is healthy." % (gpuId, test_name))

    ###
    # Next, verify that the given test passes for the gpu when fail early checks are enabled and no errors are inserted
    logger.info("Checking whether %s test passes on GPU %s with fail early enabled" % (test_name, gpuId))
    duration = 15 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30  # Prevent false failures due to min
                                                                                  # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2)  # enable fail early checks
    dd.SetDebugLogFile(logname % 2)
    dd.SetDebugLevel(5)
    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    result_thread.start()
    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output
    start = time.time()
    result_thread.join()
    end = time.time()
    assert check_diag_result_pass(data[0], gpuId, testIndex), \
        "Expected %s test to pass with fail early enabled and no inserted errors" % test_name
    # A clean run must not exit early: require at least 90% of the configured duration.
    assert (end - start) >= duration * 0.9, \
        "Expected %s test to run for at least %ss, but it only ran for %ss." % (test_name, duration, end - start)

    ###
    # Verify fail early behavior by inserting an error.
    # Setup test parameters
    duration = 20 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30  # Prevent false failures due to min
                                                                                  # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    response = None
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2)  # enable fail early checks
    dd.SetDebugLogFile(logname % 3)
    # Setup threads / processes
    xid_inject_val = 2
    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    inject_error = dcgm_internal_helpers.InjectionThread(handle, gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
                                                         xid_inject_val, offset=5)
    logger.info("Verifying fail early behavior for %s test by inserting XIDs." % test_name)
    # Start inserting errors
    inject_error.start()
    # Ensure that inserted errors are visible
    assert \
        dcgm_internal_helpers.verify_field_value(gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS, xid_inject_val,
                                                 checkInterval=0.1, numMatches=5), \
        "Expected inserted value for XIDs to be visible in DCGM"
    # Start test thread
    result_thread.start()
    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output
    start = time.time()
    # Give the test time to exit and verify that the test exits early
    # Test should exit within 75% of test duration if it is going to fail early. Ideally, it should exit within
    # 2 failure checks (~ 4 seconds of test start), but we provide bigger buffer to account for delays in starting
    # the test
    result_thread.join(20)
    test_exited_early = not result_thread.is_alive()  # Cache thread isAlive value until we verify it
    end = time.time()
    # Stop the injection app
    inject_error.Stop()
    inject_error.join()
    # Verify injection app stopped correctly
    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK, \
        "There was an error inserting values into dcgm. Return code: %s" % inject_error.retCode
    if not test_exited_early:
        # Wait for the launched diag to end
        result_thread.join()
        end = time.time()
    response = data[0]
    # Check whether test exited early
    assert test_exited_early, \
        "Expected %s test to exit early. Test took %ss to complete.\nGot result: %s (\ninfo: %s,\n warning: %s)" \
        % (test_name, (end - start),
           response.perGpuResponses[gpuId].results[testIndex].result,
           response.perGpuResponses[gpuId].results[testIndex].info,
           response.perGpuResponses[gpuId].results[testIndex].error.msg)
    # Verify the test failed
    assert check_diag_result_fail(response, gpuId, testIndex), \
        "Expected %s test to fail due to injected dbes.\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
        (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
         response.perGpuResponses[gpuId].results[testIndex].info,
         response.perGpuResponses[gpuId].results[testIndex].error.msg)

    ###
    # Rerun the test to verify that the test passes now that there are no inserted errors
    duration = 30
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    logger.info("Verifying that test passes once xid errors are removed.")
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=3)  # enable fail early checks
    dd.SetDebugLogFile(logname % 4)
    # Reset dbes error
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS, 0, 0)
    # Sleep to ensure no pending errors left
    time.sleep(10)
    response = test_utils.diag_execute_wrapper(dd, handle)
    # Verify the test passed
    assert check_diag_result_pass(response, gpuId, testIndex), \
        "Expected %s test to pass because there are no dbes\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
        (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
         response.perGpuResponses[gpuId].results[testIndex].info,
         response.perGpuResponses[gpuId].results[testIndex].error.msg)
def test_nvvs_plugin_software_inforom_embedded(handle, gpuIds):
    """
    Run the short diag and verify the inforom software test result is either
    PASS or SKIP.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr="short")
    result = test_utils.diag_execute_wrapper(diag, handle)
    acceptable = (dcgm_structs.DCGM_DIAG_RESULT_PASS, dcgm_structs.DCGM_DIAG_RESULT_SKIP)
    for _ in gpuIds:
        inforomResult = result.levelOneResults[dcgm_structs.DCGM_SWTEST_INFOROM].result
        assert inforomResult in acceptable
def helper_test_stats_file_basics(handle, gpuIds, statsAsString, pluginName, pluginIndex, statName=None): dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr=pluginName, paramsStr='%s.test_duration=20' % pluginName) # was 20 dd.SetStatsPath('/tmp/') # Make sure a stats file was created statsfile = '/tmp/stats_%s.json' % (pluginName.replace(' ', '_')) if statsAsString == True: dd.SetConfigFileContents( "%YAML 1.2\n\nglobals:\n logfile_type: text\n") response = test_utils.diag_execute_wrapper(dd, handle) skippedAll = True try: if len(response.systemError.msg) == 0: passedCount = 0 errors = "" for gpuIndex in range(response.gpuCount): resultType = response.perGpuResponses[gpuIndex].results[ pluginIndex].result if resultType != dcgm_structs.DCGM_DIAG_RESULT_SKIP \ and resultType != dcgm_structs.DCGM_DIAG_RESULT_NOT_RUN: skippedAll = False if resultType == dcgm_structs.DCGM_DIAG_RESULT_PASS: passedCount = passedCount + 1 else: warning = response.perGpuResponses[gpuIndex].results[ pluginIndex].error.msg if len(warning): errors = "%s GPU %d failed: %s" % ( errors, gpuIndex, warning) if skippedAll == False and passedCount > 0: detailedMsg = "passed on %d of %d GPUs" % (passedCount, response.gpuCount) if len(errors): detailedMsg = "%s and had these errors: %s" % (detailedMsg, errors) logger.info("%s when running the %s plugin" % (detailedMsg, pluginName)) assert os.path.isfile( statsfile ), "Statsfile '%s' was not created as expected and %s" % ( statsfile, detailedMsg) if not statsAsString: helper_basic_stats_file_check(statsfile, gpuIds, statName) elif passedCount == 0: test_utils.skip_test( "Unable to pass any of these short runs for plugin %s." % pluginName) else: test_utils.skip_test( "The %s plugin was skipped, so we cannot run this test." % pluginName) else: test_utils.skip_test( "The %s plugin had a problem when executing, so we cannot run this test." % pluginName) finally: if os.path.exists(statsfile): os.remove(statsfile)
def run(dd, response):
    """
    Run the given diag (used as a thread target).

    NOTE(review): rebinding the `response` parameter does not propagate the
    result back to the caller — unlike runDiag, which writes into data[0].
    Confirm whether the caller actually needs the response; if so this should
    use a mutable container. Also note `handle` is a free variable here.
    """
    response = test_utils.diag_execute_wrapper(dd, handle)
def helper_check_diag_stop_on_interrupt_signals(handle, gpuId):
    """
    Verifies that a launched diag is stopped when the dcgmi executable recieves a SIGINT, SIGHUP, SIGQUIT, or SIGTERM
    signal.
    """
    # First check whether the GPU is healthy/supported
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr="SM Stress", paramsStr="sm stress.test_duration=2",
                           version=dcgm_structs.dcgmRunDiag_version7)
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX):
        test_utils.skip_test("Skipping because GPU %s does not pass SM Stress test. "
                             "Please verify whether the GPU is supported and healthy." % gpuId)

    # paths to dcgmi executable
    paths = {
        "Linux_32bit": "./apps/x86/dcgmi",
        "Linux_64bit": "./apps/amd64/dcgmi",
        "Linux_ppc64le": "./apps/ppc64le/dcgmi",
        "Linux_aarch64": "./apps/aarch64/dcgmi"
    }
    # Verify test is running on a supported platform
    if utils.platform_identifier not in paths:
        test_utils.skip_test("Dcgmi is not supported on the current platform.")
    dcgmi_path = paths[utils.platform_identifier]

    def verify_exit_code_on_signal(signum):
        # Launch a diag via dcgmi, deliver `signum`, and assert that dcgmi
        # exits non-zero and the nvvs process terminates.
        # Ensure that host engine is ready to launch a new diagnostic
        dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr='1')
        success = False
        start = time.time()
        while not success and (time.time() - start) <= 3:
            try:
                response = test_utils.diag_execute_wrapper(dd, handle)
                success = True
            except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_DIAG_ALREADY_RUNNING):
                # Only acceptable error due to small race condition between the nvvs process exiting and
                # hostengine actually processing the exit. We try for a maximum of 3 seconds since this
                # should be rare and last only for a short amount of time
                time.sleep(1.5)

        diagApp = AppRunner(dcgmi_path, args=["diag", "-r", "SM Stress", "-i", "%s" % gpuId,
                                              "-d", "INFO", "--debugLogFile", "/tmp/nvvs.log"])
        # Start the diag
        diagApp.start(timeout=40)
        logger.info("Launched dcgmi process with pid: %s" % diagApp.getpid())

        # Ensure diag is running before sending interrupt signal
        running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True, attempts=50)
        assert running, "The nvvs process did not start within 25 seconds: %s" % (debug_output)
        # There is a small race condition here - it is possible that the hostengine sends a SIGTERM before the
        # nvvs process has setup a signal handler, and so the nvvs process does not stop when SIGTERM is sent.
        # We sleep for 1 second to reduce the possibility of this scenario
        time.sleep(1)

        diagApp.signal(signum)
        retCode = diagApp.wait()
        # Check the return code and stdout/stderr output before asserting for better debugging info
        if retCode == 0:
            logger.error("Got retcode '%s' from launched diag." % retCode)
            if diagApp.stderr_lines or diagApp.stdout_lines:
                logger.info("dcgmi output:")
                for line in diagApp.stdout_lines:
                    logger.info(line)
                for line in diagApp.stderr_lines:
                    logger.error(line)
        assert retCode != 0, "Expected a non-zero exit code, but got 0"
        # Since the app returns a non zero exit code, we call the validate method to prevent false
        # failures from the test framework
        diagApp.validate()

        # Give the launched nvvs process 15 seconds to terminate.
        not_running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=False, attempts=50)
        assert not_running, "The launched nvvs process did not terminate within 25 seconds. pgrep output:\n%s" \
            % debug_output

    # Verify return code on SIGINT
    # We simply verify the return code because explicitly checking whether the nvvs process has terminated is
    # clunky and error-prone
    logger.info("Testing stop on SIGINT")
    verify_exit_code_on_signal(signal.SIGINT)

    # Verify return code on SIGHUP
    logger.info("Testing stop on SIGHUP")
    verify_exit_code_on_signal(signal.SIGHUP)

    # Verify return code on SIGQUIT
    logger.info("Testing stop on SIGQUIT")
    verify_exit_code_on_signal(signal.SIGQUIT)

    # Verify return code on SIGTERM
    logger.info("Testing stop on SIGTERM")
    verify_exit_code_on_signal(signal.SIGTERM)