def test_sdk_example_script_smoke_embedded_manual():
    """
    Smoke test: run the dcgm_example.py SDK sample against an embedded
    hostengine in manual operation mode.

    The sample is launched with the current interpreter, and its PYTHONPATH
    is built from this process's sys.path.
    """
    example_script = os.path.join(sdk_sample_scripts_path, 'dcgm_example.py')
    script_args = [example_script, '--opmode=manual', '--type=embedded']
    child_env = {'PYTHONPATH': ':'.join(sys.path)}

    runner = AppRunner(sys.executable, script_args, env=child_env)
    runner.run(timeout=SAMPLE_SCRIPT_TIMEOUT)
def verify_exit_code_on_signal(signum):
    """
    Launch a dcgmi "SM Stress" diagnostic, send it `signum`, and assert that
    dcgmi exits with a non-zero return code and that the nvvs process is no
    longer running afterwards.

    signum is passed unchanged to diagApp.signal().

    NOTE(review): gpuId, handle and dcgmi_path are not defined in this block;
    presumably they come from an enclosing scope or module globals - confirm
    against the surrounding code.
    """
    # Ensure that host engine is ready to launch a new diagnostic
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr='1')
    success = False
    start = time.time()
    # Retry until the diag executes without DCGM_ST_DIAG_ALREADY_RUNNING or the 3 second budget is
    # spent. Nothing below checks `success`, so if the budget runs out execution simply continues.
    while not success and (time.time() - start) <= 3:
        try:
            # The response itself is never inspected; only whether the call raises matters here
            response = test_utils.diag_execute_wrapper(dd, handle)
            success = True
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_DIAG_ALREADY_RUNNING):
            # Only acceptable error due to small race condition between the nvvs process exiting and
            # hostengine actually processing the exit. We try for a maximum of 3 seconds since this
            # should be rare and last only for a short amount of time
            time.sleep(1.5)

    diagApp = AppRunner(dcgmi_path, args=[
        "diag", "-r", "SM Stress", "-i", "%s" % gpuId, "-d", "INFO", "--debugLogFile", "/tmp/nvvs.log"
    ])
    # Start the diag
    diagApp.start(timeout=40)
    logger.info("Launched dcgmi process with pid: %s" % diagApp.getpid())

    # Ensure diag is running before sending interrupt signal
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(
        want_running=True, attempts=50)
    assert running, "The nvvs process did not start within 25 seconds: %s" % (
        debug_output)
    # There is a small race condition here - it is possible that the hostengine sends a SIGTERM before the
    # nvvs process has setup a signal handler, and so the nvvs process does not stop when SIGTERM is sent.
    # We sleep for 1 second to reduce the possibility of this scenario
    time.sleep(1)
    diagApp.signal(signum)
    retCode = diagApp.wait()
    # Check the return code and stdout/stderr output before asserting for better debugging info
    if retCode == 0:
        logger.error("Got retcode '%s' from launched diag."
                     % retCode)
        if diagApp.stderr_lines or diagApp.stdout_lines:
            logger.info("dcgmi output:")
            for line in diagApp.stdout_lines:
                logger.info(line)
            for line in diagApp.stderr_lines:
                logger.error(line)
    assert retCode != 0, "Expected a non-zero exit code, but got 0"
    # Since the app returns a non zero exit code, we call the validate method to prevent false
    # failures from the test framework
    diagApp.validate()
    # Wait for the launched nvvs process to terminate. This uses attempts=50, the same value as the
    # start-up check above; the assertion message below describes that window as 25 seconds.
    not_running, debug_output = dcgm_internal_helpers.check_nvvs_process(
        want_running=False, attempts=50)
    assert not_running, "The launched nvvs process did not terminate within 25 seconds. pgrep output:\n%s" \
        % debug_output
def createBlacklistApp(numGpus=None, numSwitches=None, testNames=None, instantaneous=False):
    """
    Build an AppRunner that runs the standalone blacklist script with the
    current Python interpreter.

    The runner is only constructed here; it is returned without being started.

    Arguments:
        numGpus, numSwitches: when either one is None the script is given "-d";
            otherwise it is given "-g <numGpus> -s <numSwitches>".
        testNames: value passed after "-r". A falsy value (None, empty string)
            falls back to "memory bandwidth". Ignored when instantaneous is set.
        instantaneous: when truthy, "-i" is passed instead of any "-r" option.

    Returns:
        AppRunner(sys.executable, args) for the assembled argument list.
    """
    args = ["./%s" % STANDALONE_BLACKLIST_SCRIPT_NAME]

    # Identity check: None is a singleton, and `==` could be fooled by an
    # argument type that overrides __eq__.
    if numGpus is None or numSwitches is None:
        args.append("-d")
    else:
        args.extend(["-g", str(numGpus), "-s", str(numSwitches)])

    if instantaneous:
        args.append("-i")
    else:
        # Same fallback as before: any falsy testNames selects the default test
        args.extend(["-r", testNames or "memory bandwidth"])

    return AppRunner(sys.executable, args)