def test_dcgmproftester_parallel_gpus(handle, gpuIds):
    '''
    Run dcgmproftester in validate mode against all of the GPUs in gpuIds at once.

    This covers two things:
    1. That profiling metrics work for GPUs other than GPU 0
    2. That profiling metrics work for several GPUs at the same time

    test_utils.skip_test() is called when fewer than two GPU IDs are passed in.
    '''
    gpuCount = len(gpuIds)
    if gpuCount < 2:
        test_utils.skip_test("Skipping multi-GPU test since there's only one of this SKU")

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(groupObj)

    driverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    #Graphics activity works for every GPU that supports DCP. It also works reliably even under heavy concurrency
    watchedFieldIds = "1001"
    dptArgs = [
        "--mode", "validate",
        "-d", "15.0",
        "-r", "1.0",
        "--sync-count", "5",
        "-w", "10",
        "-t", watchedFieldIds,
    ]

    #Scale the timeout with the GPU count to tolerate slow systems while still keeping an upper bound
    startTimeout = 120.0 * gpuCount

    dptApp = apps.DcgmProfTesterApp(cudaDriverMajorVersion=driverVersion[0],
                                    gpuIds=gpuIds,
                                    args=dptArgs)
    dptApp.start(timeout=startTimeout)
    dptApp.wait()

    #Validate right away so that errors are printed when they occur instead of at the end of the test
    dptApp.validate()
def helper_test_dpt_field_id(handle, gpuIds, fieldId, extraArgs = None):
    '''
    Run dcgmproftester in validate mode for a single profiling field ID.

    handle:    Handle that gets wrapped in a pydcgm.DcgmHandle
    gpuIds:    GPU IDs used to build the DCGM group. Only gpuIds[0] is handed to dcgmproftester
    fieldId:   Field ID passed to dcgmproftester through its -t option (converted with str())
    extraArgs: Optional additional command-line arguments, appended after the default ones
    '''
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(groupObj)

    driverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    #The returned list is not consulted below; the query itself is still performed
    helper_get_supported_field_ids(groupObj)

    #Only the first GPU of our SKU is exercised. Other tests will cover multiple SKUs
    firstGpuOnly = [gpuIds[0]]

    dptArgs = [
        "--target-max-value",
        "--no-dcgm-validation",
        "--dvs",
        "--reset",
        "--mode", "validate",
        "-d", "15.0",
        "-r", "1.0",
        "--sync-count", "5",
        "-w", "5",
        "-t", str(fieldId),
    ]
    if extraArgs is not None:
        dptArgs += extraArgs

    #The timeout is still scaled by the full GPU count to tolerate slow systems while keeping an upper bound
    startTimeout = 120.0 * len(gpuIds)

    dptApp = apps.DcgmProfTesterApp(cudaDriverMajorVersion=driverVersion[0],
                                    gpuIds=firstGpuOnly,
                                    args=dptArgs)
    dptApp.start(timeout=startTimeout)
    dptApp.wait()
def helper_test_dpt_help(handle, gpuIds):
    '''
    Run dcgmproftester with only the --help command-line argument.

    handle: Handle that gets wrapped in a pydcgm.DcgmHandle
    gpuIds: GPU IDs used to build the DCGM group. Only gpuIds[0] is handed to dcgmproftester
    '''
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(groupObj)

    driverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    #The returned list is not consulted below; the query itself is still performed
    helper_get_supported_field_ids(groupObj)

    #Only the first GPU of our SKU is exercised. Other tests will cover multiple SKUs
    firstGpuOnly = [gpuIds[0]]
    dptArgs = ["--help"]

    #The timeout is still scaled by the full GPU count to tolerate slow systems while keeping an upper bound
    startTimeout = 120.0 * len(gpuIds)

    dptApp = apps.DcgmProfTesterApp(cudaDriverMajorVersion=driverVersion[0],
                                    gpuIds=firstGpuOnly,
                                    args=dptArgs)
    dptApp.start(timeout=startTimeout)
    dptApp.wait()