示例#1
0
def test_dcgmproftester_parallel_gpus(handle, gpuIds):
    '''
    Run dcgmproftester in validate mode on all of gpuIds at the same time.

    This checks two things:
    1. Profiling metrics can be read from GPUs other than GPU 0
    2. Profiling metrics can be read from several GPUs concurrently

    handle -- DCGM handle; it is wrapped in a pydcgm.DcgmHandle here
    gpuIds -- GPU IDs to run against; test_utils.skip_test is called when
              fewer than two are given
    '''
    if len(gpuIds) <= 1:
        test_utils.skip_test("Skipping multi-GPU test since there's only one of this SKU")

    #Keep the handle, system and group in named locals so they all live until the test returns
    wrappedHandle = pydcgm.DcgmHandle(handle=handle)
    system = wrappedHandle.GetSystem()
    gpuGroup = system.GetGroupWithGpuIds('mygroup', gpuIds)
    helper_check_profiling_environment(gpuGroup)

    #The driver version is queried from the first GPU only
    driverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    #Field 1001 is graphics activity. It works for every GPU that supports DCP and
    #stays reliable even under heavy concurrency
    graphicsActivityFieldId = "1001"

    profTesterArgs = [
        "--mode", "validate",
        "-d", "15.0",
        "-r", "1.0",
        "--sync-count", "5",
        "-w", "10",
        "-t", graphicsActivityFieldId,
    ]

    profTester = apps.DcgmProfTesterApp(cudaDriverMajorVersion=driverVersion[0], gpuIds=gpuIds, args=profTesterArgs)

    #Scale the upper bound with the GPU count to account for slow systems
    startTimeout = 120.0 * len(gpuIds)
    profTester.start(timeout=startTimeout)
    profTester.wait()

    #Validate right away so errors are printed when they occur instead of at the end of the test
    profTester.validate()
示例#2
0
def helper_test_dpt_field_id(handle, gpuIds, fieldId, extraArgs = None):
    '''
    Run dcgmproftester in validate mode for one profiling field and wait for it to exit.

    The intent is to check that a valid FV can be retrieved for a profiling
    field immediately after watching it.

    handle    -- DCGM handle; it is wrapped in a pydcgm.DcgmHandle here
    gpuIds    -- GPU IDs of this SKU; only gpuIds[0] is handed to dcgmproftester
    fieldId   -- profiling field ID to test; passed to "-t" as str(fieldId)
    extraArgs -- optional iterable of extra command line arguments appended
                 after the built-in ones
    '''
    #Keep the handle, system and group in named locals so they all live until the helper returns
    wrappedHandle = pydcgm.DcgmHandle(handle=handle)
    system = wrappedHandle.GetSystem()
    gpuGroup = system.GetGroupWithGpuIds('mygroup', gpuIds)
    helper_check_profiling_environment(gpuGroup)

    driverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    #The returned field IDs are not used below. The call itself is kept since it may
    #raise or skip when profiling is unavailable -- TODO confirm against the helper
    helper_get_supported_field_ids(gpuGroup)

    # Just test the first GPU of our SKU. Other tests will cover multiple SKUs
    firstGpuOnly = [gpuIds[0]]

    profTesterArgs = [
        "--target-max-value",
        "--no-dcgm-validation",
        "--dvs",
        "--reset",
        "--mode", "validate",
        "-d", "15.0",
        "-r", "1.0",
        "--sync-count", "5",
        "-w", "5",
        "-t", str(fieldId),
    ]
    if extraArgs is not None:
        profTesterArgs += extraArgs

    profTester = apps.DcgmProfTesterApp(cudaDriverMajorVersion=driverVersion[0], gpuIds=firstGpuOnly, args=profTesterArgs)

    #Account for slow systems but still add an upper bound. Note that the bound scales
    #with the full gpuIds list even though only one GPU is exercised
    startTimeout = 120.0 * len(gpuIds)
    profTester.start(timeout=startTimeout)
    profTester.wait()
示例#3
0
def helper_test_dpt_help(handle, gpuIds):
    '''
    Run dcgmproftester with only the --help argument and wait for it to exit.

    handle -- DCGM handle; it is wrapped in a pydcgm.DcgmHandle here
    gpuIds -- GPU IDs of this SKU; only gpuIds[0] is handed to dcgmproftester
    '''
    #Keep the handle, system and group in named locals so they all live until the helper returns
    wrappedHandle = pydcgm.DcgmHandle(handle=handle)
    system = wrappedHandle.GetSystem()
    gpuGroup = system.GetGroupWithGpuIds('mygroup', gpuIds)
    helper_check_profiling_environment(gpuGroup)

    driverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    #The returned field IDs are not used below. The call itself is kept since it may
    #raise or skip when profiling is unavailable -- TODO confirm against the helper
    helper_get_supported_field_ids(gpuGroup)

    #Just test the first GPU of our SKU. Other tests will cover multiple SKUs
    firstGpuOnly = [gpuIds[0]]

    profTester = apps.DcgmProfTesterApp(cudaDriverMajorVersion=driverVersion[0], gpuIds=firstGpuOnly, args=["--help"])

    #Account for slow systems but still add an upper bound. Note that the bound scales
    #with the full gpuIds list even though only one GPU is exercised
    startTimeout = 120.0 * len(gpuIds)
    profTester.start(timeout=startTimeout)
    profTester.wait()