Example #1
def _assert_valid_dcgmi_results(args, retValue, stdout_lines, stderr_lines):
    assert (len(stdout_lines) >
            0), 'No output detected for args "%s"' % ' '.join(args[1:])

    if _is_eris_diag_inforom_failure(args, stdout_lines):
        # If we see inforom corruption, the test should not fail
        test_utils.skip_test('Detected corrupt inforom for diag test')
        return

    if retValue != c_ubyte(dcgm_structs.DCGM_ST_OK).value:
        if retValue == c_ubyte(dcgm_structs.DCGM_ST_NVVS_ERROR).value:
            # DCGM_ST_NVVS_ERROR means NVVS ran but reported a failing result. In other words, the
            # arguments were valid and the diagnostic executed, so this return code is acceptable here.
            return
        logger.error(
            'Valid test - Function returned error code: %s . Args used: "%s"' %
            (retValue, ' '.join(args[1:])))
        logger.error('Stdout:')
        for line in stdout_lines:
            logger.error('\t' + line)
        logger.error('Stderr:')
        for line in stderr_lines:
            logger.error('\t' + line)
        assert False, "See errors above."

    errLines = _lines_with_errors(stdout_lines)
    assert len(
        errLines
    ) == 0, "Found errors in output.  Offending lines: \n%s" % '\n'.join(
        errLines)
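
The helper above depends on _lines_with_errors, which is defined elsewhere in the test module. A minimal sketch of what such a filter might look like (hypothetical; the real helper's keywords and parsing may differ):

def _lines_with_errors_sketch(stdout_lines):
    # Hypothetical sketch: collect output lines that look like errors so the
    # caller can assert that none are present.
    errorKeywords = ('error', 'fail')
    return [line for line in stdout_lines
            if any(keyword in line.lower() for keyword in errorKeywords)]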
Example #2
def test_dcgm_stub_library(handle):
    """ 
    Verifies that DCGM fails gracefully using the stub library
    if a proper DCGM installation is not present or shared 
    libraries are not included in the library search path
    """

    if utils.is_esx_hypervisor_system():
        test_utils.skip_test(
            "Compute Mode tests are not supported in VMware ESX Environments")

    if is_dcgm_package_installed():
        test_utils.skip_test(
            "A DCGM package is already installed on this machine")

    # Check whether libdcgm.so.2 can be found via LD_LIBRARY_PATH
    libdcgm_path = get_libdcgm_path()
    assert libdcgm_path is not None

    if libdcgm_path is not None:
        # Verify the stub library is present
        if not (os.path.isfile(libdcgm_path + "/libdcgm_stub.a")):
            test_utils.skip_test("Unable to find \"libdcgm_stub.a\" in %s" %
                                 libdcgm_path)
        else:
            dcgm_lib_original = libdcgm_path + "/libdcgm.so.2"
            dcgm_lib_modified = dcgm_lib_original + "_modified"
    else:
        # Tear down the environment by finding "libdcgm.so.2" via ldconfig and renaming it to "libdcgm.so.2_modified"
        # Gets the path to libdcgm.so.2, e.g. /usr/lib/libdcgm.so.2
        try:
            ldconfig_out = check_output(["ldconfig", "-p"]).decode("utf-8")
            dcgm_lib = [line for line in ldconfig_out.split("\n")
                        if "libdcgm.so.2" in line]
            dcgm_lib_original = dcgm_lib[0].split("=>")[-1].strip()
            dcgm_lib_modified = dcgm_lib_original + "_modified"
        except:
            test_utils.skip_test("Unable to find libdcgm.so.2 library")

    # Renaming the file
    try:
        os.rename(dcgm_lib_original, dcgm_lib_modified)
    except:
        test_utils.skip_test("Unable to rename libdcgm.so.2 library")

    try:
        stub_app = apps.DcgmStubRunnerApp()
        stub_app.start()
        pid = stub_app.getpid()
        stub_app.wait()
    finally:
        # Restore environment
        os.rename(dcgm_lib_modified, dcgm_lib_original)
        logger.info("stub_library_tet PID was %d" % pid)

    assert "!!!!!!!!" in stub_app.stdout_lines[
        1], "Failed to collect stub library output"
    assert "WARNING:" in stub_app.stdout_lines[
        2], "Failed to collect stub library output"
Example #3
def test_linting_create_python_path_env_var():
    '''
    A unit test for linting.py's _create_python_path_env_var function.
    This tests that given a number of python filepaths it returns a python path
    that only includes the topmost directories containing python files.
    '''

    if not option_parser.options.lint:
        test_utils.skip_test("not supported when the \"--no-lint\" option is used")

    filepaths = [
        '/a/file1.py',
        '/a/file2.py',
        
        '/a2/b/file3.py',
        '/a2/b/c/file4.py',
        '/a2/b/c/d/file5.py'
    ]
    
    expectedPyPath = ':'.join([
        '/a',
        '/a2/b',
    ])
    
    actualPyPath = linting._create_python_path_env_var(filepaths)
    
    assert(actualPyPath == expectedPyPath), (
        'actual python path:\n%s\n' % actualPyPath
        + 'not the same as expected:\n%s' % expectedPyPath)  
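
For reference, a rough sketch of the behavior the test expects, keeping only the topmost directories that contain Python files (an illustration only; the real linting._create_python_path_env_var may be implemented differently):

import os

def create_python_path_sketch(filepaths):
    # Hypothetical sketch: drop any directory nested inside another directory
    # that is already part of the path, then join with ':'.
    dirs = sorted({os.path.dirname(p) for p in filepaths})
    topmost = []
    for d in dirs:
        if not any(d.startswith(kept + os.sep) for kept in topmost):
            topmost.append(d)
    return ':'.join(topmost)

Applied to the filepaths above, this yields '/a:/a2/b', matching expectedPyPath.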
Example #4
def test_dcgm_embedded_metadata_exectime_get_all_fields_sane(handle):
    """
    Sanity test for API that gets execution time of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    
    # watch a ton of fields so that we know that some are being stored
    updateFreqUsec = 1000
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()
    
    execTime = system.introspect.execTime.GetForAllFields().aggregateInfo
    
    perGpuSane = 300*1000 # 300 ms
    activeGpuCount = test_utils.get_live_gpu_count(handle.handle)
    saneLimit = perGpuSane*activeGpuCount
    
    # test that all struct fields in the API response have reasonable values
    assert(100 < execTime.totalEverUpdateUsec < saneLimit), (
        'execution time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms' 
        % (activeGpuCount, execTime.totalEverUpdateUsec/1000, saneLimit/1000))
    
    assert(100 < execTime.recentUpdateUsec < saneLimit), (
        'recent update time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms' 
        % (activeGpuCount, execTime.recentUpdateUsec/1000, saneLimit/1000))
    
    assert(updateFreqUsec-1 <= execTime.meanUpdateFreqUsec <= updateFreqUsec+1), execTime.meanUpdateFreqUsec
Example #5
def test_dcgm_diag_handle_concurrency_standalone(handle, gpuIds):
    '''
    Test that we can use a DCGM handle concurrently with a diagnostic running
    '''
    diagDuration = 10

    gpuId = gpuIds[0]

    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId],
                           testNamesStr="SM Stress",
                           paramsStr="sm stress.test_duration=%d" %
                           diagDuration,
                           version=dcgm_structs.dcgmRunDiag_version)

    dd.UseFakeGpus()

    response = [None]

    def run(dd, response):
        # Store the result in the shared list so the main thread can read it
        response[0] = test_utils.diag_execute_wrapper(dd, handle)

    diagStartTime = time.time()
    threadObj = threading.Thread(target=run, args=[dd, response])
    threadObj.start()

    #Give threadObj a head start on its 10 second run
    time.sleep(1.0)

    firstReturnedRequestLatency = None
    numConcurrentCompleted = 0
    sleepDuration = 1.0

    while threadObj.is_alive():
        #Make another request on the handle concurrently
        moduleStatuses = dcgm_agent.dcgmModuleGetStatuses(handle)
        secondRequestLatency = time.time() - diagStartTime
        numConcurrentCompleted += 1

        if firstReturnedRequestLatency is None:
            firstReturnedRequestLatency = secondRequestLatency

        time.sleep(sleepDuration)

    diagThreadEndTime = time.time()
    diagDuration = diagThreadEndTime - diagStartTime

    if firstReturnedRequestLatency is None:
        test_utils.skip_test(
            "Diag returned instantly. It is probably not supported for gpuId %u"
            % gpuId)

    logger.info("Completed %d concurrent requests. Diag ran for %.1f seconds" %
                (numConcurrentCompleted, diagDuration))

    #We should have been able to complete a request every 2 seconds if we slept for 1 (conservatively)
    numShouldHaveCompleted = int((diagDuration / sleepDuration) / 2.0)
    assert numConcurrentCompleted >= numShouldHaveCompleted, "Expected at least %d concurrent tests completed. Got %d" % (
        numShouldHaveCompleted, numConcurrentCompleted)
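
The shared-list workaround above is one way to get the worker thread's result back; the same pattern could also be sketched with concurrent.futures from the standard library (an alternative illustration, not part of the DCGM test suite):

from concurrent.futures import ThreadPoolExecutor

# Hypothetical sketch: the Future carries the diag result back to the main
# thread while concurrent requests are issued on the same handle.
with ThreadPoolExecutor(max_workers=1) as executor:
    future = executor.submit(test_utils.diag_execute_wrapper, dd, handle)
    while not future.done():
        dcgm_agent.dcgmModuleGetStatuses(handle)  # concurrent request
        time.sleep(1.0)
    response = future.result()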
Example #6
def test_logging_modules():
    """
    Verifies that module logging is functional
    """

    PASSED = "passed"
    FAILED = "FAILED"
    SKIPPED = "SKIPPED"

    result = FAILED

    nvhost_engine = apps.NvHostEngineApp()
    nvhost_engine.start(timeout=10)
    contents = None

    # Try for 5 seconds
    for i in range(25):
        time.sleep(0.2)
        with closing(open(nvhost_engine.dcgm_trace_fname)) as f:
            # pylint: disable=no-member
            contents = f.read()
            logger.debug("Read %d bytes from %s" %
                         (len(contents), nvhost_engine.dcgm_trace_fname))

            # NVSwitch module is loaded on startup. So we check for records from that module
            test_string = "Initialized logging for module 1"

            # Note that if --eris is passed, we only log at WARNING level
            # If logging is not at DEBUG level, then skip the test
            if test_utils.loggingLevel != 'DEBUG':
                # Skipping in a roundabout way to ensure we terminate the processes we launch
                result = SKIPPED
                break
            if test_string in contents:
                result = PASSED
                break

    # Cleaning up
    nvhost_engine.terminate()
    nvhost_engine.validate()

    if (result == SKIPPED):
        test_utils.skip_test(
            "Detected logLevel = WARN. This test requires DEBUG. Likely cause: --eris option"
        )

    errorString = ""
    if (result != PASSED):
        if contents is not None:
            errorString = "Unable to find $test_string in log file"
        else:
            errorString = "log file %s was never read" % nvhost_engine.dcgm_trace_fname

    assert result == PASSED, errorString
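
The poll-and-read loop used here (and again in the env-var logging test below) could be factored into a small helper along these lines (a sketch, not part of the test suite):

import time

def wait_for_string_in_file(path, needle, attempts=25, delay=0.2):
    # Hypothetical helper: re-read the file until the expected string appears
    # or the attempts are exhausted; returns the flag and last contents read.
    contents = None
    for _ in range(attempts):
        time.sleep(delay)
        with open(path) as f:
            contents = f.read()
        if needle in contents:
            return True, contents
    return False, contents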
Example #7
def helper_test_blacklist_checks(handle, gpuIds):
    handleObj = DcgmHandle.DcgmHandle(handle=handle)
    settings = {}
    settings['instant'] = True
    settings['entity_get_flags'] = 0
    settings['testNames'] = '3'
    settings['hostname'] = 'localhost'
    settings[
        'watches'] = dcgm_structs.DCGM_HEALTH_WATCH_MEM | dcgm_structs.DCGM_HEALTH_WATCH_PCIE
    error_list = []

    ret = dcgm_internal_helpers.inject_field_value_i64(
        handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 0, -50)
    blacklist_recommendations.check_health(handleObj, settings, error_list)

    # Make sure the GPUs pass a basic health test before running this test
    for gpuObj in blacklist_recommendations.g_gpus:
        if gpuObj.IsHealthy() == False:
            test_utils.skip_test("Skipping because GPU %d is not healthy. " %
                                 gpuObj.GetEntityId())

    # Inject a memory error and verify that we fail
    blacklist_recommendations.g_gpus = []  # Reset g_gpus

    ret = dcgm_internal_helpers.inject_field_value_i64(
        handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 1000, 10)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    blacklist_recommendations.check_health(handleObj, settings, error_list)
    for gpuObj in blacklist_recommendations.g_gpus:
        if gpuObj.GetEntityId() == gpuIds[0]:
            assert gpuObj.IsHealthy(
            ) == False, "Injected error didn't trigger a failure on GPU %d" % gpuIds[
                0]
        else:
            assert gpuObj.IsHealthy(
            ), "GPU %d reported unhealthy despite not having an inserted error: '%s'" % (
                gpuIds[0], gpuObj.WhyUnhealthy())

    # Remove the memory monitor and make sure we pass our checks
    blacklist_recommendations.g_gpus = []  # Reset g_gpus
    settings['watches'] = dcgm_structs.DCGM_HEALTH_WATCH_PCIE
    blacklist_recommendations.check_health(handleObj, settings, error_list)
    for gpuObj in blacklist_recommendations.g_gpus:
        if gpuObj.GetEntityId() == gpuIds[0]:
            assert gpuObj.IsHealthy(
            ), "Injected error wasn't ignored for GPU %d: %s" % (
                gpuIds[0], gpuObj.WhyUnhealthy())
        else:
            assert gpuObj.IsHealthy(
            ), "GPU %d reported unhealthy despite not having an inserted error: '%s'" % (
                gpuIds[0], gpuObj.WhyUnhealthy())
Example #8
def helper_throttling_masking_failures(handle, gpuId):
    #####
    # First check whether the GPU is healthy
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId],
                           testNamesStr="SM Stress",
                           paramsStr="sm stress.test_duration=2",
                           version=dcgm_structs.dcgmRunDiag_version)
    dd.SetThrottleMask(
        0
    )  # We explicitly want to fail for throttle reasons since this test inserts throttling errors
    # for verification
    dd.UseFakeGpus()
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId,
                                  dcgm_structs.DCGM_SM_STRESS_INDEX):
        test_utils.skip_test(
            "Skipping because GPU %s does not pass SM Perf test. "
            "Please verify whether the GPU is supported and healthy." % gpuId)

    #####
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId],
                           testNamesStr="SM Stress",
                           paramsStr="sm stress.test_duration=15",
                           version=dcgm_structs.dcgmRunDiag_version)
    dd.SetThrottleMask(0)
    dd.UseFakeGpus()

    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    insertedError = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    interval = 0.1

    logger.info("Injecting benign errors")
    inject_value(handle, gpuId, fieldId, 3, 1, True)
    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, 3, checkInterval=interval, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    logger.info("Injecting actual errors")
    inject_value(handle, gpuId, fieldId, insertedError, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)

    logger.info("Started diag")
    response = test_utils.diag_execute_wrapper(dd, handle)
    # Verify that the inserted values are visible in DCGM
    # Max wait of 8 accounts for the 5 second injection offset plus a buffer for the value to become visible.
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, insertedError, checkInterval=0.1, numMatches=1, maxWait=8), \
            "Expected inserted errors to be visible in DCGM"

    throttled, errMsg = find_throttle_failure(
        response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    assert throttled, "Expected to find throttling failure, but did not: (%s)" % errMsg
Example #9
def test_dcgm_diag_per_gpu_responses_standalone_dcgmi(handle, gpuIds):
    if len(gpuIds) < 2:
        test_utils.skip_test(
            "Skipping because this test requires 2 or more GPUs with same SKU")

    if test_utils.is_throttling_masked_by_nvvs(
            handle, gpuIds[0],
            dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN):
        test_utils.skip_test(
            "Skipping because this SKU ignores the throttling we inject for this test"
        )

    logger.info("Starting test for per gpu responses (dcgmi output)")
    helper_per_gpu_responses_dcgmi(handle, gpuIds)
Example #10
def test_dcgm_standalone_metadata_memory_get_hostengine_sane(handle):
    """
    Sanity test for API that gets memory usage of the hostengine process
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    system.introspect.UpdateAll() 
    
    bytesUsed = system.introspect.memory.GetForHostengine().bytesUsed 
    
    logger.debug('the hostengine process is using %.2f MB' % (bytesUsed / 1024. / 1024.))
    
    assert(1*1024*1024 < bytesUsed < 100*1024*1024), bytesUsed        # 1MB to 100MB
Example #11
def helper_test_config_config_power_enforce(handle, gpuIds):
    """
    Checks whether DCGM can re-enforce the power settings if they are changed behind the scenes
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")
    ## Add first GPU to the group
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()  #Only reference GPUs we are testing against
    gpuId = gpuIds[0]

    ## Get Min and Max Power limit on the group
    attributes = systemObj.discovery.GetGpuAttributes(gpuId)

    ## Verify that power is supported on the GPUs in the group
    if dcgmvalue.DCGM_INT32_IS_BLANK(attributes.powerLimits.maxPowerLimit):
        test_utils.skip_test("Needs Power limit to be supported on the GPU")

    powerLimit_set_dcgmi = int((attributes.powerLimits.maxPowerLimit +
                                attributes.powerLimits.minPowerLimit) / 2)
    powerLimit_set_nvsmi = attributes.powerLimits.maxPowerLimit

    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL
    config_values.mPowerLimit.val = powerLimit_set_dcgmi

    groupObj.config.Set(config_values)

    logger.info("Verify if dcgmi configured value has taken effect")
    helper_verify_power_value(groupObj, powerLimit_set_dcgmi)

    ## Change Power limit to max from external entity like nvidia-smi
    assert 0 == apps.NvidiaSmiApp(["-pl", str(powerLimit_set_nvsmi), "-i", str(gpuIds[0])]).run(), \
        "Nvidia smi couldn't set the power limit"

    logger.info("Verify if nvsmi configured value has taken effect")
    helper_verify_power_value(groupObj, powerLimit_set_nvsmi)

    groupObj.config.Enforce()

    logger.info("Verify if dcgmi enforced value has taken effect")
    helper_verify_power_value(groupObj, powerLimit_set_dcgmi)
Example #12
def test_dcgm_topology_device_standalone(handle, gpuIds):
    """
    Verifies that the topology get for the default group works
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()
    gpuIds = groupObj.GetGpuIds() #Use just the GPUs in our group

    if len(gpuIds) < 2:
        test_utils.skip_test("Needs >= 2 GPUs")

    topologyInfo = systemObj.discovery.GetGpuTopology(gpuIds[0])

    assert (topologyInfo.numGpus == len(gpuIds) - 1), "Expected %d, received numGpus = %d" % (len(gpuIds) - 1, topologyInfo.numGpus)
    assert (topologyInfo.cpuAffinityMask[0] != 0), "GPU 0 should have *some* affinity"
Example #13
def helper_test_health_check_instances(handle, gpuIds):
    instances, cis = ensure_instance_ids(handle, gpuIds[0], 1, 1)
    instanceId = list(instances.keys())[0]
    ciId = list(cis.keys())[0]
    handleObj = DcgmHandle.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU, gpuIds[0])
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU_I, instanceId)
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU_CI, ciId)

    newSystems = dcgm_structs.DCGM_HEALTH_WATCH_MEM
    groupObj.health.Set(newSystems)

    # Verify health prior to testing
    responseV4 = groupObj.health.Check(
        dcgm_structs.dcgmHealthResponse_version4)
    if responseV4.incidentCount != 0:
        test_utils.skip_test("Cannot test on unhealthy systems.")

    # Inject one error per system
    dcgm_internal_helpers.inject_value(
        handle,
        gpuIds[0],
        dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
        2,
        5,
        isInt=True,
        verifyInsertion=True,
        entityType=dcgm_fields.DCGM_FE_GPU)

    responseV4 = groupObj.health.Check(
        dcgm_structs.dcgmHealthResponse_version4)
    assert (
        responseV4.incidentCount == 1
    ), "Should have 1 total incidents but found %d" % responseV4.incidentCount

    assert (responseV4.incidents[0].entityInfo.entityId == gpuIds[0])
    assert (responseV4.incidents[0].entityInfo.entityGroupId ==
            dcgm_fields.DCGM_FE_GPU)
    assert (responseV4.incidents[0].error.code ==
            dcgm_errors.DCGM_FR_VOLATILE_DBE_DETECTED)
    assert (
        responseV4.incidents[0].system == dcgm_structs.DCGM_HEALTH_WATCH_MEM)
    assert (
        responseV4.incidents[0].health == dcgm_structs.DCGM_HEALTH_RESULT_FAIL)
Example #14
def test_logging_env_var():
    """
    Verifies that we log to the supplied env var
    """

    if test_utils.loggingLevel != 'DEBUG':
        test_utils.skip_test(
            "Detected logLevel != DEBUG. This test requires DEBUG. Likely cause: --eris option"
        )

    passed = False

    # Env var is automatically set in NvHostEngineApp
    nvhost_engine = apps.NvHostEngineApp()
    nvhost_engine.start(timeout=10)
    contents = None

    # Try for 5 seconds
    for i in range(25):
        time.sleep(0.2)
        with closing(open(nvhost_engine.dcgm_trace_fname,
                          encoding='utf-8')) as f:
            # pylint: disable=no-member
            contents = f.read()
            logger.debug("Read %d bytes from %s" %
                         (len(contents), nvhost_engine.dcgm_trace_fname))
            # This is checking two things:
            #   - that we are logging to the file specified in ENV
            #   - that we are setting severity according to ENV (DEBUG)
            if 'DEBUG' in contents:
                passed = True
                break

    # Cleaning up
    nvhost_engine.terminate()
    nvhost_engine.validate()

    errorString = ""
    if (not passed):
        if contents is not None:
            errorString = "Unable to find 'DEBUG' in log file"
        else:
            errorString = "log file %s was never read" % nvhost_engine.dcgm_trace_fname

    assert passed, errorString
Example #15
def helper_dcgm_config_powerbudget(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")
    ## Add first GPU to the group
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()  #Only reference GPUs we are testing against

    ## Get Min and Max Power limit on the group
    attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, gpuIds[0])

    ## Verify that power is supported on the GPUs in the group
    if dcgmvalue.DCGM_INT32_IS_BLANK(attributes.powerLimits.maxPowerLimit):
        test_utils.skip_test("Needs Power limit to be supported on the GPU")

    powerLimit = int((attributes.powerLimits.maxPowerLimit +
                      attributes.powerLimits.minPowerLimit) / 2)

    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_BUDGET_GROUP
    config_values.mPowerLimit.val = powerLimit * len(
        gpuIds)  #Assumes homogeneous GPUs

    groupObj.config.Set(config_values)

    config_values = groupObj.config.Get(dcgm_structs.DCGM_CONFIG_CURRENT_STATE)
    assert len(config_values
               ) > 0, "Failed to get configuration using groupObj.config.Get"

    for x in range(0, len(gpuIds)):
        if (config_values[x].mPowerLimit.val !=
                dcgmvalue.DCGM_INT32_NOT_SUPPORTED):
            assert config_values[
                x].mPowerLimit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, "The power limit type for gpuId %d is incorrect. Returned: %d Expected: %d" % (
                    x, config_values[x].mPowerLimit.type,
                    dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)
            assert config_values[
                x].mPowerLimit.val == powerLimit, "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %s" % (
                    x, config_values[x].mPowerLimit.val, powerLimit)
        pass
Example #16
def helper_verify_diag_passing(handle,
                               gpuIds,
                               testNames="SM Stress",
                               testIndex=dcgm_structs.DCGM_SM_STRESS_INDEX,
                               params="sm stress.test_duration=15",
                               version=dcgm_structs.dcgmRunDiag_version,
                               useFakeGpus=False):
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                           testNamesStr=testNames,
                           paramsStr=params,
                           version=version)
    dd.SetThrottleMask(
        0
    )  # We explicitly want to fail for throttle reasons since this test inserts throttling errors
    # for verification
    if useFakeGpus:
        dd.UseFakeGpus()

    # If we've already checked this GPU, then use the previous result
    runDiag = False
    for gpuId in gpuIds:
        if gpuId in checked_gpus:
            if checked_gpus[gpuId] == False:
                test_utils.skip_test(
                    "Skipping because GPU %s does not pass SM Perf test. "
                    "Please verify whether the GPU is supported and healthy." %
                    gpuId)
        else:
            runDiag = True

    if runDiag == False:
        return dd

    response = test_utils.diag_execute_wrapper(dd, handle)
    for gpuId in gpuIds:
        if not check_diag_result_pass(response, gpuId, testIndex):
            checked_gpus[gpuId] = False
            test_utils.skip_test(
                "Skipping because GPU %s does not pass SM Perf test. "
                "Please verify whether the GPU is supported and healthy." %
                gpuId)
        else:
            checked_gpus[gpuId] = True

    return dd
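
This helper relies on a module-level checked_gpus cache that is not shown in the snippet; presumably it is a plain dict keyed by GPU id, declared once so later calls can skip the 15-second SM Stress re-run. A hedged sketch of that assumption:

# Assumed module-level declaration elsewhere in the test file (hypothetical):
checked_gpus = {}  # gpuId -> True if the GPU already passed SM Stress

# Typical usage from a diag test: reuse the cached verdict on repeat calls.
dd = helper_verify_diag_passing(handle, gpuIds, useFakeGpus=True)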
Example #17
def test_dcgm_topology_device_nvlink_standalone(handle, gpuIds):
    """
    Verifies that the topology get for the default group returns valid NVLINK info
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()
    gpuIds = groupObj.GetGpuIds() #Use just the GPUs in our group

    if len(gpuIds) < 2:
        test_utils.skip_test("Needs >= 2 GPUs")

    topologyInfo = systemObj.discovery.GetGpuTopology(gpuIds[0])

    if topologyInfo.gpuPaths[0].localNvLinkIds == 0:
        test_utils.skip_test("Needs NVLINK support")

    assert ((topologyInfo.gpuPaths[0].path & 0xFFFFFF00) > 0), "No NVLINK state set when localNvLinkIds is > 0"
Example #18
def test_dcgm_topology_group_single_gpu_standalone(handle, gpuIds):
    """
    Verifies that the topology get for a group works for a single GPU
    """
    #Topology will work for a one-GPU group if there are > 1 GPUs on the system
    if len(gpuIds) < 2:
        test_utils.skip_test("Needs >= 2 GPUs")

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds() #Use just the GPUs in our group

    topologyInfo = groupObj.discovery.GetTopology()

    assert (topologyInfo.numaOptimalFlag > 0), "with a single GPU, numa is by default optimal"
    assert (topologyInfo.slowestPath == 0), "with a single GPU, slowest path shouldn't be set"
Example #19
def test_linting_get_py_files_to_lint():
    '''
    A unit test for linting.py's _get_py_files_to_lint function.
    It tests that:
      - if a python file has no .pylint-passed file then it is linted
      - if a .pylint-passed file modify time is older than its python file 
        then it is linted
      - otherwise the file is not selected for linting
    '''

    if not option_parser.options.lint:
        test_utils.skip_test("not supported when the \"--no-lint\" option is used")

    notYetLinted = 'notYetLinted.py'
    oldLinted = 'oldLinted.py'
    newLinted = 'newLinted.py'
    
    tmpDir = tempfile.mkdtemp()
    
    def pylint_file(pyFile):
        return '.%s.pylint-passed' % pyFile
    
    # set up the py/pylint-passed files with the proper modification times
    for file in [notYetLinted, 
                 pylint_file(oldLinted), oldLinted, 
                 newLinted, pylint_file(newLinted)]:
        
        fp = os.path.join(tmpDir, file)
        with open(fp, 'a'):
            os.utime(fp, None)
        time.sleep(0.005)   # must wait so that modification time changes
            
    filesToLint = linting._get_py_files_to_lint(tmpDir)
    expectedFilesToLint = [
        os.path.join(tmpDir, file) for file in [notYetLinted, oldLinted]
    ]
    
    unexpectedFilesToLint = set(filesToLint) - set(expectedFilesToLint)
    assert len(unexpectedFilesToLint) == 0, \
        "These python files should not have been linted: %s" % unexpectedFilesToLint
        
    expectedFilesNotLinted = set(expectedFilesToLint) - set(filesToLint)
    assert len(expectedFilesNotLinted) == 0, \
        "These python files should have been linted: %s" % expectedFilesNotLinted
Example #20
def test_linting_clear_lint_artifacts():
    '''
    A unit test for linting.py's _clear_lint_artifacts function.
    It tests that it actually clears any artifact files that were generated by pylint.
    '''
    
    if not option_parser.options.lint:
        test_utils.skip_test("not supported when the \"--no-lint\" option is used")

    tmpDir = tempfile.mkdtemp()
    artifactFile = os.path.join(tmpDir, '.file.py.pylint-passed')
    with open(artifactFile, 'w'):
        os.utime(artifactFile, None)
        
    assert os.path.isfile(artifactFile), 'Failed to create file for testing'
    
    linting._clear_lint_artifacts(tmpDir)
    
    assert not os.path.isfile(artifactFile), 'pylint artifact was not removed'
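
Clearing the artifacts amounts to removing every .pylint-passed marker under the directory; a rough sketch of that behavior (the real linting._clear_lint_artifacts may walk the tree differently):

import os

def clear_lint_artifacts_sketch(rootDir):
    # Hypothetical sketch: walk the tree and delete pylint marker files.
    for dirpath, _, filenames in os.walk(rootDir):
        for name in filenames:
            if name.endswith('.pylint-passed'):
                os.remove(os.path.join(dirpath, name))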
Example #21
def test_dcgm_prof_watch_multipass(handle, gpuIds):
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    mpFieldIds = helper_get_multipass_field_ids(dcgmGroup)
    if mpFieldIds is None:
        test_utils.skip_test(
            "No multipass profiling fields exist for the gpu group")

    logger.info("Multipass fieldIds: " + str(mpFieldIds))

    #Make sure that multipass watching up to DLG_MAX_METRIC_GROUPS groups works
    for i in range(min(len(mpFieldIds), DLG_MAX_METRIC_GROUPS)):
        fieldIds = []
        for j in range(i + 1):
            fieldIds.extend(mpFieldIds[j])

        logger.info("Positive testing multipass fieldIds %s" % str(fieldIds))

        dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
        dcgmGroup.profiling.UnwatchFields()

    if len(mpFieldIds) <= DLG_MAX_METRIC_GROUPS:
        test_utils.skip_test(
            "Skipping multipass failure test since there are %d <= %d multipass groups."
            % (len(mpFieldIds), DLG_MAX_METRIC_GROUPS))

    for i in range(DLG_MAX_METRIC_GROUPS + 1, len(mpFieldIds) + 1):
        fieldIds = []
        for j in range(i):
            fieldIds.extend(mpFieldIds[j])

        logger.info("Negative testing multipass fieldIds %s" % str(fieldIds))

        with test_utils.assert_raises(
                dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_PROFILING_MULTI_PASS)):
            dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
            dcgmGroup.profiling.UnwatchFields()