Example #1
def _sumMetadata(handle, getForFieldFn, getForAllFieldsFn, metaAttr):
    '''
    Return a 3-tuple: the sum of the metadata value across every watched field, the
    all-fields total sampled before the summation, and the all-fields total sampled after it.
    '''
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    group = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle),
                             groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch every field possible on all GPUs
    watchedFields = test_utils.watch_all_fields(handle, group.GetGpuIds())

    system.introspect.UpdateAll()

    # Get the total before and after to accommodate any slight changes
    # in total memory usage while the individual field amounts are being summed
    startVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    aggregateVal = sum(
        getattr(getForFieldFn(handle, fieldId).aggregateInfo, metaAttr)
        for fieldId in watchedFields)

    endVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    return aggregateVal, startVal, endVal
Example #2
def test_dcgm_embedded_metadata_memory_get_field_sane(handle):
    '''
    Sanity test for API that gets memory usage of a single field
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handleObj = pydcgm.DcgmHandle(handle=handle)
    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    group = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle),
                             groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))

    _watch_field_group_basic(fieldGroup, handle, group.GetId())
    system.introspect.UpdateAll()

    memoryInfo = dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage(
        handle, fieldIds[0])

    logger.debug("field %s using %.2f KB" %
                 (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed / 1024.))

    # 0+ to 200 KB
    assert(0 < memoryInfo.aggregateInfo.bytesUsed < 1024*200), \
        'bytes used to store field was unreasonable for ID %s, bytes: %s' \
        % (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed)
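The test above relies on a _watch_field_group_basic helper that is not part of this listing. Below is a minimal sketch of what such a helper could look like, assuming the standard dcgm_agent.dcgmWatchFields binding and a DcgmFieldGroup that exposes its fieldGroupId; the _sketch-suffixed name is hypothetical.

import dcgm_agent

def _watch_field_group_basic_sketch(fieldGroup, handle, groupId, updateFreq=1000):
    '''Watch every field in fieldGroup on groupId and force one immediate update.'''
    maxKeepAge = 3600.0   # keep samples for up to an hour
    maxKeepSamples = 0    # rely on maxKeepAge rather than a sample count
    dcgm_agent.dcgmWatchFields(handle, groupId, fieldGroup.fieldGroupId,
                               updateFreq, maxKeepAge, maxKeepSamples)
    dcgm_agent.dcgmUpdateAllFields(handle, 1)  # 1 = wait for the update cycle to finish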
Example #3
def test_dcgm_prof_initial_valid_record(handle, gpuIds):
    '''
    Test that we can retrieve a valid FV for a profiling field immediately after watching
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)
    assert fieldIds is not None

    #Set watches using a large interval so we don't get a record for 10 seconds in the bug case
    dcgmGroup.profiling.WatchFields(fieldIds, 10000000, 3600.0, 0)

    gpuId = gpuIds[0]

    fieldValues = dcgm_agent.dcgmEntityGetLatestValues(handle, dcgm_fields.DCGM_FE_GPU, gpuId, fieldIds)
    assert len(fieldValues) == len(fieldIds), "%d != %d" % (len(fieldValues), len(fieldIds))

    for i, fieldValue in enumerate(fieldValues):
        logger.info(str(fieldValue))
        assert(fieldValue.version != 0), "idx %d Version was 0" % i
        assert(fieldValue.fieldId == fieldIds[i]), "idx %d fieldValue.fieldId %d != fieldIds[i] %d" % (i, fieldValue.fieldId, fieldIds[i])
        assert(fieldValue.status == dcgm_structs.DCGM_ST_OK), "idx %d status was %d" % (i, fieldValue.status)
        #The following line catches the bug in Jira DCGM-1357. Previously, a record would be returned with a
        #0 timestamp
        assert(fieldValue.ts != 0), "idx %d timestamp was 0" % i
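helper_check_profiling_environment is also not shown in this listing. A hedged sketch of the idea, assuming only the dcgm_structs.DCGMError exception type and the profiling GetSupportedMetricGroups call: probe the profiling module once and skip the test when profiling is unavailable.

def helper_check_profiling_environment_sketch(dcgmGroup):
    try:
        dcgmGroup.profiling.GetSupportedMetricGroups()
    except dcgm_structs.DCGMError as e:
        # Module not loaded, unsupported GPU, insufficient permissions, etc. all
        # mean the profiling tests cannot run on this system.
        test_utils.skip_test("Profiling is not supported here: %s" % str(e))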
Example #4
def test_dcgm_prof_with_dcgmreader(handle, gpuIds):
    """ 
    Verifies that we can access profiling data with DcgmReader, which is the 
    base class for dcgm exporters
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()

    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)

    updateFrequencyUsec=10000
    sleepTime = 2 * (updateFrequencyUsec / 1000000.0) #Sleep 2x the update freq so we get new values each time

    dr = DcgmReader.DcgmReader(fieldIds=fieldIds, updateFrequency=updateFrequencyUsec, maxKeepAge=30.0, gpuIds=gpuIds)
    dr.SetHandle(handle)

    for i in range(10):
        time.sleep(sleepTime)

        latest = dr.GetLatestGpuValuesAsFieldIdDict()
        logger.info(str(latest))

        for gpuId in gpuIds:
            assert len(latest[gpuId]) == len(fieldIds), "i=%d, gpuId %d, len %d != %d" % (i, gpuId, len(latest[gpuIds[i]]), len(fieldIds))
Example #5
def test_dcgm_vgpu_config_get_validate(handle):
    """
    Validates structure version
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmVgpuConfigGet(handle, groupId,
                                  dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                  groupInfo.count, status_handle, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmVgpuConfigGet(handle, groupId,
                                  dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                  groupInfo.count, status_handle, versionTest)
Example #6
def StartAppOnGpus(handle):
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    allGpuIds = dcgmSystem.discovery.GetAllGpuIds()

    gpuInfoList = []
    addedPids = []

    for gpuId in allGpuIds:
        gpuAttrib = dcgmSystem.discovery.GetGpuAttributes(gpuId)
        gpuInfoList.append((gpuId, gpuAttrib.identifiers.pciBusId))

    for info in gpuInfoList:
        gpuId = info[0]
        busId = info[1]
        appTimeout = int(1000)  #milliseconds

        #Start a cuda app so we have something to account for
        appParams = [
            "--ctxCreate", busId, "--busyGpu", busId,
            str(appTimeout), "--ctxDestroy", busId
        ]
        app = apps.CudaCtxCreateAdvancedApp(
            appParams,
            env=test_utils.get_cuda_visible_devices_env(handle, gpuId))
        app.start(appTimeout * 2)
        pid = app.getpid()
        addedPids.append(pid)
        app.wait()
        app.terminate()
        app.validate()
        logger.info("Started PID %d." % pid)

    return addedPids
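A possible caller of StartAppOnGpus (sketch, function name hypothetical): start one short CUDA app per GPU and sanity-check that one PID came back per GPU before inspecting accounting or process stats.

def sketch_start_apps_and_check(handle):
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    allGpuIds = dcgmHandle.GetSystem().discovery.GetAllGpuIds()
    addedPids = StartAppOnGpus(handle)
    assert len(addedPids) == len(allGpuIds), \
        "Expected one PID per GPU, got %s for GPUs %s" % (addedPids, allGpuIds)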
Example #7
def test_dcgm_policy_get_validate(handle):
    """
    Validates structure version
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    count = 1

    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmPolicyGet(handle, groupId, count, status_handle,
                              versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmPolicyGet(handle, groupId, count, status_handle,
                              versionTest)
Example #8
def test_dcgm_modules_get_statuses(handle):
    '''
    Do a basic sanity check of the DCGM module statuses returned
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    ms = dcgmSystem.modules.GetStatuses()

    assert ms.numStatuses == dcgm_structs.DcgmModuleIdCount, "%d != %d" % (
        ms.numStatuses, dcgm_structs.DcgmModuleIdCount)
    assert ms.statuses[0].id == dcgm_structs.DcgmModuleIdCore, "%d != %d" % (
        ms.statuses[0].id, dcgm_structs.DcgmModuleIdCore)
    assert ms.statuses[
        0].status == dcgm_structs.DcgmModuleStatusLoaded, "%d != %d" % (
            ms.statuses[0].status, dcgm_structs.DcgmModuleStatusLoaded)

    for i in range(1, ms.numStatuses):
        #.id == index
        assert ms.statuses[i].id == i, "%d != %d" % (ms.statuses[i].id, i)
        #Assert all non-core modules aren't loaded besides NvSwitch. This one can be loaded
        #because creating default groups causes an RPC to the NvSwitch manager
        if ms.statuses[i].id != dcgm_structs.DcgmModuleIdNvSwitch:
            assert ms.statuses[
                i].status == dcgm_structs.DcgmModuleStatusNotLoaded, "%d != %d" % (
                    ms.statuses[i].status,
                    dcgm_structs.DcgmModuleStatusNotLoaded)
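For debugging failures in the test above, a small logging helper (sketch, name hypothetical) can render the status list in readable form. It only maps the constants already referenced by the test; any other module ID or status is printed numerically.

def sketch_log_module_statuses(ms):
    names = {
        dcgm_structs.DcgmModuleIdCore: "Core",
        dcgm_structs.DcgmModuleIdNvSwitch: "NvSwitch",
    }
    states = {
        dcgm_structs.DcgmModuleStatusLoaded: "Loaded",
        dcgm_structs.DcgmModuleStatusNotLoaded: "NotLoaded",
    }
    for i in range(ms.numStatuses):
        s = ms.statuses[i]
        logger.debug("module %s: %s" % (names.get(s.id, s.id), states.get(s.status, s.status)))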
Example #9
def helper_dcgm_config_get_attributes(handle):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()

    gpuIdList = groupObj.GetGpuIds()

    for gpuId in gpuIdList:
        attributes = systemObj.discovery.GetGpuAttributes(gpuId)
        assert (attributes.identifiers.deviceName != dcgmvalue.DCGM_STR_NOT_SUPPORTED
                and attributes.identifiers.deviceName != dcgmvalue.DCGM_STR_NOT_FOUND
                and attributes.identifiers.deviceName != dcgmvalue.DCGM_STR_NOT_PERMISSIONED
                ), "Not able to find attributes"

        #We used to assert that attributes.clockSets.count was > 0. This was because the NVML internal API that provided it
        #bypassed the SKU check. If nvidia-smi -q -d SUPPORTED_CLOCKS returns N/A, we will no longer have clockSets.

        for i in range(attributes.clockSets.count):
            memClock = attributes.clockSets.clockSet[i].memClock
            smClock = attributes.clockSets.clockSet[i].smClock

            assert memClock > 0 and memClock < 10000, "gpuId %d got memClock out of range 0 - 10000: %d" % (
                gpuId, memClock)
            assert smClock > 0 and smClock < 10000, "gpuId %d got smClock out of range 0 - 10000: %d" % (
                gpuId, smClock)
Example #10
def helper_dcgm_verify_sync_boost_single_gpu(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    ## Add first GPU to the group
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()  #Only reference GPUs we are testing against

    ## Set the sync boost for the group
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = 1
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Config Set must return DCGM_ST_BADPARAM since we only have a single GPU
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        groupObj.config.Set(config_values)

    groupObj.Delete()
Example #11
def test_dcgm_embedded_metadata_exectime_get_all_fields_sane(handle):
    """
    Sanity test for API that gets execution time of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    
    # watch a ton of fields so that we know that some are being stored
    updateFreqUsec = 1000
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()
    
    execTime = system.introspect.execTime.GetForAllFields().aggregateInfo
    
    perGpuSane = 300*1000 # 300 ms
    activeGpuCount = test_utils.get_live_gpu_count(handle.handle)
    saneLimit = perGpuSane*activeGpuCount
    
    # test that all struct fields in the API response have reasonable values
    assert(100 < execTime.totalEverUpdateUsec < saneLimit), (
        'execution time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms' 
        % (activeGpuCount, execTime.totalEverUpdateUsec/1000, saneLimit/1000))
    
    assert(100 < execTime.recentUpdateUsec < saneLimit), (
        'recent update time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms' 
        % (activeGpuCount, execTime.recentUpdateUsec/1000, saneLimit/1000))
    
    assert(updateFreqUsec-1 <= execTime.meanUpdateFreqUsec <= updateFreqUsec+1), execTime.meanUpdateFreqUsec
Example #12
def test_dcgm_prof_multi_pause_resume(handle, gpuIds):
    '''
    Test that we can pause and resume profiling over and over without error
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    #We should never get an error back from pause or resume. Pause and Resume throw exceptions on error
    numPauses = 0
    numResumes = 0

    for i in range(100):
        #Flip a coin and pause if we get 0. unpause otherwise (1)
        coin = random.randint(0,1)
        if coin == 0:
            dcgmSystem.profiling.Pause()
            numPauses += 1
        else:
            dcgmSystem.profiling.Resume()
            numResumes += 1

    logger.info("Got %d pauses and %d resumes" % (numPauses, numResumes))
Example #13
def helper_test_dpt_field_id(handle, gpuIds, fieldId, extraArgs = None):
    '''
    Helper that runs dcgmproftester in validate mode against a single profiling field ID,
    using the first GPU of this SKU
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    cudaDriverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    supportedFieldIds = helper_get_supported_field_ids(dcgmGroup)

    # Just test the first GPU of our SKU. Other tests will cover multiple SKUs
    useGpuIds = [gpuIds[0], ]

    args = ["--target-max-value", "--no-dcgm-validation", "--dvs", "--reset", "--mode", "validate", "-d", "15.0", "-r", "1.0", "--sync-count", "5", "-w", "5", "-t", str(fieldId)]

    if extraArgs is not None:
        args.extend(extraArgs)

    app = apps.DcgmProfTesterApp(cudaDriverMajorVersion=cudaDriverVersion[0], gpuIds=useGpuIds, args=args)
    app.start(timeout=120.0 * len(gpuIds)) #Account for slow systems but still add an upper bound
    app.wait()
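The helper above is typically consumed by thin per-field tests. A hypothetical example, borrowing field ID 1001 (graphics activity) from the dcgmproftester example later in this listing:

def sketch_test_dpt_gr_activity(handle, gpuIds):
    helper_test_dpt_field_id(handle, gpuIds, 1001)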
Example #14
def test_dcgm_embedded_metadata_memory_get_field_group_sane(handle):
    '''
    Sanity test for API that gets memory usage of a single field group
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName='test-metadata',
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    system = pydcgm.DcgmSystem(handle)

    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP, dcgm_fields.DCGM_FI_DEV_POWER_USAGE
    ]

    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group", fieldIds)

    # ensure that the field group is watched
    _watch_field_group_basic(fieldGroup, handle.handle, group.GetId())
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForFieldGroup(fieldGroup)

    logger.debug("field group %s is using %.2f KB" %
                 (fieldGroup.name, memoryInfo.aggregateInfo.bytesUsed / 1024.))

    # 0+ to 20 MB
    assert(0 < memoryInfo.aggregateInfo.bytesUsed < 1024*1024*20), \
        'bytes used to store field was unreasonable for field group %s, bytes: %s' \
        % (fieldGroup.name, memoryInfo.aggregateInfo.bytesUsed)
Example #15
def test_dcgm_embedded_metadata_exectime_get_field_group_sane(handle):
    """
    Sanity test for API that gets execution time of a single field group
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE, dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group", fieldIds)

    updateFreqUsec = 1000
    _watch_field_group_basic(fieldGroup,
                             handle.handle,
                             group.GetId(),
                             updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()

    execTime = system.introspect.execTime.GetForFieldGroup(
        fieldGroup).aggregateInfo

    # test that all struct fields in the API response have reasonable values
    assert (100 < execTime.totalEverUpdateUsec <
            100 * 1000), execTime.totalEverUpdateUsec
    assert (100 < execTime.recentUpdateUsec <
            100 * 1000), execTime.recentUpdateUsec
    assert (updateFreqUsec == execTime.meanUpdateFreqUsec
            ), execTime.meanUpdateFreqUsec
Example #16
def test_dcgm_embedded_metadata_exectime_get_field_sane(handle):
    """
    Sanity test for API that gets execution time of a single field
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    updateFreqUsec = 1000
    dcgm_agent_internal.dcgmWatchFieldValue(handle.handle,
                                            group.GetGpuIds()[0],
                                            dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
                                            updateFreqUsec, 100000, 10)
    system.UpdateAllFields(True)
    system.introspect.UpdateAll()

    execTime = dcgm_agent_internal.dcgmIntrospectGetFieldExecTime(
        handle.handle, dcgm_fields.DCGM_FI_DEV_GPU_TEMP).aggregateInfo

    # test that all struct fields in the API response have reasonable values
    assert (100 < execTime.totalEverUpdateUsec <
            100 * 1000), execTime.totalEverUpdateUsec
    assert (100 < execTime.recentUpdateUsec <
            100 * 1000), execTime.recentUpdateUsec
    assert (updateFreqUsec == execTime.meanUpdateFreqUsec
            ), execTime.meanUpdateFreqUsec
Example #17
def test_dcgm_embedded_metadata_exectime_aggregate_is_sum_of_gpu_and_global(
        handle):
    """
    Ensure that when per-field execution time is retrieved, the "global" and "gpu"
    values add up to the "aggregate" value
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle,
                                group.GetGpuIds(),
                                updateFreq=100000)
    system.introspect.UpdateAll()

    execTimeInfo = system.introspect.execTime.GetForAllFields()

    gpuExecTime = sum(
        info.totalEverUpdateUsec
        for info in execTimeInfo.gpuInfo[:execTimeInfo.gpuInfoCount])

    if execTimeInfo.hasGlobalInfo:
        globalExecTime = execTimeInfo.globalInfo.totalEverUpdateUsec
    else:
        globalExecTime = 0

    assert (
        execTimeInfo.aggregateInfo.totalEverUpdateUsec == globalExecTime +
        gpuExecTime
    ), ('aggregate for all fields reports %s usec but GPUs report %s usec and global reports %s usec. '
        % (execTimeInfo.aggregateInfo.totalEverUpdateUsec, gpuExecTime,
           globalExecTime) + ' GPUs + global should sum to aggregate.')
Example #18
def test_dcgm_embedded_metadata_memory_get_aggregate_fields_equals_total(
        handle):
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))

    _metadata_get_aggregate_fields_equals_total(
        handle, dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage,
        system.introspect.memory.GetForAllFields, 'bytesUsed')
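_metadata_get_aggregate_fields_equals_total itself is not shown here. A plausible sketch, built on the _sumMetadata helper from Example #1: the per-field sum should land between the all-fields totals sampled before and after the summation (the exact tolerance is an assumption, and the _sketch suffix is hypothetical).

def _metadata_get_aggregate_fields_equals_total_sketch(handle, getForFieldFn,
                                                       getForAllFieldsFn, metaAttr):
    aggregateVal, startVal, endVal = _sumMetadata(handle, getForFieldFn,
                                                  getForAllFieldsFn, metaAttr)
    assert startVal <= aggregateVal <= endVal, \
        "per-field sum %s is outside the all-fields totals [%s, %s] for %s" \
        % (aggregateVal, startVal, endVal, metaAttr)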
Example #19
def test_dcgm_embedded_metadata_memory_get_all_fields_sane(handle):
    """
    Sanity test for API that gets memory usage of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle,
                                group.GetGpuIds(),
                                updateFreq=1000)
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForAllFields().aggregateInfo

    logger.debug('All fields in hostengine are using %.2f MB' %
                 (memoryInfo.bytesUsed / 1024. / 1024.))

    assert (1024 * 20 < memoryInfo.bytesUsed <
            100 * 1024 * 1024), memoryInfo.bytesUsed  # 20 KB to 100 MB
Example #20
def test_dcgm_embedded_metadata_memory_aggregate_is_sum_of_gpu_and_global(
        handle):
    """
    Ensure that when per-field memory info is retrieved, the "global" and "gpu"
    values add up to the "aggregate" value
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle,
                                group.GetGpuIds(),
                                updateFreq=100000)
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForAllFields()

    gpuMemory = sum(mem.bytesUsed
                    for mem in memoryInfo.gpuInfo[:memoryInfo.gpuInfoCount])

    globalMemory = memoryInfo.globalInfo.bytesUsed if memoryInfo.hasGlobalInfo else 0

    if (memoryInfo.hasGlobalInfo):
        logger.debug('global mem info: %s' % (memoryInfo.globalInfo))

    for i in range(memoryInfo.gpuInfoCount):
        logger.debug('gpu mem info %s: %s' % (i, memoryInfo.gpuInfo[i]))

    assert (memoryInfo.aggregateInfo.bytesUsed == gpuMemory + globalMemory), (
        'aggregate for all fields reports %s bytes but a sum of GPU and global reports %s bytes. '
        % (memoryInfo.aggregateInfo.bytesUsed, gpuMemory + globalMemory) +
        ' These values should be equal.')
Example #21
def test_dcgm_group_get_all_ids_standalone(handle):
    """
    Get all the group IDS configured on the host engine
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()

    #Get the list of groups before we add ours so that we account for them
    groupIdListBefore = dcgm_agent.dcgmGroupGetAllIds(handle)

    expectedCount = len(groupIdListBefore)
    groupObjs = []

    for index in range(0, 10):
        expectedCount += 1
        name = 'Test' + str(index)
        groupObj = systemObj.GetEmptyGroup(name)
        groupObjs.append(
            groupObj)  #keep reference so it doesn't go out of scope

    groupIdListAfter = dcgm_agent.dcgmGroupGetAllIds(handle)
    assert len(groupIdListAfter) == expectedCount, \
        "Number of groups did not match expected. Expected: %d Returned: %d" % (
            expectedCount, len(groupIdListAfter))
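An optional cleanup fragment that could be appended to the end of the test above (sketch): deleting the groups we created restores the host engine to its original group count, so later tests are unaffected.

    for groupObj in groupObjs:
        groupObj.Delete()
    groupIdListFinal = dcgm_agent.dcgmGroupGetAllIds(handle)
    assert len(groupIdListFinal) == len(groupIdListBefore), \
        "Expected %d groups after cleanup, got %d" % (len(groupIdListBefore), len(groupIdListFinal))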
Example #22
def helper_dcgm_group_update_grp(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    for gpuId in gpuIdList:
        groupObj.AddGpu(gpuId)
        gpuIdListAfterAdd = groupObj.GetGpuIds()
        assert gpuId in gpuIdListAfterAdd, "Expected gpuId %d in %s" % (
            gpuId, str(gpuIdListAfterAdd))

    for gpuId in gpuIdList:
        groupObj.RemoveGpu(gpuId)
        gpuIdListAfterRemove = groupObj.GetGpuIds()
        assert gpuId not in gpuIdListAfterRemove, "Expected gpuId %d NOT in %s" % (
            gpuId, str(gpuIdListAfterRemove))

    #Force the group to be deleted
    del (groupObj)
Example #23
def helper_dcgm_group_get_grp_info_entities(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    for gpuId in gpuIdList:
        groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU, gpuId)

    gpuIdListAfterAdd = groupObj.GetGpuIds()
    assert gpuIdList == gpuIdListAfterAdd, "Expected all GPUs from %s to be added. Got %s" % (
        str(gpuIdList), str(gpuIdListAfterAdd))

    entityListAfterAdd = groupObj.GetEntities()
    gpuList2 = []
    for entity in entityListAfterAdd:
        assert entity.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(
            entity.entityGroupId)
        gpuList2.append(entity.entityId)
    assert gpuIdList == gpuList2, "Expected all GPUs from %s to be added. Got %s" % (
        str(gpuIdList), str(gpuList2))

    #Remove all GPUs
    for gpuId in gpuIdList:
        groupObj.RemoveEntity(dcgm_fields.DCGM_FE_GPU, gpuId)
    entityListAfterRem = groupObj.GetEntities()
    assert len(entityListAfterRem) == 0, str(entityListAfterRem)
Example #24
def test_dcgmproftester_parallel_gpus(handle, gpuIds):
    '''
    Test that we can successfully read dcgmproftester metrics on multiple GPUs concurrently

    This tests a few things:
    1. That metrics work for more than GPU 0
    2. That metrics work for multiple GPUs at a time
    '''
    if len(gpuIds) < 2:
        test_utils.skip_test("Skipping multi-GPU test since there's only one of this SKU")

    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    cudaDriverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    #Graphics activity works for every GPU that supports DCP. It also works reliably even under heavy concurrency
    fieldIds = "1001" 

    args = ["--mode", "validate", "-d", "15.0", "-r", "1.0", "--sync-count", "5", "-w", "10", "-t", fieldIds]
    app = apps.DcgmProfTesterApp(cudaDriverMajorVersion=cudaDriverVersion[0], gpuIds=gpuIds, args=args)
    app.start(timeout=120.0 * len(gpuIds)) #Account for slow systems but still add an upper bound
    app.wait()
    app.validate() #Validate here so that errors are printed when they occur instead of at the end of the test
Example #25
    def _InitHandles(self):
        self._dcgmHandle = pydcgm.DcgmHandle(ipAddress=self._hostname)
        
        groupName = "error_mon_gpus" + self._pidPostfix
        self._allGpusGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName, groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
        print("Found %d GPUs" % (len(self._allGpusGroup.GetEntities())))

        groupName = "error_mon_nvswitches" + self._pidPostfix
        self._allNvSwitchesGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName, groupType=dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES)
        print("Found %d NvSwitches" % len(self._allNvSwitchesGroup.GetEntities()))

        fgName = "error_mon_nvswitches" + self._pidPostfix
        self._nvSwitchErrorFieldGroup = pydcgm.DcgmFieldGroup(self._dcgmHandle, name=fgName, fieldIds=self._nvSwitchErrorFieldIds)
        
        fgName = "error_mon_gpus" + self._pidPostfix
        self._gpuErrorFieldGroup = pydcgm.DcgmFieldGroup(self._dcgmHandle, name=fgName, fieldIds=self._gpuErrorFieldIds)

        updateFreq = int(self._updateIntervalSecs / 2.0) * 1000000
        maxKeepAge = 3600.0 #1 hour
        maxKeepSamples = 0 #Rely on maxKeepAge

        self._nvSwitchWatcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
            self._dcgmHandle.handle, self._allNvSwitchesGroup.GetId(), 
            self._nvSwitchErrorFieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
            updateFreq, maxKeepAge, maxKeepSamples, 0)
        self._gpuWatcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
            self._dcgmHandle.handle, self._allGpusGroup.GetId(), 
            self._gpuErrorFieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
            updateFreq, maxKeepAge, maxKeepSamples, 0)
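A matching teardown method is assumed to exist elsewhere in this class; a minimal sketch (method name hypothetical) that deletes the objects created above and releases the handle, assuming the usual Delete() and Shutdown() methods on the pydcgm wrappers:

    def _ShutdownHandles(self):
        self._nvSwitchErrorFieldGroup.Delete()
        self._gpuErrorFieldGroup.Delete()
        self._allNvSwitchesGroup.Delete()
        self._allGpusGroup.Delete()
        self._dcgmHandle.Shutdown()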
Example #26
def test_dcgm_prof_pause_resume_values(handle, gpuIds):
    '''
    Test that we get valid values when profiling is resumed and BLANK values when profiling is paused
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)
    assert fieldIds is not None

    #10 ms watches so we can test quickly
    watchIntervalUsec = 10000
    sleepIntervalSec = 0.1 * len(gpuIds) #100 ms per GPU
    #Start paused. All the other tests start unpaused
    dcgmSystem.profiling.Pause()

    dcgmGroup.profiling.WatchFields(fieldIds, watchIntervalUsec, 60.0, 0)

    gpuId = gpuIds[0]

    fieldValues = dcgm_agent.dcgmEntityGetLatestValues(handle, dcgm_fields.DCGM_FE_GPU, gpuId, fieldIds)
    assert len(fieldValues) == len(fieldIds), "%d != %d" % (len(fieldValues), len(fieldIds))

    #All should be blank
    for i, fieldValue in enumerate(fieldValues):
        fv = dcgm_field_helpers.DcgmFieldValue(fieldValue)
        assert fv.isBlank, "Got nonblank fv index %d" % i

    #Resume. All should be valid
    dcgmSystem.profiling.Resume()

    time.sleep(sleepIntervalSec)

    fieldValues = dcgm_agent.dcgmEntityGetLatestValues(handle, dcgm_fields.DCGM_FE_GPU, gpuId, fieldIds)
    assert len(fieldValues) == len(fieldIds), "%d != %d" % (len(fieldValues), len(fieldIds))

    #All should be non-blank
    for i, fieldValue in enumerate(fieldValues):
        fv = dcgm_field_helpers.DcgmFieldValue(fieldValue)
        assert not fv.isBlank, "Got blank fv index %d" % i

    #Pause again. All should be blank
    dcgmSystem.profiling.Pause()

    time.sleep(sleepIntervalSec)

    fieldValues = dcgm_agent.dcgmEntityGetLatestValues(handle, dcgm_fields.DCGM_FE_GPU, gpuId, fieldIds)
    assert len(fieldValues) == len(fieldIds), "%d != %d" % (len(fieldValues), len(fieldIds))

    #All should be blank
    for i, fieldValue in enumerate(fieldValues):
        fv = dcgm_field_helpers.DcgmFieldValue(fieldValue)
        assert fv.isBlank, "Got nonblank fv index %d" % i

    #This shouldn't fail
    dcgmSystem.profiling.Resume()
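The three poll-and-assert blocks above repeat the same pattern; a hedged helper (name hypothetical) that factors it out, using only the APIs already shown in this example:

def sketch_assert_prof_values_blankness(handle, gpuId, fieldIds, expectBlank):
    fieldValues = dcgm_agent.dcgmEntityGetLatestValues(handle, dcgm_fields.DCGM_FE_GPU,
                                                       gpuId, fieldIds)
    assert len(fieldValues) == len(fieldIds), "%d != %d" % (len(fieldValues), len(fieldIds))
    for i, fieldValue in enumerate(fieldValues):
        fv = dcgm_field_helpers.DcgmFieldValue(fieldValue)
        assert fv.isBlank == expectBlank, \
            "index %d: isBlank was %s, expected %s" % (i, fv.isBlank, expectBlank)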
Example #27
def test_dcgm_config_standalone_get_devices(handle):
    """
    Verifies that DCGM Engine returns list of devices
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) > 0, "Not able to find devices for standalone case"
Example #28
def test_dcgm_policy_negative_unregister_standalone(handle):
    """
    Verifies that the unregister function does not allow a bad groupId value
    """
    policy = pydcgm.DcgmGroupPolicy(pydcgm.DcgmHandle(handle), 9999, None)
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        policy.Unregister(dcgm_structs.DCGM_POLICY_COND_DBE)
Example #29
def helper_dcgm_group_create_grp(handle):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupObj = pydcgm.DcgmGroup(handleObj, groupName="test1")
    groupId = groupObj.GetId()
    assert (groupId != 0)

    #Force the group to be deleted
    del (groupObj)
Example #30
def test_dcgmi_config(handle, gpuIds):
    """
    Test DCGMI config
    """
    assert len(gpuIds) > 0, "Failed to get devices from the node"

    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    # Getting GPU power limits
    for gpuId in gpuIds:
        gpuAttrib = dcgmSystem.discovery.GetGpuAttributes(gpuId)
        dft_pwr = str(gpuAttrib.powerLimits.defaultPowerLimit)
        max_pwr = str(gpuAttrib.powerLimits.maxPowerLimit)

    groupId = str(_create_dcgmi_group())

    ## keep args in this order. Changing it may break the test
    validArgsTestList = [
        ["group", "-g", groupId, "-a",
         str(gpuIds[0])],  # add gpu to group
        ["config", "--get", "-g", groupId],  # get default group configuration
        [
            "config", "--get", "-g", "0"
        ],  # get default group configuration by ID. This will work as long as group IDs start at 0
        ["config", "-g", groupId, "--set", "-P",
         dft_pwr],  # set default power limit
        ["config", "-g", groupId, "--set", "-P",
         max_pwr],  # set max power limit
        ["config", "--get", "-g", groupId,
         "--verbose"],  # get verbose default group configuration
        ["config", "--enforce", "-g",
         groupId],  # enforce default group configuration
        ["config", "--enforce", "-g",
         "0"]  # enforce group configuration on default group by ID
    ]

    # Setting the compute mode is only supported when MIG mode is not enabled.
    if not test_utils.is_mig_mode_enabled():
        # set group configuration on default group by ID
        validArgsTestList.append(["config", "--set", "-c", "0", "-g", "0"])

    #Config management only works when the host engine is running as root
    if utils.is_root():
        _test_valid_args(validArgsTestList)
    else:
        _test_invalid_args(validArgsTestList)

    ## keep args in this order. Changing it may break the test
    _test_invalid_args([
        ["config", "--get", "-g",
         "9999"],  # Can't get config of group that doesn't exist
        ["config", "--get", "-g", "9999",
         "--verbose"],  # Can't get config of group that doesn't exist
        ["config", "--set", ""],  # Can't set group configuration to nothing
        ["config", "--set", "-c", "5"],  # Can't set an invalid compute mode
        ["config", "--enforce", "-g",
         "9999"]  # Can't enforce a configuration of group that doesn't exist
    ])