Example #1
def helper_test_inject_instance_fields(handle, gpuIds):
    instances, cis = ensure_instance_ids(handle, gpuIds[0], 1, 1)
    firstInstanceId = list(instances.keys())[0]
    lastCIId = list(cis.keys())[0]

    # Set up the watches on these groups
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         'tien')
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
        handle, [dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL], 'kal')

    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU,
                                  gpuIds[0])
    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU_I,
                                  firstInstanceId)
    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU_CI,
                                  lastCIId)
    dcgm_agent.dcgmWatchFields(handle, groupId, fieldGroupId, 1, 100, 100)

    dcgm_internal_helpers.inject_value(
        handle,
        gpuIds[0],
        dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
        2,
        5,
        isInt=True,
        verifyInsertion=True,
        entityType=dcgm_fields.DCGM_FE_GPU)

    # Read the values to make sure they were stored properly
    entities = [
        dcgm_structs.c_dcgmGroupEntityPair_t(),
        dcgm_structs.c_dcgmGroupEntityPair_t(),
        dcgm_structs.c_dcgmGroupEntityPair_t()
    ]

    entities[0].entityGroupId = dcgm_fields.DCGM_FE_GPU_I
    entities[0].entityId = firstInstanceId
    entities[1].entityGroupId = dcgm_fields.DCGM_FE_GPU_CI
    entities[1].entityId = lastCIId
    entities[2].entityGroupId = dcgm_fields.DCGM_FE_GPU
    entities[2].entityId = gpuIds[0]

    fieldIds = [dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL]

    values = dcgm_agent.dcgmEntitiesGetLatestValues(handle, entities, fieldIds,
                                                    0)
    for v in values:
        if v.entityGroupId == dcgm_fields.DCGM_FE_GPU:
            assert v.value.i64 == 2, "Failed to inject value 2 for entity %u from group %u" % (
                v.entityId, v.entityGroupId)
        else:
            assert (
                v.status == dcgm_structs.DCGM_ST_NO_DATA
            ), "Injected meaningless value %u for entity %u from group %u" % (
                v.value.i64, v.entityId, v.entityGroupId)
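
Every example on this page follows the same read-back pattern: build a list of
c_dcgmGroupEntityPair_t entries, call dcgm_agent.dcgmEntitiesGetLatestValues,
then scan the flat result list. A minimal sketch of indexing the results
instead, using only the bindings already shown above (the helper name
latest_values_by_entity is hypothetical):

def latest_values_by_entity(handle, entityPairs, fieldIds, flags=0):
    # Key each returned sample by (entityGroupId, entityId, fieldId) so callers
    # can look up one entity's value directly instead of scanning the list.
    values = dcgm_agent.dcgmEntitiesGetLatestValues(handle, entityPairs,
                                                    fieldIds, flags)
    return {(v.entityGroupId, v.entityId, v.fieldId): v for v in values}

# Usage against the entities built in Example #1:
#   byEntity = latest_values_by_entity(handle, entities, fieldIds)
#   gpuSample = byEntity[(dcgm_fields.DCGM_FE_GPU, gpuIds[0],
#                         dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL)]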
Example #2
def test_dcgm_prof_all_supported_fields_watchable(handle, gpuId):
    '''
    Verify that all fields that are reported as supported are watchable and 
    that values can be returned for them
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', [gpuId])

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_supported_field_ids(dcgmGroup)
    assert fieldIds is not None

    watchFreq = 1000 #1 ms
    maxKeepAge = 60.0
    maxKeepSamples = 0
    maxAgeUsec = int(maxKeepAge) * watchFreq

    entityPairList = [dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU, gpuId)]

    for fieldId in fieldIds:
        # If the group contains only unsupported SKUs, WatchFields returns an error.
        # If at least one GPU in the group is supported, WatchFields succeeds.
        # This logic is used to skip unsupported or fake SKUs.
        if dcgmGroup.profiling.WatchFields([fieldId, ], watchFreq, maxKeepAge,
                                           maxKeepSamples) == dcgm_structs.DCGM_ST_PROFILING_NOT_SUPPORTED:
            test_utils.skip_test_supported("DCP")

        # Sending a request to the profiling manager guarantees that an update cycle has happened since 
        # the last request
        dcgmGroup.profiling.GetSupportedMetricGroups()

        # validate watch freq, quota, and watched flags
        cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gpuId, fieldId)
        assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) != 0, "gpuId %u, fieldId %u not watched" % (gpuId, fieldId)
        assert cmfi.numSamples > 0
        assert cmfi.numWatchers == 1, "numWatchers %d" % cmfi.numWatchers
        assert cmfi.monitorFrequencyUsec == watchFreq, "monitorFrequencyUsec %u != watchFreq %u" % (cmfi.monitorFrequencyUsec, watchFreq)
        assert cmfi.lastStatus == dcgm_structs.DCGM_ST_OK, "lastStatus %u != DCGM_ST_OK" % (cmfi.lastStatus)

        fieldValues = dcgm_agent.dcgmEntitiesGetLatestValues(handle, entityPairList, [fieldId, ], 0)

        for i, fieldValue in enumerate(fieldValues):
            logger.debug(str(fieldValue))
            assert(fieldValue.status == dcgm_structs.DCGM_ST_OK), "idx %d status was %d" % (i, fieldValue.status)
            assert(fieldValue.ts != 0), "idx %d timestamp was 0" % (i)

        dcgmGroup.profiling.UnwatchFields()

        #Validate watch flags after unwatch
        cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gpuId, fieldId)
        assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) == 0, "gpuId %u, fieldId %u still watched. flags x%X" % (gpuId, fieldId, cmfi.flags)
        assert cmfi.numWatchers == 0, "numWatchers %d" % cmfi.numWatchers
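
The per-sample checks above stop at status and timestamp. Before trusting
v.value itself, DCGM code usually also rejects "blank" sentinel values. A
minimal sketch, assuming the dcgmvalue module's int64 sentinel
DCGM_INT64_BLANK (Example #3 below uses the string sentinel DCGM_STR_BLANK
from the same module); sample_is_usable is a hypothetical helper name:

import dcgmvalue

def sample_is_usable(fieldValue):
    # A sample is usable only if the call succeeded, a timestamp was recorded,
    # and the int64 value is below the blank-sentinel range (assumption: all
    # DCGM int64 sentinels are >= DCGM_INT64_BLANK).
    if fieldValue.status != dcgm_structs.DCGM_ST_OK:
        return False
    if fieldValue.ts == 0:
        return False
    return fieldValue.value.i64 < dcgmvalue.DCGM_INT64_BLANK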
Example #3
def verify_profile_names_exist(handle, migEntityList, isGpuInstance):
    fieldIds = [dcgm_fields.DCGM_FI_DEV_NAME]
    entities = []
    for entityId in migEntityList:
        entity = dcgm_structs.c_dcgmGroupEntityPair_t()
        if isGpuInstance:
            entity.entityGroupId = dcgm_fields.DCGM_FE_GPU_I
        else:
            entity.entityGroupId = dcgm_fields.DCGM_FE_GPU_CI
        entity.entityId = entityId
        entities.append(entity)

    values = dcgm_agent.dcgmEntitiesGetLatestValues(
        handle, entities, fieldIds, dcgm_structs.DCGM_FV_FLAG_LIVE_DATA)

    for v in values:
        assert len(v.value.str) and v.value.str != dcgmvalue.DCGM_STR_BLANK, \
               "Expected a non-empty profile name, but found '%s'" % (v.value.str)
Example #4
def verify_fake_profile_names(handle, fakeEntities, isGpuInstance):
    fieldIds = [dcgm_fields.DCGM_FI_DEV_NAME]
    entities = []
    for entityId in fakeEntities:
        entity = dcgm_structs.c_dcgmGroupEntityPair_t()
        if isGpuInstance:
            entity.entityGroupId = dcgm_fields.DCGM_FE_GPU_I
        else:
            entity.entityGroupId = dcgm_fields.DCGM_FE_GPU_CI
        entity.entityId = entityId
        entities.append(entity)

    values = dcgm_agent.dcgmEntitiesGetLatestValues(
        handle, entities, fieldIds, dcgm_structs.DCGM_FV_FLAG_LIVE_DATA)

    if isGpuInstance:
        expectedFakeName = "1fg.4gb"
    else:
        expectedFakeName = "1fc.1g.4gb"

    for v in values:
        assert v.value.str == expectedFakeName, "Fake profile name appears to be wrong. Expected '%s', found '%s'" % (
            expectedFakeName, v.value.str)
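
Examples #3 and #4 build the same entity list and issue the same live-data
query; only the final assertion differs. The shared construction could be
factored into one helper, sketched here (build_mig_entities is a hypothetical
name; the constructor arguments follow the positional form used in Examples
#2, #5, and #6):

def build_mig_entities(migEntityIds, isGpuInstance):
    # GPU instances belong to group DCGM_FE_GPU_I, compute instances to
    # DCGM_FE_GPU_CI; everything else about the entity pair is identical.
    groupId = (dcgm_fields.DCGM_FE_GPU_I if isGpuInstance
               else dcgm_fields.DCGM_FE_GPU_CI)
    return [dcgm_structs.c_dcgmGroupEntityPair_t(groupId, entityId)
            for entityId in migEntityIds]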
Example #5
File: test_prof.py  Project: omertuc/DCGM
def test_dcgm_prof_all_supported_fields_watchable(handle, gpuIds):
    '''
    Verify that all fields that are reported as supported are watchable and 
    that values can be returned for them
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_supported_field_ids(dcgmGroup)
    assert fieldIds is not None

    watchFreq = 1000  #1 ms
    maxKeepAge = 60.0
    maxKeepSamples = 0
    maxAgeUsec = int(maxKeepAge) * watchFreq

    entityPairList = []
    for gpuId in gpuIds:
        entityPairList.append(
            dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU,
                                                 gpuId))

    for fieldId in fieldIds:
        dcgmGroup.profiling.WatchFields([
            fieldId,
        ], watchFreq, maxKeepAge, maxKeepSamples)

        # Sending a request to the profiling manager guarantees that an update cycle has happened since
        # the last request
        dcgmGroup.profiling.GetSupportedMetricGroups()

        # validate watch freq, quota, and watched flags
        for gpuId in gpuIds:
            cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handle, gpuId, fieldId)
            assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED
                    ) != 0, "gpuId %u, fieldId %u not watched" % (gpuId,
                                                                  fieldId)
            assert cmfi.numSamples > 0
            assert cmfi.numWatchers == 1, "numWatchers %d" % cmfi.numWatchers
            assert cmfi.monitorFrequencyUsec == watchFreq, "monitorFrequencyUsec %u != watchFreq %u" % (
                cmfi.monitorFrequencyUsec, watchFreq)
            assert cmfi.lastStatus == dcgm_structs.DCGM_ST_OK, "lastStatus %u != DCGM_ST_OK" % (
                cmfi.lastStatus)

        fieldValues = dcgm_agent.dcgmEntitiesGetLatestValues(
            handle, entityPairList, [
                fieldId,
            ], 0)

        for i, fieldValue in enumerate(fieldValues):
            logger.debug(str(fieldValue))
            assert (fieldValue.status == dcgm_structs.DCGM_ST_OK
                    ), "idx %d status was %d" % (i, fieldValue.status)
            assert (fieldValue.ts != 0), "idx %d timestamp was 0" % (i)

        dcgmGroup.profiling.UnwatchFields()

        #Validate watch flags after unwatch
        for gpuId in gpuIds:
            cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handle, gpuId, fieldId)
            assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED
                    ) == 0, "gpuId %u, fieldId %u still watched. flags x%X" % (
                        gpuId, fieldId, cmfi.flags)
            assert cmfi.numWatchers == 0, "numWatchers %d" % cmfi.numWatchers
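
The watched/unwatched verification loops in this example are mirror images of
each other. A single helper could serve both call sites, as in this sketch
built from the same dcgm_agent_internal calls (assert_watch_state is a
hypothetical name):

def assert_watch_state(handle, gpuIds, fieldId, expectWatched):
    # Verify DCGM_CMI_F_WATCHED for every GPU, in either direction.
    for gpuId in gpuIds:
        cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handle, gpuId, fieldId)
        watched = (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) != 0
        assert watched == expectWatched, \
            "gpuId %u, fieldId %u: watched=%s, expected %s (flags x%X)" % (
                gpuId, fieldId, watched, expectWatched, cmfi.flags)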
Example #6
def helper_test_mig_value_reporting(handle, gpuIds):
    # These fields should report the same value for GPUs, instances, and compute instances
    sameValueFieldIds = [
        dcgm_fields.DCGM_FI_DEV_COMPUTE_MODE,
        dcgm_fields.DCGM_FI_DEV_MIG_MODE,
        dcgm_fields.DCGM_FI_DEV_SHUTDOWN_TEMP,
    ]

    differentValueFieldIds = [
        dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
    ]

    newGpuInstances, newComputeInstances = create_mig_entities_and_verify(
        handle, gpuIds, 3, 1)

    # Make sure we get the same values for these fields on the GPU, instances, and compute instances

    # Build the entity list
    entities = []
    for gpuId in gpuIds:
        entities.append(
            dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU,
                                                 gpuId))
    for instanceId in newGpuInstances:
        entities.append(
            dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU_I,
                                                 instanceId))
    for ciId in newComputeInstances:
        entities.append(
            dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU_CI,
                                                 ciId))

    fieldIds = []
    fieldIds.extend(sameValueFieldIds)
    fieldIds.extend(differentValueFieldIds)
    values = dcgm_agent.dcgmEntitiesGetLatestValues(
        handle, entities, fieldIds, dcgm_structs.DCGM_FV_FLAG_LIVE_DATA)
    gpuValues = {}

    # Build a map of maps of the values reported by the GPUs: gpuId -> fieldId -> value
    for value in values:
        if value.entityGroupId == dcgm_fields.DCGM_FE_GPU:
            if value.entityId not in gpuValues:
                gpuValues[value.entityId] = {}
                gpuValues[value.entityId][value.fieldId] = value.value.i64
            elif value.fieldId not in gpuValues[value.entityId]:
                gpuValues[value.entityId][value.fieldId] = value.value.i64

    errMsg = ''
    for value in values:
        if value.entityGroupId == dcgm_fields.DCGM_FE_GPU_I:
            gpuId = value.entityId % dcgm_structs.DCGM_MAX_INSTANCES_PER_GPU
            same = gpuValues[gpuId][value.fieldId] == value.value.i64
            if not same and value.fieldId in sameValueFieldIds:
                errMsg = errMsg + "\nExpected %d but found %d for field %d GPU instance %d on GPU %d" \
                          % (gpuValues[gpuId][value.fieldId], value.value.i64, value.fieldId, value.entityId, gpuId)
            elif same and value.fieldId in differentValueFieldIds:
                errMsg = errMsg + "\nExpected different values but found %d for field %d for GPU instance %d on GPU %d" \
                          % (value.value.i64, value.fieldId, value.entityId, gpuId)
        if value.entityGroupId == dcgm_fields.DCGM_FE_GPU_CI:
            gpuId = value.entityId % dcgm_structs.DCGM_MAX_COMPUTE_INSTANCES_PER_GPU
            same = gpuValues[gpuId][value.fieldId] == value.value.i64
            if not same and value.fieldId in sameValueFieldIds:
                errMsg = errMsg + "\nExpected %d but found %d for field %d compute instance %d on GPU %d" \
                          % (gpuValues[gpuId][value.fieldId], value.value.i64, value.fieldId, value.entityId, gpuId)
            elif same and value.fieldId in differentValueFieldIds:
                errMsg = errMsg + "\nExpected different values but found %d for field %d for compute instance %d on GPU %d" \
                          % (value.value.i64, value.fieldId, value.entityId, gpuId)

    ciFailMsg = delete_compute_instances_and_verify(handle,
                                                    newComputeInstances)
    instanceFailMsg = delete_gpu_instances_and_verify(handle, newGpuInstances)

    if ciFailMsg != '':
        logger.warning("The compute instances didn't clean up correctly: %s" %
                       ciFailMsg)
    if instanceFailMsg != '':
        logger.warning("The GPU instances didn't clean up correctly: %s" %
                       instanceFailMsg)

    assert errMsg == '', errMsg
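
The nested gpuId -> fieldId -> value map at the top of this example can be
built more compactly with dict.setdefault; a behavior-equivalent sketch:

gpuValues = {}
for value in values:
    if value.entityGroupId == dcgm_fields.DCGM_FE_GPU:
        # setdefault creates the per-GPU dict on first sight of a gpuId, and
        # keeps the first value seen for each (gpuId, fieldId), matching the
        # original if/elif logic.
        gpuValues.setdefault(value.entityId, {}).setdefault(
            value.fieldId, value.value.i64)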