Пример #1
0
def main():
    """Run a sequence of small DcgmReader demonstrations.

    Each demonstration is announced on stdout before it runs:

    1. a DcgmReader built with no arguments,
    2. DcgmReaderDictionary() with an explicit list of field ids,
    3. a DataHandlerReader,
    4. a FieldHandlerReader.
    """
    print('Some examples of different DcgmReader usages')

    # 1) Reader constructed with all defaults.
    print('\n\nThe default interaction')
    defaultReader = DcgmReader()
    defaultReader.Process()

    # 2) Explicit field selection, printed via the dictionary helper.
    print('\n\nUsing custom fields through the dictionary interface...')
    selectedFields = [
        dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL,
        dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
    ]
    DcgmReaderDictionary(field_ids=selectedFields)

    # 3) Reader subclass announced as overriding the data handler.
    print(
        '\n\nProcessing in field order by overriding the CustomerDataHandler() method'
    )
    DataHandlerReader().Process()

    # 4) Reader subclass announced as overriding the per-field handler.
    print(
        '\n\nPrinting a little differently by overriding the CustomFieldHandler() method'
    )
    FieldHandlerReader().Process()
Пример #2
0
def test_reading_pid_fields(handle, gpuIds, cudaApp):
    """
    Check that the compute-PID field can be read and decoded via DcgmReader.

    The reader is polled up to 10 times with a 0.2 second pause after each
    poll. Every ``pid`` found under DCGM_FI_DEV_COMPUTE_PIDS is recorded, and
    polling stops after the first poll in which cudaApp.getpid() was seen.
    The test fails if a poll returns no data or if the PID never shows up.
    """
    pidField = dcgm_fields.DCGM_FI_DEV_COMPUTE_PIDS
    seenPids = []

    reader = DcgmReader(fieldIds=[pidField], updateFrequency=100000)
    logger.debug("Trying for 2 seconds")

    for _ in range(10):
        snapshot = reader.GetLatestGpuValuesAsFieldIdDict()
        assert len(snapshot) > 0

        matched = False
        for gpuId in snapshot:
            perGpu = snapshot[gpuId]
            if pidField not in perGpu:
                continue

            pid = perGpu[pidField].pid
            seenPids.append(pid)
            if pid == cudaApp.getpid():
                # Found our PID; finish scanning this snapshot, then stop
                matched = True

        time.sleep(0.2)
        if matched:
            break

    logger.debug("PIDs: %s. cudaApp PID: %d" % (str(seenPids), cudaApp.getpid()))
    assert cudaApp.getpid() in seenPids, "could not find cudaApp PID"
Пример #3
0
def test_dcgm_reader_default(handle):
    """
    Check the data returned by a DcgmReader constructed with no arguments.

    For the field-name view: no GPU may report more entries than there are
    default field ids, and every key must be a string.
    For the field-id view: the same size bound applies, every key must be an
    int, and each id must be resolvable through dcgm_fields.DcgmFieldGetById.
    """
    # pylint: disable=undefined-variable
    dr = DcgmReader()
    dr.SetHandle(handle)
    latest = dr.GetLatestGpuValuesAsFieldNameDict()

    for gpuId in latest:
        # latest data might be less than the list, because blank values aren't included
        # Defined in DcgmReader
        # pylint: disable=undefined-variable
        assert len(latest[gpuId]) <= len(defaultFieldIds)

        # Make sure we get strings
        for key in latest[gpuId]:
            assert isinstance(key, str)

    sample = dr.GetLatestGpuValuesAsFieldIdDict()

    for gpuId in sample:
        # Defined in DcgmReader
        # pylint: disable=undefined-variable
        assert len(sample[gpuId]) <= len(defaultFieldIds)

        # Make sure we get valid integer field ids
        for fieldId in sample[gpuId]:
            assert isinstance(fieldId, int)
            # Identity check: a "not found" result is None, and `is not`
            # cannot be affected by a custom __ne__ on the returned object.
            assert dcgm_fields.DcgmFieldGetById(fieldId) is not None
Пример #4
0
def test_dcgm_reader_specific_fields(handle):
    """
    Check that a reader limited to two field ids never reports more than
    two entries for any GPU in its field-name view.
    """
    requestedFields = [
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
        dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
    ]
    reader = DcgmReader(fieldIds=requestedFields)
    reader.SetHandle(handle)

    valuesByGpu = reader.GetLatestGpuValuesAsFieldNameDict()
    upperBound = len(requestedFields)

    for gpuId in valuesByGpu:
        assert len(valuesByGpu[gpuId]) <= upperBound
Пример #5
0
def test_dcgm_prof_with_dcgmreader(handle, gpuIds):
    """
    Verifies that we can access profiling data with DcgmReader, which is the
    base class for dcgm exporters.

    A group is created for gpuIds, the profiling environment is checked, and
    a reader is built for the field ids returned by
    helper_get_single_pass_field_ids(). The reader is then polled 10 times;
    on every poll each GPU in gpuIds must report exactly one entry per
    requested field id.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()

    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)

    updateFrequencyUsec = 10000
    # Sleep 2x the update frequency (converted from usec to seconds) so we
    # get new values each time
    sleepTime = 2 * (updateFrequencyUsec / 1000000.0)

    dr = DcgmReader.DcgmReader(fieldIds=fieldIds, updateFrequency=updateFrequencyUsec, maxKeepAge=30.0, gpuIds=gpuIds)
    dr.SetHandle(handle)

    for i in range(10):
        time.sleep(sleepTime)

        latest = dr.GetLatestGpuValuesAsFieldIdDict()
        logger.info(str(latest))

        for gpuId in gpuIds:
            # Report the length for the GPU being checked. `i` is the poll
            # iteration, not an index into gpuIds, so it must not be used to
            # look up a GPU here.
            assert len(latest[gpuId]) == len(fieldIds), "i=%d, gpuId %d, len %d != %d" % (i, gpuId, len(latest[gpuId]), len(fieldIds))
Пример #6
0
def test_reading_specific_data(handle, gpuIds):
    """
    Inject known int64 values for three fields on the first GPU and check
    that DcgmReader hands back exactly those values.
    """
    # Not referenced again below; kept so the sequence of calls is unchanged.
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()

    # (field id, value to inject) pairs
    injections = [
        (dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, 1),
        (dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION, 1000),
        (dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION, 9000),
    ]
    injectedFieldIds = [fieldId for fieldId, _ in injections]

    for fieldId, value in injections:
        injected = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        injected.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        injected.fieldId = fieldId
        injected.status = 0
        injected.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
        # Timestamp the sample 10 seconds into the future, in microseconds
        injected.ts = int((time.time() + 10) * 1000000.0)
        injected.value.i64 = value

        ret = dcgm_agent_internal.dcgmInjectFieldValue(
            handle, gpuIds[0], injected)
        assert (ret == dcgm_structs.DCGM_ST_OK)

    # pylint: disable=undefined-variable
    reader = DcgmReader(fieldIds=injectedFieldIds)
    reader.SetHandle(handle)
    firstGpuValues = reader.GetLatestGpuValuesAsFieldIdDict()[gpuIds[0]]

    assert len(firstGpuValues) == len(injectedFieldIds)

    for fieldId, value in injections:
        assert firstGpuValues[fieldId] == value
Пример #7
0
def util_dcgm_reader_all_since_last_call(handle, flag, repeat):
    """
    Shared body for tests of DcgmReader.GetAllGpuValuesAsDictSinceLastCall.

    Confirms that the call completes and that every per-GPU result contains
    only expected keys: field tags when ``flag == False``, otherwise the
    requested field ids themselves.

    Arguments:
        handle: DCGM handle
        flag:   argument passed to GetAllGpuValuesAsDictSinceLastCall
        repeat: when truthy, the call is issued a second time and only the
                second result is checked
    """
    requestedFields = [
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
        dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
    ]
    # pylint: disable=undefined-variable
    reader = DcgmReader(fieldIds=requestedFields)
    reader.SetHandle(handle)

    collected = reader.GetAllGpuValuesAsDictSinceLastCall(flag)
    if repeat:
        collected = reader.GetAllGpuValuesAsDictSinceLastCall(flag)

    # Explicit comparison on purpose: only a value equal to False selects
    # the tag-based expectation (None, for instance, does not).
    keyedByTag = (flag == False)

    if keyedByTag:
        dcgmSystem = pydcgm.DcgmHandle(handle).GetSystem()
        allowedKeys = [
            dcgmSystem.fields.GetFieldById(fieldId).tag
            for fieldId in requestedFields
        ]
    else:
        allowedKeys = requestedFields

    for gpuId in collected:
        perGpu = collected[gpuId]

        # There may be fewer entries than requested because blank values are
        # left out; the point is that nothing crashes and nothing extra
        # comes back.
        assert len(perGpu) <= len(requestedFields)

        for key in perGpu.keys():
            assert key in allowedKeys
Пример #8
0
def DcgmReaderDictionary(field_ids=defaultFieldIds,
                         update_frequency=10000000,
                         keep_time=3600.0,
                         ignores=None,
                         field_groups='dcgm_fieldgroupdata'):
    """
    Build a DcgmReader and print its latest values, one line per GPU field.

    Arguments:
        field_ids:        forwarded to DcgmReader as ``fieldIds``
        update_frequency: forwarded as ``updateFrequency``
                          (presumably microseconds -- confirm against DcgmReader)
        keep_time:        forwarded as ``maxKeepAge``
        ignores:          forwarded as ``ignoreList``; None (the default)
                          means a new empty list
        field_groups:     forwarded as ``fieldGroupName``

    Returns nothing; output goes to stdout.
    """
    # Build the list per call rather than sharing one mutable default
    # object across every invocation.
    if ignores is None:
        ignores = []

    # Instantiate a DcgmReader object
    dr = DcgmReader(fieldIds=field_ids,
                    updateFrequency=update_frequency,
                    maxKeepAge=keep_time,
                    ignoreList=ignores,
                    fieldGroupName=field_groups)

    # Get the requested fields as a dictionary of dictionaries:
    # gpuId -> field name -> field value
    data = dr.GetLatestGpuValuesAsFieldNameDict()

    # Print the dictionary
    for gpuId in data:
        for fieldName in data[gpuId]:
            print("For gpu %s field %s=%s" %
                  (str(gpuId), fieldName, data[gpuId][fieldName]))