Exemplo n.º 1
0
def test_dcgm_agent_get_values_for_fields(handle, gpuIds):
    """
    Verifies that watching a field and fetching its latest value works.

    Watches DCGM_FI_DEV_NAME on the first GPU, forces an update of all
    fields, then checks that the fetched value is a non-empty string.
    (Previous docstring incorrectly described engine initialization.)
    """
    # Watch field so we can fetch it
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    gpuId = gpuIds[0]

    # updateFreq=10000000 usec, maxKeepAge=86400.0 sec, maxKeepEntries=0 (no cap)
    ret = dcgm_agent_internal.dcgmWatchFieldValue(handle, gpuId, fieldId,
                                                  10000000, 86400.0, 0)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # wait for at least one update of the field before trying to read it
    ret = dcgm_agent.dcgmUpdateAllFields(handle, True)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    values = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, gpuId, [
            fieldId,
        ])
    assert values[0].status == dcgm_structs.DCGM_ST_OK
    # fieldType comes back as an integer char code; compare against the
    # one-character DCGM_FT_STRING constant
    assert chr(
        values[0].fieldType
    ) == dcgm_fields.DCGM_FT_STRING, "Wrong field type: %s" % values[
        0].fieldType
    assert len(values[0].value.str) > 0
    logger.debug("Brand of GPU %u is %s" % (gpuId, values[0].value.str))
Exemplo n.º 2
0
    def GetGpus(self):
        """
        Populate self.gpus with one ProcessStatsStressGpu entry per GPU in
        the default group, recording each GPU's PCI bus ID.
        """
        self.groupId = dcgm_agent.dcgmGroupCreate(
            self.heHandle, dcgm_structs.DCGM_GROUP_DEFAULT, self.groupName)
        groupInfo = dcgm_agent.dcgmGroupGetInfo(
            self.heHandle, self.groupId, dcgm_structs.c_dcgmGroupInfo_version2)

        gpuIds = groupInfo.gpuIdList[0:groupInfo.count]

        self.Log("Running on %d GPUs" % len(gpuIds))

        #Watch parameters are the same for every GPU; hoisted out of the loop
        fieldId = dcgm_fields.DCGM_FI_DEV_PCI_BUSID
        updateFreq = 100000
        maxKeepAge = 3600.0  #one hour
        maxKeepEntries = 0  #no limit

        for gpuId in gpuIds:
            newGpu = ProcessStatsStressGpu()
            newGpu.gpuId = gpuId
            self.gpus.append(newGpu)

            #Watch the busid of the GPU so we can read it below
            dcgm_agent_internal.dcgmWatchFieldValue(self.heHandle, gpuId,
                                                    fieldId, updateFreq,
                                                    maxKeepAge, maxKeepEntries)

        #Update all of the new watches
        dcgm_agent.dcgmUpdateAllFields(self.heHandle, 1)

        for gpu in self.gpus:
            #Bug fix: query each GPU's own id (gpu.gpuId). The old code used
            #the stale loop variable gpuId from the loop above, so every GPU
            #was assigned the LAST GPU's bus id.
            values = dcgm_agent_internal.dcgmGetLatestValuesForFields(
                self.heHandle, gpu.gpuId, [
                    fieldId,
                ])
            busId = values[0].value.str
            gpu.busId = busId

            self.Log("    GPUID %d, busId %s" % (gpu.gpuId, gpu.busId))
Exemplo n.º 3
0
def helper_test_dcgm_policy_inject_xiderror(handle, gpuIds):
    """ 
    Verifies that we can inject an XID error and receive a callback
    """
    # Build a v1 policy that triggers on the XID error condition
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()

    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_XID
    # NOTE(review): parms[6] presumably corresponds to the XID condition and
    # tag 0 selects the boolean union member -- confirm against dcgm_structs
    newPolicy.parms[6].tag = 0
    newPolicy.parms[6].val.boolean = True

    dcgmHandle = pydcgm.DcgmHandle(handle)
    # Find the first GPU that actually supports the XID errors field;
    # unsupported GPUs report DCGM_INT64_NOT_SUPPORTED
    validDeviceId = -1
    devices = gpuIds
    for x in devices:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [
                dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
            ])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDeviceId = x
            break
    if (validDeviceId == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU that supports XID errors is present"
        )

    # Set the policy and register the callback on a one-GPU group BEFORE
    # injecting, so the injected value is seen by the policy manager
    group = pydcgm.DcgmGroup(dcgmHandle,
                             groupName="test1",
                             groupType=dcgm_structs.DCGM_GROUP_EMPTY)
    group.AddGpu(validDeviceId)
    group.policy.Set(newPolicy)

    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_XID,
                          finishCallback=c_callback)

    # Inject a fake XID error value (errnum 16) for the chosen GPU
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_XID_ERRORS
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) *
                   1000000.0)  # set the injected data into the future
    field.value.i64 = 16

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, validDeviceId,
                                                   field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # wait for the the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert(dcgm_structs.DCGM_POLICY_COND_XID == callbackData.condition), \
            ("XID error callback was not for a XID error, got: %s" % callbackData.condition)
    assert (16 == callbackData.val.xid.errnum), (
        'Expected XID error 16 but got %s' % callbackData.val.xid.errnum)
Exemplo n.º 4
0
                                % (x, config_values[x].mEccMode, expected_ecc)
        pass

    ret = dcgm_agent.dcgmStatusDestroy(status_handle)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to remove status handler, error: %s" % ret


#Initialize the DCGM library and get a host engine handle
dcgm_structs._LoadDcgmLibrary()
handle = dcgm_agent.dcgmInit()

#Find all GPUs that support ECC: GPUs without ECC report
#DCGM_INT64_NOT_SUPPORTED for the retired-DBE-pages field
devices = dcgm_agent.dcgmGetAllDevices(handle)
validDevices = list()
for x in devices:
    fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, x, [
            dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        ])
    if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
        validDevices.append(x)

if (len(validDevices) == 0):
    #print() call form for consistency with the rest of the file
    #(single-argument form behaves the same on py2 and py3)
    print("Can only run if at least one GPU with ECC is present")
    sys.exit(1)

print("Number of valid devices: %d" % len(validDevices))

#Create an empty group and a status handle for subsequent operations
groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                     "test1")
statusHandle = dcgm_agent.dcgmStatusCreate()
for device in validDevices:
Exemplo n.º 5
0
# Build a v1 policy that fires when the number of retired pages exceeds a
# threshold, with a GPU-reset action and short system validation
newPolicy = dcgm_structs.c_dcgmPolicy_v1()
handle = dcgm_agent.dcgmInit()

newPolicy.version = dcgm_structs.dcgmPolicy_version1
newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED
newPolicy.action = dcgm_structs.DCGM_POLICY_ACTION_GPURESET
newPolicy.validation = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
# NOTE(review): parms[2] presumably maps to the retired-pages condition and
# tag 1 selects the long-long union member (threshold of 5 pages) -- confirm
# against dcgm_structs
newPolicy.parms[2].tag = 1
newPolicy.parms[2].val.llval = 5

# find a GPU that supports retired pages (otherwise internal test will ignore it)
devices = dcgm_agent.dcgmGetAllDevices(handle)
validDevice = -1
for x in devices:
    fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, x, [
            dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        ])
    if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
        validDevice = x
        break

if (validDevice == -1):
    print("Can only run if at least one GPU with ECC is present")
    sys.exit(1)

# Create an empty group plus a status handle, then add the chosen GPU
groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                     "test1")
statusHandle = dcgm_agent.dcgmStatusCreate()

ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
assert (ret == dcgm_structs.DCGM_ST_OK)
Exemplo n.º 6
0
def test_dcgm_configure_ecc_mode(handle, gpuIds):
    """
    Toggles the ECC mode on one ECC-capable GPU via dcgmConfigSet and
    verifies the new mode is reflected by dcgmConfigGet.
    """
    # NOTE(review): skip_test presumably raises, which makes the remainder
    # of this test unreachable until the referenced bug is fixed
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    # Find the first GPU that supports ECC (unsupported GPUs report
    # DCGM_INT64_NOT_SUPPORTED for the current-ECC field)
    validDevice = -1
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [
                dcgm_fields.DCGM_FI_DEV_ECC_CURRENT,
            ])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to add a device to the group %d. Return %d" % (
                groupId.value, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    #Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # Flip the current mode: 0 -> 1 or 1 -> 0
    eccmodeOnGroupExisting = config_values[0].mEccMode
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    #print eccmodeOnGroupExisting
    #print eccmodeOnGroupToSet

    ## Toggle the ECC mode on the group
    # All other config fields are set to BLANK so only ECC mode is changed
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = eccmodeOnGroupToSet
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    # Errors from the set are collected via the status handle below, so a
    # raised DCGMError here is intentionally ignored
    try:
        ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values,
                                       status_handle)
    except dcgm_structs.DCGMError as e:
        pass

    errors = helper_get_status_list(status_handle)

    # Any per-field error means the toggle could not be applied; skip rather
    # than fail since this depends on the machine's state
    if len(errors) > 0:
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    #Sleep after reset
    time.sleep(2)

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Get the current configuration
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # If pending != current ECC, the change needs a reboot to take effect,
    # so only warn; otherwise assert the new mode matches what we set
    fvs = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, validDevice, [
            dcgm_fields.DCGM_FI_DEV_ECC_PENDING,
            dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
        ])
    if fvs[0].value.i64 != fvs[1].value.i64:
        logger.warning(
            "Pending ECC %d != Current ECC %d for gpuId %d. Box probably needs a reboot"
            % (fvs[0].value.i64, fvs[1].value.i64, validDevice))
    else:
        assert config_values[0].mEccMode == (eccmodeOnGroupToSet), "ECC mode %d different from the set value %d" % \
                                                                   (config_values[0].mEccMode, eccmodeOnGroupToSet)
Exemplo n.º 7
0
def test_dcgm_vgpu_configure_ecc_mode(handle, gpuIds):
    """
    Toggles the ECC mode on one ECC-capable GPU via dcgmVgpuConfigSet and
    verifies the new mode is reflected by dcgmVgpuConfigGet.
    """
    # NOTE(review): skip_test presumably raises, which makes the remainder
    # of this test unreachable until the referenced bug is fixed
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    # Find the first GPU that supports ECC (unsupported GPUs report
    # DCGM_INT64_NOT_SUPPORTED for the retired-DBE-pages field)
    validDevice = -1
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [dcgm_fields.DCGM_FI_DEV_RETIRED_DBE])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            # Bug fix: break only once a supported GPU is found. The break
            # used to sit at loop level, so only gpuIds[0] was ever checked.
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (
        ret == dcgm_structs.DCGM_ST_OK
    ), "Failed to add a device to the group %d. Return %d" % (groupId, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    #Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    vgpu_config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        vgpu_config_values) > 0, "Failed to work with NULL status handle"

    # Flip the current mode: 0 -> 1 or 1 -> 0
    eccmodeOnGroupExisting = vgpu_config_values[0].mEccMode
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    ## Toggle the ECC mode on the group
    # All other config fields are set to BLANK so only ECC mode is changed
    vgpu_config_values = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpu_config_values.mEccMode = eccmodeOnGroupToSet
    vgpu_config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    # Errors from the set are collected via the status handle below, so a
    # raised DCGMError here is intentionally ignored
    try:
        ret = dcgm_agent_internal.dcgmVgpuConfigSet(handle, groupId,
                                                    vgpu_config_values,
                                                    status_handle)
    except dcgm_structs.DCGMError:
        pass

    errors = helper_get_status_list(status_handle)

    # Any per-field error means the toggle could not be applied; skip rather
    # than fail since this depends on the machine's state
    if len(errors) > 0:
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    #Sleep after reset and then apply update for it to occur
    time.sleep(2)

    dcgm_agent.dcgmUpdateAllFields(handle, 1)

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Get the current configuration
    config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(config_values
               ) > 0, "Failed to get configuration using dcgmiVgpuConfigGet"

    assert config_values[0].mEccMode == (
        eccmodeOnGroupToSet), "ECC mode different from the set value"
Exemplo n.º 8
0
def test_dcgm_injection_agent(handle, gpuIds):
    """
    Verifies that injection works with the agent host engine
    """
    gpuId = gpuIds[0]

    #Make a base value that is good for starters
    fvGood = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    fvGood.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
    fvGood.status = 0
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    fvGood.ts = get_usec_since_1970()
    fvGood.value.i64 = 1

    # Sample count before injection, used to prove the injection landed
    fieldInfoBefore = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countBefore = fieldInfoBefore.numSamples

    #This will throw an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvGood)

    fieldInfoAfter = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countAfter = fieldInfoAfter.numSamples

    assert countAfter > countBefore, "Expected countAfter %d > countBefore %d after injection" % (
        countAfter, countBefore)

    #Fetch the value we just inserted and verify its attributes are the same
    fvFetched = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, gpuId, [
            fvGood.fieldId,
        ])[0]
    helper_verify_fv_equal(fvFetched, fvGood)

    #Should be able to insert a null timestamp. The agent will just use "now"
    # NOTE: this is an alias of fvGood, not a copy -- mutations below affect
    # both names; the sequence relies on restoring fields afterwards
    fvAlsoGood = fvGood
    fvAlsoGood.ts = 0
    #This will thrown an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvAlsoGood)

    #Now make some attributes bad and expect an error
    # fvBad aliases fvGood as well; fieldType is restored to INT64 below
    fvBad = fvGood
    fvBad.fieldType = ord(dcgm_fields.DCGM_FT_DOUBLE)
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)

    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    """ TODO: DCGM-2130 - Restore this test when protobuf is removed
    #Now make some attributes bad and expect an error
    fvBad = fvGood
    fvBad.version = 0
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)

    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    """
    # Out-of-range fieldId should also be rejected with BADPARAM
    fvBad = fvGood
    fvBad.fieldId = dcgm_fields.DCGM_FI_MAX_FIELDS + 100
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)