Пример #1
0
def test_dcgm_policy_inject_retiredpages_standalone(handle, gpuIds):
    """
    Inject retired-page counts and verify the max-retired-pages policy callback.

    handle -- DCGM handle, wrapped by pydcgm.DcgmHandle and used for injection
    gpuIds -- GPU ids placed into the test group; values are injected on gpuIds[0]
    """
    condition = dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED
    injectedPages = 10

    # Policy for the retired-pages condition: parms[2] gets tag 1 and llval 5
    policy = dcgm_structs.c_dcgmPolicy_v1()
    policy.version = dcgm_structs.dcgmPolicy_version1
    policy.condition = condition
    policy.parms[2].tag = 1
    policy.parms[2].val.llval = 5

    # Group all supplied GPUs and apply the policy to the group
    system = pydcgm.DcgmHandle(handle).GetSystem()
    group = system.GetGroupWithGpuIds("test1", gpuIds)
    group.policy.Set(policy)

    # Callbacks are delivered through a queue so the test can block on them
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(condition, finishCallback=c_callback)

    # Describe the injected sample: an INT64 retired-DBE page count
    sample = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    sample.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    sample.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_DBE
    sample.status = 0
    sample.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    # timestamp the sample 60 seconds ahead of now (scaled by 1e6)
    sample.ts = int((time.time() + 60) * 1000000.0)
    sample.value.i64 = injectedPages

    status = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], sample)
    assert status == dcgm_structs.DCGM_ST_OK

    # Reuse the same sample for the SBE counter so that the health check code
    # gets past its internal checks
    sample.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_SBE

    status = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], sample)
    assert status == dcgm_structs.DCGM_ST_OK

    # Block until the policy manager calls back, or fail on timeout
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # The callback must be for our condition and report the injected DBE count
    assert condition == callbackData.condition, (
        "error callback was not for a retired pages, got: %s" %
        callbackData.condition)
    assert injectedPages == callbackData.val.mpr.dbepages, (
        'Expected %s errors but got %s' %
        (injectedPages, callbackData.val.mpr.dbepages))
Пример #2
0
def test_dcgm_policy_inject_nvlinkerror_standalone(handle, gpuIds):
    """
    Verifies that we can inject an NVLINK error and receive a policy callback

    handle -- DCGM handle, wrapped by pydcgm.DcgmHandle and used for injection
    gpuIds -- GPU ids placed into the test group; the error is injected on gpuIds[0]
    """
    expectedFieldId = dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL

    newPolicy = dcgm_structs.c_dcgmPolicy_v1()

    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_NVLINK
    newPolicy.parms[5].tag = 0
    newPolicy.parms[5].val.boolean = True

    # group every supplied GPU and apply the NVLINK policy to the group
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    group = dcgmSystem.GetGroupWithGpuIds('test1', gpuIds)
    group.policy.Set(newPolicy)

    # callbacks are pushed onto a queue so the test can block on them
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_NVLINK,
                          finishCallback=c_callback)

    # inject a single NVLINK CRC flit error
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = expectedFieldId
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) *
                   1000000.0)  # set the injected data into the future
    field.value.i64 = 1

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # wait for the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert(dcgm_structs.DCGM_POLICY_COND_NVLINK == callbackData.condition), \
            ("NVLINK error callback was not for a NVLINK error, got: %s" % callbackData.condition)
    # report the expected id from the constant rather than a hard-coded number
    assert(expectedFieldId == callbackData.val.nvlink.fieldId), \
            ("Expected fieldId %s but got %s" % (expectedFieldId, callbackData.val.nvlink.fieldId))
    # the failure message previously said "PCI error" (copy-paste from the PCI test)
    assert (
        1 == callbackData.val.nvlink.counter
    ), 'Expected 1 NVLINK error but got %s' % callbackData.val.nvlink.counter
Пример #3
0
def helper_dcgm_policy_inject_eccerror(handle, gpuIds):
    """
    Inject a volatile device DBE count and verify the DBE policy callback.

    handle -- DCGM handle, wrapped by pydcgm.DcgmHandle and used for injection
    gpuIds -- GPU ids placed into the test group; the error is injected on gpuIds[0]
    """
    # DBE policy: parms[0] gets tag 0 and a boolean value of True
    policy = dcgm_structs.c_dcgmPolicy_v1()
    policy.version = dcgm_structs.dcgmPolicy_version1
    policy.condition = dcgm_structs.DCGM_POLICY_COND_DBE
    policy.parms[0].tag = 0
    policy.parms[0].val.boolean = True

    system = pydcgm.DcgmHandle(handle).GetSystem()
    group = system.GetGroupWithGpuIds("test1", gpuIds)
    group.policy.Set(policy)

    # the order of the callbacks will change once implementation is complete
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_DBE, c_callback, None)

    # Describe the injected sample: one INT64 double-bit ECC error
    sample = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    sample.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    sample.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_DEV
    sample.status = 0
    sample.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    # timestamp the sample 60 seconds ahead of now (scaled by 1e6)
    sample.ts = int((time.time() + 60) * 1000000.0)
    sample.value.i64 = 1
    logger.debug("injecting %s for gpuId %d" % (str(sample), gpuIds[0]))

    status = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], sample)
    assert status == dcgm_structs.DCGM_ST_OK

    # Block until the policy manager calls back, or fail on timeout
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # Validate condition, error count and reported location of the callback
    assert dcgm_structs.DCGM_POLICY_COND_DBE == callbackData.condition, (
        "error callback was not for a DBE error, got: %s" %
        callbackData.condition)
    assert 1 == callbackData.val.dbe.numerrors, (
        'Expected 1 DBE error but got %s' % callbackData.val.dbe.numerrors)
    assert (dcgm_structs.c_dcgmPolicyConditionDbe_t.LOCATIONS['DEVICE'] ==
            callbackData.val.dbe.location), (
        'got: %s' % callbackData.val.dbe.location)
Пример #4
0
def helper_dcgm_policy_inject_pcierror(handle, gpuIds):
    """
    Inject a PCIe replay count and verify the PCI policy callback.

    handle -- DCGM handle, wrapped by pydcgm.DcgmHandle and used for injection
    gpuIds -- candidate GPU ids; only gpuIds[0] is grouped and injected into
    """
    targetGpu = gpuIds[0]

    # PCI policy: parms[1] gets tag 1 and llval 0
    policy = dcgm_structs.c_dcgmPolicy_v1()
    policy.version = dcgm_structs.dcgmPolicy_version1
    policy.condition = dcgm_structs.DCGM_POLICY_COND_PCI
    policy.parms[1].tag = 1
    policy.parms[1].val.llval = 0

    # Start from an empty group and add just the target GPU
    group = pydcgm.DcgmGroup(
        pydcgm.DcgmHandle(handle),
        groupName="test1",
        groupType=dcgm_structs.DCGM_GROUP_EMPTY,
    )
    group.AddGpu(targetGpu)
    group.policy.Set(policy)

    # Callbacks are delivered through a queue so the test can block on them
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_PCI,
                          finishCallback=c_callback)

    # Describe the injected sample: an INT64 PCIe replay counter of 1
    sample = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    sample.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    sample.fieldId = dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
    sample.status = 0
    sample.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    # timestamp the sample 60 seconds ahead of now (scaled by 1e6)
    sample.ts = int((time.time() + 60) * 1000000.0)
    sample.value.i64 = 1

    status = dcgm_agent_internal.dcgmInjectFieldValue(handle, targetGpu, sample)
    assert status == dcgm_structs.DCGM_ST_OK

    # Block until the policy manager calls back, or fail on timeout
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # Validate the condition and the counter carried by the callback
    assert dcgm_structs.DCGM_POLICY_COND_PCI == callbackData.condition, (
        "PCI error callback was not for a PCI error, got: %s" %
        callbackData.condition)
    assert 1 == callbackData.val.pci.counter, (
        'Expected 1 PCI error but got %s' % callbackData.val.pci.counter)
Пример #5
0
def test_dcgm_policy_set_get_violation_policy_standalone(handle, gpuIds):
    """
    Set a DBE policy on a group and check that reading it back returns the same policy.

    handle -- DCGM handle, wrapped by pydcgm.DcgmHandle
    gpuIds -- GPU ids placed into the test group
    """
    # DBE policy: parms[0] gets tag 0 and a boolean value of True
    expected = dcgm_structs.c_dcgmPolicy_v1()
    expected.version = dcgm_structs.dcgmPolicy_version1
    expected.condition = dcgm_structs.DCGM_POLICY_COND_DBE
    expected.parms[0].tag = 0
    expected.parms[0].val.boolean = True

    system = pydcgm.DcgmHandle(handle).GetSystem()
    group = system.GetGroupWithGpuIds("test1", gpuIds)

    # Round-trip the policy through the group and compare the first entry
    group.policy.Set(expected)
    fetched = group.policy.Get()

    _assert_policies_equal(fetched[0], expected)
Пример #6
0
def vtDcgmPolicyGet(dcgm_handle, group_id, count, status_handle, versionTest):
    """
    Call dcgmPolicyGet with every output struct stamped with a caller-chosen version.

    dcgm_handle   -- handle forwarded to dcgmPolicyGet
    group_id      -- group id forwarded to dcgmPolicyGet
    count         -- number of c_dcgmPolicy_v1 entries to allocate and return
    status_handle -- status handle forwarded to dcgmPolicyGet
    versionTest   -- value written into the version field of each entry

    Returns a list with the first `count` policy structs; a failing return
    code is handed to dcgm_structs._dcgmCheckReturn.
    """
    fn = dcgmFP("dcgmPolicyGet")

    # Output buffer: a ctypes array of `count` policy structs
    c_policy_values = (dcgm_structs.c_dcgmPolicy_v1 * count)()

    # Log the version computed for a policy struct (debug aid only)
    policy = dcgm_structs.c_dcgmPolicy_v1()
    policy.version = dcgm_structs.make_dcgm_version(policy, 1)
    logger.debug("Structure version: %d" % policy.version)

    # Log the version computed for a callback response struct (debug aid only)
    policyCallback = dcgm_structs.c_dcgmPolicyCallbackResponse_v1()
    policyCallback.version = dcgm_structs.make_dcgm_version(policyCallback, 1)
    logger.debug("Structure version: %d" % policyCallback.version)

    # Stamp every entry with the version under test
    for slot in range(count):
        c_policy_values[slot].version = versionTest

    ret = fn(dcgm_handle, group_id, count, c_policy_values, status_handle)
    dcgm_structs._dcgmCheckReturn(ret)
    return c_policy_values[:count]
Пример #7
0
def helper_test_dcgm_policy_inject_xiderror(handle, gpuIds):
    """
    Inject XID error 16 on the first GPU that reports XID support and verify
    the XID policy callback.

    handle -- DCGM handle, wrapped by pydcgm.DcgmHandle and used for queries/injection
    gpuIds -- candidate GPU ids, probed in order for XID field support
    """
    xidCondition = dcgm_structs.DCGM_POLICY_COND_XID
    injectedXid = 16

    # XID policy: parms[6] gets tag 0 and a boolean value of True
    policy = dcgm_structs.c_dcgmPolicy_v1()
    policy.version = dcgm_structs.dcgmPolicy_version1
    policy.condition = xidCondition
    policy.parms[6].tag = 0
    policy.parms[6].val.boolean = True

    dcgmHandle = pydcgm.DcgmHandle(handle)

    # Pick the first GPU whose latest XID value is not the NOT_SUPPORTED marker;
    # -1 means no such GPU was found
    validDeviceId = -1
    for candidate in gpuIds:
        latest = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, candidate, [dcgm_fields.DCGM_FI_DEV_XID_ERRORS])
        if latest[0].value.i64 == dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            continue
        validDeviceId = candidate
        break

    if validDeviceId == -1:
        test_utils.skip_test(
            "Can only run if at least one GPU that supports XID errors is present"
        )

    # Start from an empty group and add just the selected GPU
    group = pydcgm.DcgmGroup(
        dcgmHandle,
        groupName="test1",
        groupType=dcgm_structs.DCGM_GROUP_EMPTY,
    )
    group.AddGpu(validDeviceId)
    group.policy.Set(policy)

    # Callbacks are delivered through a queue so the test can block on them
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(xidCondition, finishCallback=c_callback)

    # Describe the injected sample: an INT64 XID error value
    sample = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    sample.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    sample.fieldId = dcgm_fields.DCGM_FI_DEV_XID_ERRORS
    sample.status = 0
    sample.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    # timestamp the sample 60 seconds ahead of now (scaled by 1e6)
    sample.ts = int((time.time() + 60) * 1000000.0)
    sample.value.i64 = injectedXid

    status = dcgm_agent_internal.dcgmInjectFieldValue(handle, validDeviceId,
                                                      sample)
    assert status == dcgm_structs.DCGM_ST_OK

    # Block until the policy manager calls back, or fail on timeout
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # Validate the condition and the XID number carried by the callback
    assert xidCondition == callbackData.condition, (
        "XID error callback was not for a XID error, got: %s" %
        callbackData.condition)
    assert injectedXid == callbackData.val.xid.errnum, (
        'Expected XID error 16 but got %s' % callbackData.val.xid.errnum)
Пример #8
0
    ret = dcgm_agent.dcgmStatusDestroy(status_handle)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to remove status handler, error: %s" % ret


def callback_function(data):
    """
    Record that the callback fired by setting the module-level callbackCalled flag.

    data -- callback payload; ignored by this function
    """
    global callbackCalled
    callbackCalled = True


# Wrap the Python callback with C_FUNC so it can be handed to the DCGM library
# (C_FUNC is defined outside this view - presumably a ctypes function prototype; confirm)
c_callback = C_FUNC(callback_function)

# Load the DCGM shared library before making any agent calls
dcgm_structs._LoadDcgmLibrary()

newPolicy = dcgm_structs.c_dcgmPolicy_v1()
handle = dcgm_agent.dcgmInit()

# Policy for the max-retired-pages condition, with a GPU-reset action and
# short validation; parms[2] gets tag 1 and llval 5
newPolicy.version = dcgm_structs.dcgmPolicy_version1
newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED
newPolicy.action = dcgm_structs.DCGM_POLICY_ACTION_GPURESET
newPolicy.validation = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
newPolicy.parms[2].tag = 1
newPolicy.parms[2].val.llval = 5

# find a GPU that supports retired pages (otherwise internal test will ignore it)
devices = dcgm_agent.dcgmGetAllDevices(handle)
# -1 is the initial "nothing selected yet" value for the device search
validDevice = -1
for x in devices:
    fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, x, [