Пример #1
0
def helper_test_mig_cuda_visible_devices_string(handle, gpuIds):
    """Check CUDA_VISIBLE_DEVICES strings for MIG entities under the GPUs in gpuIds.

    Walks the MIG hierarchy; for every GPU instance whose parent GPU is in
    gpuIds (and for the compute instances listed after it), asserts that the
    CUDA_VISIBLE_DEVICES string starts with 'MIG-', contains at least one '/',
    and — for compute instances — a second '/'.
    """
    hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
    gpuPartOfTest = False

    for i in range(0, hierarchy.count):
        entity = hierarchy.entityList[i]

        isInstance = False
        if entity.entity.entityGroupId == dcgm_fields.DCGM_FE_GPU_I:
            # Remember whether this instance's parent GPU is under test; the
            # compute instances that follow it in the list inherit this flag.
            gpuPartOfTest = entity.parent.entityId in gpuIds
            isInstance = True

        if gpuPartOfTest:
            cuda_vis = test_utils.get_cuda_visible_devices_str(
                handle, entity.entity.entityGroupId, entity.entity.entityId)
            # Bug fix: the original message was missing the closing quote
            # after %s, producing a malformed assertion message.
            assert cuda_vis[:4] == 'MIG-', \
                "Expected the CUDA_VISIBLE_DEVICES string to start with 'MIG-', but found '%s'" % (
                    cuda_vis)
            firstSlashIndex = cuda_vis.find('/')
            assert firstSlashIndex != -1, "Expected to find '/' in CUDA_VISIBLE_DEVICES, but didn't: '%s'" % (
                cuda_vis)
            if not isInstance:
                # Compute instances carry a second '/' separator.
                secondSlashIndex = cuda_vis.find('/', firstSlashIndex + 1)
                assert secondSlashIndex != -1, "Expected to find two '/' marks in CUDA_VISIBLE_DEVICES, but didn't: '%s'" % (
                    cuda_vis)
Пример #2
0
def delete_gpu_instances_and_verify(handle, newGpuInstances):
    """Request deletion of the given GPU instances and poll until they vanish.

    Returns an empty string on success, or an error message naming the GPU
    instances that were still present after the retry budget was exhausted.
    """
    delete_gpu_instances(handle, newGpuInstances,
                         dcgm_structs.DCGM_MIG_RECONFIG_DELAY_PROCESSING)

    errMsg = ''
    remaining = newGpuInstances
    for _attempt in range(20):
        hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
        gpuInstances, _unusedCIs = populate_counts_per_gpu(hierarchy)
        remaining = verify_entries_are_deleted(remaining, gpuInstances)
        if not remaining:
            errMsg = ''
            break

        errMsg = "GPU instances '"
        for instanceId in remaining:
            errMsg = "%s %s" % (errMsg, instanceId)
        errMsg = "%s' were not deleted successfully" % errMsg

        time.sleep(1)

    return errMsg
Пример #3
0
def delete_compute_instances_and_verify(handle, newComputeInstances):
    """Delete the given compute instances and poll until they disappear.

    All but the last deletion pass DCGM_MIG_RECONFIG_DELAY_PROCESSING so the
    hostengine batches the reconfiguration; the final one is sent with no
    flags to trigger processing. Returns '' on success, or an error message
    listing the compute instances still present after ~20 retries.
    """
    # Robustness fix: an empty list would raise IndexError on
    # newComputeInstances[-1] below; there is nothing to delete.
    if not newComputeInstances:
        return ''

    errMsg = ''
    flags = dcgm_structs.DCGM_MIG_RECONFIG_DELAY_PROCESSING
    # Delete the new instances
    for ciId in newComputeInstances[:-1]:
        dcgm_agent.dcgmDeleteMigEntity(handle, dcgm_fields.DCGM_FE_GPU_CI,
                                       ciId, flags)

    # don't block processing the reconfigure with the last one
    dcgm_agent.dcgmDeleteMigEntity(handle, dcgm_fields.DCGM_FE_GPU_CI,
                                   newComputeInstances[-1], 0)

    # verify that the compute instances disappear
    retries = 20
    cisStillHere = newComputeInstances
    while retries > 0:
        hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
        gpuInstances, gpuCIIds = populate_counts_per_gpu(hierarchy)
        retries = retries - 1
        updated = verify_entries_are_deleted(cisStillHere, gpuCIIds)
        if len(updated) == 0:
            errMsg = ''
            break
        else:
            errMsg = "Compute instances '"
            for item in updated:
                errMsg = "%s %s" % (errMsg, item)
            errMsg = "%s' were not deleted successfully" % errMsg
            cisStillHere = updated

        time.sleep(1)

    return errMsg
Пример #4
0
def create_mig_entities_and_verify(handle, gpuIds, instanceCreateCount,
                                   minInstanceCreateCount):
    """Create GPU instances, then one compute instance on each new instance.

    Attempts to create instanceCreateCount GPU instances across the GPUs in
    gpuIds and skips the test if fewer than minInstanceCreateCount could be
    created. Returns (newGpuInstances, newComputeInstances) once DCGM reports
    them; cleans up and skips/asserts on failure.
    """
    # get mig hierarchy
    hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
    oGpuInstances, oGpuCIIds = populate_counts_per_gpu(hierarchy)

    # Bug fix: honor the requested count instead of a hard-coded 3 — the
    # instanceCreateCount parameter was previously ignored.
    numInstancesCreated = create_small_mig_objects(handle, gpuIds,
                                                   instanceCreateCount)
    if numInstancesCreated < minInstanceCreateCount:
        test_utils.skip_test("Cannot create any GPU instances, skipping test.")

    # Make sure the new instances appear
    newGpuInstances, newComputeInstances, errMsg = verifyMigUpdates(
        handle, oGpuInstances, oGpuCIIds, numInstancesCreated, 0)
    assert errMsg == '', errMsg

    # Create new compute instances
    flags = dcgm_structs.DCGM_MIG_RECONFIG_DELAY_PROCESSING
    numCIsCreated = 0
    try:
        for instanceId in newGpuInstances[:-1]:
            dcgm_agent.dcgmCreateMigEntity(handle, instanceId, dcgm_structs.DcgmMigProfileComputeInstanceSlice1, \
                   dcgm_structs.DcgmMigCreateComputeInstance, flags)
            numCIsCreated = numCIsCreated + 1

        # For the last one, send a flag to ask hostengine to process the reconfiguring
        dcgm_agent.dcgmCreateMigEntity(handle, newGpuInstances[-1], dcgm_structs.DcgmMigProfileComputeInstanceSlice1, \
               dcgm_structs.DcgmMigCreateComputeInstance, 0)
        numCIsCreated = numCIsCreated + 1
    except dcgm_structs.dcgmExceptionClass(
            dcgm_structs.DCGM_ST_INSUFFICIENT_RESOURCES) as e:
        # Clean up the instances we made before skipping.
        delete_gpu_instances_no_fail(handle, newGpuInstances, flags)
        test_utils.skip_test("Insufficient resources to run this test")

    # Verify the new compute instances have appeared
    newGpuInstances, newComputeInstances, errMsg = verifyMigUpdates(
        handle, oGpuInstances, oGpuCIIds, numInstancesCreated, numCIsCreated)
    if errMsg != '':
        delete_gpu_instances_no_fail(handle, newGpuInstances, flags)

    assert errMsg == '', errMsg

    return newGpuInstances, newComputeInstances
Пример #5
0
def ensure_instance_ids(handle, gpuId, minInstances, minCIs):
    """Ensure gpuId has at least minInstances GPU instances and minCIs compute
    instances, creating fake entities to cover any shortfall.

    Returns (instanceMap, ciMap): GPU-instance id -> parent GPU id, and
    compute-instance id -> parent GPU-instance id.
    """
    instanceMap = {}
    ciMap = {}
    legalInstances = []

    onTargetGpu = False
    hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
    for idx in range(hierarchy.count):
        node = hierarchy.entityList[idx]
        groupId = node.entity.entityGroupId
        if groupId == dcgm_fields.DCGM_FE_GPU_I:
            # Track whether subsequent compute instances belong to our GPU.
            onTargetGpu = node.parent.entityId == gpuId
            if onTargetGpu:
                instanceMap[node.entity.entityId] = node.parent.entityId
        elif groupId == dcgm_fields.DCGM_FE_GPU_CI and onTargetGpu:
            ciMap[node.entity.entityId] = node.parent.entityId
            legalInstances.append(node.parent.entityId)

    if hierarchy.count == 0:
        logger.info("There were no MIG instances configured on this host")

    instancesNeeded = minInstances - len(instanceMap)
    cisNeeded = minCIs - len(ciMap)

    fakeInstanceMap = create_fake_gpu_instances(handle, gpuId, instancesNeeded)
    legalInstances.extend(fakeInstanceMap)
    instanceMap.update(fakeInstanceMap)

    fakeCIMap = create_fake_compute_instances(handle, legalInstances,
                                              cisNeeded)
    ciMap.update(fakeCIMap)

    return instanceMap, ciMap
Пример #6
0
def helper_test_mig_reconfigure(handle, gpuIds):
    """End-to-end MIG reconfiguration test.

    Creates GPU instances on the GPUs in gpuIds, creates one compute instance
    per new GPU instance, verifies they appear, then deletes everything and
    verifies the entities disappear. Asserts (with a descriptive message) if
    any step fails; skips the test when resources are unavailable.
    """
    # get mig hierarchy
    hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
    oGpuInstances, oGpuCIIds = populate_counts_per_gpu(hierarchy)

    numInstancesCreated = create_small_mig_objects(handle, gpuIds)
    if numInstancesCreated == 0:
        test_utils.skip_test("Cannot create any GPU instances, skipping test.")

    # Make sure the new instances appear
    newGpuInstances, newComputeInstances, errMsg = verifyMigUpdates(
        handle, oGpuInstances, oGpuCIIds, numInstancesCreated, 0)
    assert errMsg == '', errMsg

    # Create new compute instances
    flags = dcgm_structs.DCGM_MIG_RECONFIG_DELAY_PROCESSING
    numCIsCreated = 0
    try:
        for instanceId in newGpuInstances[:-1]:
            dcgm_agent.dcgmCreateMigEntity(handle, instanceId, dcgm_structs.DcgmMigProfileComputeInstanceSlice1, \
                   dcgm_structs.DcgmMigCreateComputeInstance, flags)
            numCIsCreated = numCIsCreated + 1

        # For the last one, send a flag to ask hostengine to process the reconfiguring
        dcgm_agent.dcgmCreateMigEntity(handle, newGpuInstances[-1], dcgm_structs.DcgmMigProfileComputeInstanceSlice1, \
               dcgm_structs.DcgmMigCreateComputeInstance, 0)
        numCIsCreated = numCIsCreated + 1
    except dcgm_structs.dcgmExceptionClass(
            dcgm_structs.DCGM_ST_INSUFFICIENT_RESOURCES) as e:
        delete_gpu_instances_no_fail(handle, newGpuInstances, flags)
        test_utils.skip_test("Insufficient resources to run this test")

    # Verify the new compute instances have appeared
    newGpuInstances, newComputeInstances, errMsg = verifyMigUpdates(
        handle, oGpuInstances, oGpuCIIds, numInstancesCreated, numCIsCreated)
    if errMsg != '':
        delete_gpu_instances_no_fail(handle, newGpuInstances, flags)

    assert errMsg == '', errMsg

    # Delete the new instances
    for ciId in newComputeInstances[:-1]:
        dcgm_agent.dcgmDeleteMigEntity(handle, dcgm_fields.DCGM_FE_GPU_CI,
                                       ciId, flags)

    # don't block processing the reconfigure with the last one
    dcgm_agent.dcgmDeleteMigEntity(handle, dcgm_fields.DCGM_FE_GPU_CI,
                                   newComputeInstances[-1], 0)

    # verify that the compute instances disappear
    retries = 20
    cisStillHere = newComputeInstances
    while retries > 0:
        hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
        gpuInstances, gpuCIIds = populate_counts_per_gpu(hierarchy)
        retries = retries - 1
        updated = verify_entries_are_deleted(cisStillHere, gpuCIIds)
        if len(updated) == 0:
            errMsg = ''
            break
        else:
            errMsg = "Compute instances '"
            for item in updated:
                errMsg = "%s %s" % (errMsg, item)
            # Bug fix: the '% errMsg' operand was missing, so the message was
            # the literal "%s' ..." and dropped the accumulated entity list.
            errMsg = "%s' were not deleted successfully" % errMsg
            cisStillHere = updated

        time.sleep(1)

    # Save this and attempt to cleanup the rest even though we failed here
    ciFailMsg = ''
    if errMsg != '':
        ciFailMsg = errMsg
        logger.warning(
            "The compute instances didn't clean up correctly, but we'll attempt to clean up anyway"
        )

    delete_gpu_instances(handle, newGpuInstances, flags)

    retries = 20
    gpuInstancesStillHere = newGpuInstances
    while retries > 0:
        hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
        gpuInstances, gpuCIIds = populate_counts_per_gpu(hierarchy)
        retries = retries - 1
        updated = verify_entries_are_deleted(gpuInstancesStillHere,
                                             gpuInstances)
        if len(updated) == 0:
            errMsg = ''
            break
        else:
            errMsg = "GPU instances '"
            for item in updated:
                errMsg = "%s %s" % (errMsg, item)
            # Bug fix: same missing '% errMsg' operand as above.
            errMsg = "%s' were not deleted successfully" % errMsg
            gpuInstancesStillHere = updated

        time.sleep(1)

    assert ciFailMsg == '', ciFailMsg
    assert errMsg == '', errMsg
Пример #7
0
def verifyMigUpdates(handle,
                     oGpuInstances,
                     oGpuCIIds,
                     numInstancesCreated,
                     numCIsCreated,
                     retries=19):
    """Poll the MIG hierarchy until the expected new entities show up.

    Compares the current hierarchy against the baseline maps oGpuInstances
    and oGpuCIIds (GPU id -> list of entity ids) and collects ids absent from
    the baseline. Returns (newGpuInstances, newComputeInstances, errMsg);
    errMsg is '' on success.
    """

    def diff_against_baseline(current, baseline):
        # Every id in `current` that the baseline map doesn't already contain.
        discovered = []
        for gpu, entityIds in current.items():
            known = baseline.get(gpu, [])
            discovered.extend(eid for eid in entityIds if eid not in known)
        return discovered

    newGpuInstances = []
    newComputeInstances = []

    if numInstancesCreated == 0 and numCIsCreated == 0:
        return newGpuInstances, newComputeInstances, ''

    errMsg = ''
    while retries >= 0:
        hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
        gpuInstances, gpuCIIds = populate_counts_per_gpu(hierarchy)

        newGpuInstances = diff_against_baseline(gpuInstances, oGpuInstances)
        newComputeInstances = diff_against_baseline(gpuCIIds, oGpuCIIds)

        enoughInstances = len(newGpuInstances) >= numInstancesCreated
        enoughCIs = len(newComputeInstances) >= numCIsCreated

        if enoughInstances and enoughCIs:
            errMsg = ''
            break

        if not enoughInstances and not enoughCIs:
            errMsg = 'Expected %d new GPU instances and %d new compute instances but only found %d and %d' % \
                     (numInstancesCreated, numCIsCreated, len(newGpuInstances), len(newComputeInstances))
        elif not enoughInstances:
            errMsg = "Expected %d new GPU instances but only found %d" % (
                numInstancesCreated, len(newGpuInstances))
        else:
            errMsg = "Expected %d new compute instances but only found %d" % (
                numCIsCreated, len(newComputeInstances))

        retries -= 1
        time.sleep(1)

    return newGpuInstances, newComputeInstances, errMsg
Пример #8
0
def ensure_instance_ids(handle, gpuId, minInstances, minCIs):
    """Ensure gpuId has at least minInstances GPU instances and minCIs compute
    instances, creating fake entities via the internal API for any shortfall.

    Returns (instanceMap, ciMap): GPU-instance id -> parent GPU id, and
    compute-instance id -> parent GPU-instance id.
    """
    hierarchy = dcgm_agent.dcgmGetGpuInstanceHierarchy(handle)
    legalGpu = False
    instanceMap = {}
    ciMap = {}
    legalInstances = []

    for i in range(0, hierarchy.count):
        entity = hierarchy.entityList[i]
        if entity.entity.entityGroupId == dcgm_fields.DCGM_FE_GPU_I:
            if entity.parent.entityId == gpuId:
                legalGpu = True
                instanceMap[entity.entity.entityId] = entity.parent.entityId
            else:
                legalGpu = False
        elif entity.entity.entityGroupId == dcgm_fields.DCGM_FE_GPU_CI and legalGpu:
            ciMap[entity.entity.entityId] = entity.parent.entityId
            legalInstances.append(entity.parent.entityId)

    instancesNeeded = minInstances - len(instanceMap)
    cisNeeded = minCIs - len(ciMap)

    cfe = dcgm_structs_internal.c_dcgmCreateFakeEntities_v2()
    cfe.numToCreate = 0

    if instancesNeeded > 0:
        for i in range(0, instancesNeeded):
            cfe.entityList[
                cfe.numToCreate].parent.entityGroupId = dcgm_fields.DCGM_FE_GPU
            cfe.entityList[cfe.numToCreate].parent.entityId = gpuId
            cfe.entityList[
                cfe.
                numToCreate].entity.entityGroupId = dcgm_fields.DCGM_FE_GPU_I
            cfe.numToCreate += 1

        # Create the instances first so we can control which GPU the compute instances are placed on
        updated = dcgm_agent_internal.dcgmCreateFakeEntities(handle, cfe)
        for i in range(0, updated.numToCreate):
            if updated.entityList[
                    i].entity.entityGroupId == dcgm_fields.DCGM_FE_GPU_I:
                instanceMap[updated.entityList[i].entity.
                            entityId] = updated.entityList[i].parent.entityId
                legalInstances.append(updated.entityList[i].entity.entityId)

    if cisNeeded > 0:
        cfe = dcgm_structs_internal.c_dcgmCreateFakeEntities_v2()
        for i in range(0, cisNeeded):
            cfe.entityList[
                cfe.
                numToCreate].parent.entityGroupId = dcgm_fields.DCGM_FE_GPU_I
            instanceIndex = i
            # Bug fix: was '>', which let instanceIndex == len(legalInstances)
            # through and raised IndexError on the lookup below.
            if instanceIndex >= len(legalInstances):
                instanceIndex = 0
            cfe.entityList[cfe.numToCreate].parent.entityId = legalInstances[
                instanceIndex]
            cfe.entityList[
                cfe.
                numToCreate].entity.entityGroupId = dcgm_fields.DCGM_FE_GPU_CI
            cfe.numToCreate += 1

        updated = dcgm_agent_internal.dcgmCreateFakeEntities(handle, cfe)
        for i in range(0, updated.numToCreate):
            if updated.entityList[
                    i].entity.entityGroupId == dcgm_fields.DCGM_FE_GPU_CI:
                ciMap[updated.entityList[i].entity.
                      entityId] = updated.entityList[i].parent.entityId

    return instanceMap, ciMap