def test_create_servers_with_specific_type(self):
    """Boot one server per vGPU type and verify pGPU placement by trait.

    Reconfigures the fake host so both mdev-capable pGPUs expose
    different types, tags each pGPU resource provider with a custom
    trait, then boots one server per trait and checks that the mdev
    allocated for each server lives on the expected parent pGPU.
    """
    # Regenerate the PCI addresses so both pGPUs now support nvidia-12
    connection = self.computes[
        self.compute1.host].driver._host.get_connection()
    connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
        num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
        multiple_gpu_types=True)
    # Make a restart to update the Resource Providers
    self.compute1 = self.restart_compute_service(self.compute1)
    pgpu1_rp_uuid = self._get_provider_uuid_by_name(
        self.compute1.host + '_' + fakelibvirt.PGPU1_PCI_ADDR)
    pgpu2_rp_uuid = self._get_provider_uuid_by_name(
        self.compute1.host + '_' + fakelibvirt.PGPU2_PCI_ADDR)
    # The two types expose different per-pGPU VGPU capacities (16 vs 8).
    pgpu1_inventory = self._get_provider_inventory(pgpu1_rp_uuid)
    self.assertEqual(16, pgpu1_inventory[orc.VGPU]['total'])
    pgpu2_inventory = self._get_provider_inventory(pgpu2_rp_uuid)
    self.assertEqual(8, pgpu2_inventory[orc.VGPU]['total'])
    # Attach traits to the pGPU RPs
    self._set_provider_traits(pgpu1_rp_uuid, ['CUSTOM_NVIDIA_11'])
    self._set_provider_traits(pgpu2_rp_uuid, ['CUSTOM_NVIDIA_12'])
    # Map each trait to the pGPU PCI address it should land on.
    expected = {'CUSTOM_NVIDIA_11': fakelibvirt.PGPU1_PCI_ADDR,
                'CUSTOM_NVIDIA_12': fakelibvirt.PGPU2_PCI_ADDR}
    for trait in expected.keys():
        # Add a trait to the flavor
        extra_spec = {"resources:VGPU": "1",
                      "trait:%s" % trait: "required"}
        flavor = self._create_flavor(extra_spec=extra_spec)
        # Use the new flavor for booting
        server = self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=flavor, host=self.compute1.host,
            expected_state='ACTIVE')
        # Get the instance we just created
        inst = objects.Instance.get_by_uuid(self.context, server['id'])
        # Get the mdevs that were allocated for this instance, we should
        # only have one
        mdevs = self.compute1.driver._get_all_assigned_mediated_devices(
            inst)
        self.assertEqual(1, len(mdevs))
        # It's a dict of mdev_uuid/instance_uuid pairs, we only care
        # about the keys
        mdevs = list(mdevs.keys())
        # Now get the detailed information about this single mdev
        mdev_info = self.compute1.driver._get_mediated_device_information(
            libvirt_utils.mdev_uuid2name(mdevs[0]))
        # We can be deterministic : since we asked for a specific type,
        # we know which pGPU we landed.
        self.assertEqual(expected[trait], mdev_info['parent'])
def _create_mdev(self, physical_device, mdev_type, uuid=None):
    """Fake the creation of an mdev on a given parent pGPU.

    Registers a new FakeMdevDevice in the persisted fake libvirt
    Connection so subsequent lookups of existing mdevs will see it.

    :param physical_device: PCI address of the parent pGPU
    :param mdev_type: the mdev type to give the fake device
    :param uuid: optional mdev UUID; generated when not provided
    :returns: the UUID of the (fake) mediated device
    """
    mdev_uuid = uuid or uuidutils.generate_uuid()
    device_name = libvirt_utils.mdev_uuid2name(mdev_uuid)
    parent_addr = self.pci2libvirt_address(physical_device)
    fake_device = fakelibvirt.FakeMdevDevice(
        dev_name=device_name, type_id=mdev_type, parent=parent_addr)
    self.fake_connection.mdev_info.devices.update(
        {device_name: fake_device})
    return mdev_uuid
def assert_vgpu_usage_for_compute(self, compute, expected):
    """Assert the compute hosts exactly ``expected`` assigned vGPUs.

    Checks both the number of mdevs assigned to instances on the
    compute and the total VGPU usage reported by placement across
    the parent pGPU resource providers.

    :param compute: the internal compute service object
    :param expected: the expected number of assigned mdevs/used VGPUs
    """
    # Per-parent-RP usage; keyed by RP name so each pGPU is counted
    # exactly once even when several mdevs share the same parent.
    total_usages = {}
    # We only want to get mdevs that are assigned to instances
    mdevs = compute.driver._get_all_assigned_mediated_devices()
    for mdev in mdevs:
        mdev_name = libvirt_utils.mdev_uuid2name(mdev)
        mdev_info = compute.driver._get_mediated_device_information(
            mdev_name)
        parent_name = mdev_info['parent']
        parent_rp_name = compute.host + '_' + parent_name
        parent_rp_uuid = self._get_provider_uuid_by_name(parent_rp_name)
        parent_usage = self._get_provider_usages(parent_rp_uuid)
        # BUGFIX: previously the parent's whole usage was added once per
        # mdev, double-counting when two mdevs share the same pGPU. Only
        # record each parent RP's usage the first time we see it.
        if orc.VGPU in parent_usage and parent_rp_name not in total_usages:
            total_usages[parent_rp_name] = parent_usage[orc.VGPU]
    self.assertEqual(expected, len(mdevs))
    self.assertEqual(expected, sum(total_usages.values()))
def assert_vgpu_usage_for_compute(self, compute, expected):
    """Assert the compute hosts exactly ``expected`` assigned vGPUs.

    Verifies both the count of instance-assigned mdevs on the compute
    and the aggregate VGPU usage reported by placement over the parent
    pGPU resource providers.

    :param compute: the internal compute service object
    :param expected: the expected number of assigned mdevs/used VGPUs
    """
    per_rp_usage = collections.defaultdict(int)
    # Only mdevs currently assigned to instances are of interest.
    assigned_mdevs = compute.driver._get_all_assigned_mediated_devices()
    for mdev_uuid in assigned_mdevs:
        device_name = libvirt_utils.mdev_uuid2name(mdev_uuid)
        device_info = compute.driver._get_mediated_device_information(
            device_name)
        rp_name = compute.host + '_' + device_info['parent']
        rp_uuid = self._get_provider_uuid_by_name(rp_name)
        rp_usage = self._get_provider_usages(rp_uuid)
        # Record each parent pGPU's usage at most once, even when
        # several mdevs share the same parent.
        if orc.VGPU in rp_usage and rp_name not in per_rp_usage:
            per_rp_usage[rp_name] = rp_usage[orc.VGPU]
    self.assertEqual(expected, len(assigned_mdevs))
    self.assertEqual(expected, sum(per_rp_usage.values()))
def _create_mdev(self, physical_device, mdev_type, uuid=None):
    """Fake the creation of an mdev on a given parent pGPU.

    Adds a new FakeMdevDevice to the fake libvirt Connection of the
    compute selected via ``self._current_host`` (set by the caller just
    before), so later mdev lookups on that host will see the device.

    :param physical_device: PCI address of the parent pGPU
    :param mdev_type: the mdev type to give the fake device
    :param uuid: optional mdev UUID; generated when not provided
    :returns: the UUID of the (fake) mediated device
    """
    mdev_uuid = uuid or uuidutils.generate_uuid()
    device_name = libvirt_utils.mdev_uuid2name(mdev_uuid)
    parent_addr = self.pci2libvirt_address(physical_device)
    # Pick the right compute's connection via the host the caller
    # selected just before this call.
    host_connection = self.computes[
        self._current_host].driver._host.get_connection()
    fake_device = fakelibvirt.FakeMdevDevice(
        dev_name=device_name, type_id=mdev_type, parent=parent_addr)
    host_connection.mdev_info.devices.update({device_name: fake_device})
    return mdev_uuid
def assert_mdev_usage(self, compute, expected_amount, instance=None,
                      expected_rc=orc.VGPU, expected_rp_name=None):
    """Verify the allocations for either a whole compute or just a
    specific instance.

    :param compute: the internal compute object
    :param expected_amount: the expected amount of allocations
    :param instance: if not None, a specific Instance to lookup instead
                     of the whole compute allocations.
    :param expected_rc: the expected resource class
    :param expected_rp_name: the expected resource provider name if an
                             instance is provided.
    """
    per_rp_usage = {}
    # Either all the mdevs on the compute, or just the ones assigned to
    # the given instance.
    assigned_mdevs = compute.driver._get_all_assigned_mediated_devices(
        instance)
    for mdev_uuid in assigned_mdevs:
        device_name = libvirt_utils.mdev_uuid2name(mdev_uuid)
        device_info = compute.driver._get_mediated_device_information(
            device_name)
        rp_name = compute.host + '_' + device_info['parent']
        rp_uuid = self._get_provider_uuid_by_name(rp_name)
        rp_usage = self._get_provider_usages(rp_uuid)
        # Count each parent RP's usage at most once, even when several
        # mdevs share the same parent device.
        if expected_rc in rp_usage and rp_name not in per_rp_usage:
            per_rp_usage[rp_name] = rp_usage[expected_rc]
        if expected_rp_name and instance is not None:
            # For a single instance, every mdev must sit on the same RP.
            self.assertEqual(expected_rp_name, rp_name)
    self.assertEqual(expected_amount, len(assigned_mdevs))
    self.assertEqual(expected_amount, sum(per_rp_usage.values()))
def mdev_uuid2name(mdev_uuid):
    """Return the libvirt mdev device name for the given mdev UUID."""
    device_name = libvirt_utils.mdev_uuid2name(mdev_uuid)
    return device_name
def mdev_uuid2name(mdev_uuid):
    """Translate an mdev UUID into its libvirt device name.

    Thin delegation to the libvirt utils helper of the same name.
    """
    return libvirt_utils.mdev_uuid2name(mdev_uuid)
def test_create_servers_with_vgpu(self):
    """Verify that vgpu reshape works with libvirt driver

    1) create two servers with an old tree where the VGPU resource is
       on the compute provider
    2) trigger a reshape
    3) check that the allocations of the servers are still valid
    4) create another server now against the new tree
    """
    # NOTE(gibi): We cannot simply ask the virt driver to create an old
    # RP tree with vgpu on the root RP as that code path does not exist
    # any more. So we have to hack a "bit". We will create a compute
    # service without vgpu support to have the compute RP ready then we
    # manually add the VGPU resources to that RP in placement. Also we
    # make sure that during the instance claim the virt driver does not
    # detect the old tree as that would be a bad time for reshape. Later
    # when the compute service is restarted the driver will do the
    # reshape.
    fake_connection = self._get_connection(
        # We need more RAM or the 3rd server won't be created
        host_info=fakelibvirt.HostInfo(kB_mem=8192),
        libvirt_version=self.MIN_LIBVIRT_MDEV_SUPPORT,
        mdev_info=fakelibvirt.HostMdevDevicesInfo())
    self.mock_conn.return_value = fake_connection
    # start a compute with vgpu support disabled so the driver will
    # ignore the content of the above HostMdevDeviceInfo
    self.flags(enabled_vgpu_types='', group='devices')
    self.compute = self.start_service('compute', host='compute1')
    # create the VGPU resource in placement manually
    compute_rp_uuid = self.placement_api.get(
        '/resource_providers?name=compute1'
    ).body['resource_providers'][0]['uuid']
    inventories = self.placement_api.get(
        '/resource_providers/%s/inventories' % compute_rp_uuid).body
    inventories['inventories']['VGPU'] = {
        'allocation_ratio': 1.0,
        'max_unit': 3,
        'min_unit': 1,
        'reserved': 0,
        'step_size': 1,
        'total': 3
    }
    self.placement_api.put(
        '/resource_providers/%s/inventories' % compute_rp_uuid,
        inventories)
    # now we boot two servers with vgpu
    extra_spec = {"resources:VGPU": 1}
    flavor_id = self._create_flavor(extra_spec=extra_spec)
    server_req = self._build_server(flavor_id)
    # NOTE(gibi): during instance_claim() there is a
    # driver.update_provider_tree() call that would detect the old tree
    # and would fail as this is not a good time to reshape. To avoid
    # that we temporarily mock update_provider_tree here.
    with mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
                    'update_provider_tree'):
        created_server1 = self.api.post_server({'server': server_req})
        server1 = self._wait_for_state_change(created_server1, 'ACTIVE')
        created_server2 = self.api.post_server({'server': server_req})
        server2 = self._wait_for_state_change(created_server2, 'ACTIVE')
    # Determine which device is associated with which instance
    # { inst.uuid: pgpu_name }
    inst_to_pgpu = {}
    ctx = context.get_admin_context()
    for server in (server1, server2):
        inst = objects.Instance.get_by_uuid(ctx, server['id'])
        mdevs = list(
            self.compute.driver._get_all_assigned_mediated_devices(inst))
        self.assertEqual(1, len(mdevs))
        mdev_uuid = mdevs[0]
        mdev_info = self.compute.driver._get_mediated_device_information(
            utils.mdev_uuid2name(mdev_uuid))
        inst_to_pgpu[inst.uuid] = mdev_info['parent']
    # The VGPUs should have come from different pGPUs
    self.assertNotEqual(*list(inst_to_pgpu.values()))
    # verify that the inventory, usages and allocation are correct
    # before the reshape
    compute_inventory = self.placement_api.get(
        '/resource_providers/%s/inventories' %
        compute_rp_uuid).body['inventories']
    self.assertEqual(3, compute_inventory['VGPU']['total'])
    compute_usages = self.placement_api.get(
        '/resource_providers/%s/usages' % compute_rp_uuid).body['usages']
    self.assertEqual(2, compute_usages['VGPU'])
    for server in (server1, server2):
        allocations = self.placement_api.get(
            '/allocations/%s' % server['id']).body['allocations']
        # the flavor has disk=10 and ephemeral=10
        self.assertEqual(
            {
                'DISK_GB': 20,
                'MEMORY_MB': 2048,
                'VCPU': 2,
                'VGPU': 1
            }, allocations[compute_rp_uuid]['resources'])
    # enabled vgpu support
    self.flags(enabled_vgpu_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
               group='devices')
    # restart compute which will trigger a reshape
    self.restart_compute_service(self.compute)
    # verify that the inventory, usages and allocation are correct
    # after the reshape
    compute_inventory = self.placement_api.get(
        '/resource_providers/%s/inventories' %
        compute_rp_uuid).body['inventories']
    self.assertNotIn('VGPU', compute_inventory)
    # NOTE(sbauza): The two instances will use two different pGPUs
    # That said, we need to check all the pGPU inventories for knowing
    # which ones are used.
    usages = {}
    pgpu_uuid_to_name = {}
    for pci_device in [
        fakelibvirt.PGPU1_PCI_ADDR, fakelibvirt.PGPU2_PCI_ADDR,
        fakelibvirt.PGPU3_PCI_ADDR
    ]:
        gpu_rp_uuid = self.placement_api.get(
            '/resource_providers?name=compute1_%s' %
            pci_device).body['resource_providers'][0]['uuid']
        pgpu_uuid_to_name[gpu_rp_uuid] = pci_device
        gpu_inventory = self.placement_api.get(
            '/resource_providers/%s/inventories' %
            gpu_rp_uuid).body['inventories']
        self.assertEqual(1, gpu_inventory['VGPU']['total'])
        gpu_usages = self.placement_api.get(
            '/resource_providers/%s/usages' % gpu_rp_uuid).body['usages']
        usages[pci_device] = gpu_usages['VGPU']
    # Make sure that both instances are using different pGPUs
    used_devices = [dev for dev, usage in usages.items() if usage == 1]
    avail_devices = list(set(usages.keys()) - set(used_devices))
    self.assertEqual(2, len(used_devices))
    # Make sure that both instances are using the correct pGPUs
    for server in [server1, server2]:
        allocations = self.placement_api.get(
            '/allocations/%s' % server['id']).body['allocations']
        self.assertEqual({
            'DISK_GB': 20,
            'MEMORY_MB': 2048,
            'VCPU': 2
        }, allocations[compute_rp_uuid]['resources'])
        rp_uuids = list(allocations.keys())
        # We only have two RPs, the compute RP (the root) and the child
        # pGPU RP
        gpu_rp_uuid = (rp_uuids[1] if rp_uuids[0] == compute_rp_uuid
                       else rp_uuids[0])
        self.assertEqual({'VGPU': 1},
                         allocations[gpu_rp_uuid]['resources'])
        # The pGPU's RP name contains the pGPU name
        self.assertIn(inst_to_pgpu[server['id']],
                      pgpu_uuid_to_name[gpu_rp_uuid])
    # now create one more instance with vgpu against the reshaped tree
    created_server = self.api.post_server({'server': server_req})
    server3 = self._wait_for_state_change(created_server, 'ACTIVE')
    # find the pGPU that wasn't used before we created the third
    # instance. It should have taken the previously available pGPU
    device = avail_devices[0]
    gpu_rp_uuid = self.placement_api.get(
        '/resource_providers?name=compute1_%s' %
        device).body['resource_providers'][0]['uuid']
    gpu_usages = self.placement_api.get(
        '/resource_providers/%s/usages' % gpu_rp_uuid).body['usages']
    self.assertEqual(1, gpu_usages['VGPU'])
    allocations = self.placement_api.get(
        '/allocations/%s' % server3['id']).body['allocations']
    self.assertEqual({
        'DISK_GB': 20,
        'MEMORY_MB': 2048,
        'VCPU': 2
    }, allocations[compute_rp_uuid]['resources'])
    self.assertEqual({'VGPU': 1}, allocations[gpu_rp_uuid]['resources'])
def test_create_servers_with_vgpu(self):
    """Verify that vgpu reshape works with libvirt driver

    1) create two servers with an old tree where the VGPU resource is
       on the compute provider
    2) trigger a reshape
    3) check that the allocations of the servers are still valid
    4) create another server now against the new tree
    """
    # NOTE(gibi): We cannot simply ask the virt driver to create an old
    # RP tree with vgpu on the root RP as that code path does not exist
    # any more. So we have to hack a "bit". We will create a compute
    # service without vgpu support to have the compute RP ready then we
    # manually add the VGPU resources to that RP in placement. Also we
    # make sure that during the instance claim the virt driver does not
    # detect the old tree as that would be a bad time for reshape. Later
    # when the compute service is restarted the driver will do the
    # reshape.
    fake_connection = self._get_connection(
        # We need more RAM or the 3rd server won't be created
        host_info=fakelibvirt.HostInfo(kB_mem=8192),
        libvirt_version=self.MIN_LIBVIRT_MDEV_SUPPORT,
        mdev_info=fakelibvirt.HostMdevDevicesInfo())
    self.mock_conn.return_value = fake_connection
    # start a compute with vgpu support disabled so the driver will
    # ignore the content of the above HostMdevDeviceInfo
    self.flags(enabled_vgpu_types='', group='devices')
    self.compute = self.start_service('compute', host='compute1')
    # create the VGPU resource in placement manually
    compute_rp_uuid = self.placement_api.get(
        '/resource_providers?name=compute1').body[
        'resource_providers'][0]['uuid']
    inventories = self.placement_api.get(
        '/resource_providers/%s/inventories' % compute_rp_uuid).body
    inventories['inventories']['VGPU'] = {
        'allocation_ratio': 1.0,
        'max_unit': 3,
        'min_unit': 1,
        'reserved': 0,
        'step_size': 1,
        'total': 3}
    self.placement_api.put(
        '/resource_providers/%s/inventories' % compute_rp_uuid,
        inventories)
    # now we boot two servers with vgpu
    extra_spec = {"resources:VGPU": 1}
    flavor_id = self._create_flavor(extra_spec=extra_spec)
    server_req = self._build_server(flavor_id)
    # NOTE(gibi): during instance_claim() there is a
    # driver.update_provider_tree() call that would detect the old tree
    # and would fail as this is not a good time to reshape. To avoid
    # that we temporarily mock update_provider_tree here.
    with mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
                    'update_provider_tree'):
        created_server1 = self.api.post_server({'server': server_req})
        server1 = self._wait_for_state_change(created_server1, 'ACTIVE')
        created_server2 = self.api.post_server({'server': server_req})
        server2 = self._wait_for_state_change(created_server2, 'ACTIVE')
    # Determine which device is associated with which instance
    # { inst.uuid: pgpu_name }
    inst_to_pgpu = {}
    ctx = context.get_admin_context()
    for server in (server1, server2):
        inst = objects.Instance.get_by_uuid(ctx, server['id'])
        mdevs = list(
            self.compute.driver._get_all_assigned_mediated_devices(inst))
        self.assertEqual(1, len(mdevs))
        mdev_uuid = mdevs[0]
        mdev_info = self.compute.driver._get_mediated_device_information(
            utils.mdev_uuid2name(mdev_uuid))
        inst_to_pgpu[inst.uuid] = mdev_info['parent']
    # The VGPUs should have come from different pGPUs
    self.assertNotEqual(*list(inst_to_pgpu.values()))
    # verify that the inventory, usages and allocation are correct
    # before the reshape
    compute_inventory = self.placement_api.get(
        '/resource_providers/%s/inventories' % compute_rp_uuid).body[
        'inventories']
    self.assertEqual(3, compute_inventory['VGPU']['total'])
    compute_usages = self.placement_api.get(
        '/resource_providers/%s/usages' % compute_rp_uuid).body[
        'usages']
    self.assertEqual(2, compute_usages['VGPU'])
    for server in (server1, server2):
        allocations = self.placement_api.get(
            '/allocations/%s' % server['id']).body['allocations']
        # the flavor has disk=10 and ephemeral=10
        self.assertEqual(
            {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2, 'VGPU': 1},
            allocations[compute_rp_uuid]['resources'])
    # enabled vgpu support
    self.flags(
        enabled_vgpu_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
        group='devices')
    # restart compute which will trigger a reshape
    self.restart_compute_service(self.compute)
    # verify that the inventory, usages and allocation are correct
    # after the reshape
    compute_inventory = self.placement_api.get(
        '/resource_providers/%s/inventories' % compute_rp_uuid).body[
        'inventories']
    self.assertNotIn('VGPU', compute_inventory)
    # NOTE(sbauza): The two instances will use two different pGPUs
    # That said, we need to check all the pGPU inventories for knowing
    # which ones are used.
    usages = {}
    pgpu_uuid_to_name = {}
    for pci_device in [fakelibvirt.PGPU1_PCI_ADDR,
                       fakelibvirt.PGPU2_PCI_ADDR,
                       fakelibvirt.PGPU3_PCI_ADDR]:
        gpu_rp_uuid = self.placement_api.get(
            '/resource_providers?name=compute1_%s' % pci_device).body[
            'resource_providers'][0]['uuid']
        pgpu_uuid_to_name[gpu_rp_uuid] = pci_device
        gpu_inventory = self.placement_api.get(
            '/resource_providers/%s/inventories' % gpu_rp_uuid).body[
            'inventories']
        self.assertEqual(1, gpu_inventory['VGPU']['total'])
        gpu_usages = self.placement_api.get(
            '/resource_providers/%s/usages' % gpu_rp_uuid).body[
            'usages']
        usages[pci_device] = gpu_usages['VGPU']
    # Make sure that both instances are using different pGPUs
    used_devices = [dev for dev, usage in usages.items() if usage == 1]
    avail_devices = list(set(usages.keys()) - set(used_devices))
    self.assertEqual(2, len(used_devices))
    # Make sure that both instances are using the correct pGPUs
    for server in [server1, server2]:
        allocations = self.placement_api.get(
            '/allocations/%s' % server['id']).body[
            'allocations']
        self.assertEqual(
            {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2},
            allocations[compute_rp_uuid]['resources'])
        rp_uuids = list(allocations.keys())
        # We only have two RPs, the compute RP (the root) and the child
        # pGPU RP
        gpu_rp_uuid = (rp_uuids[1] if rp_uuids[0] == compute_rp_uuid
                       else rp_uuids[0])
        self.assertEqual(
            {'VGPU': 1}, allocations[gpu_rp_uuid]['resources'])
        # The pGPU's RP name contains the pGPU name
        self.assertIn(inst_to_pgpu[server['id']],
                      pgpu_uuid_to_name[gpu_rp_uuid])
    # now create one more instance with vgpu against the reshaped tree
    created_server = self.api.post_server({'server': server_req})
    server3 = self._wait_for_state_change(created_server, 'ACTIVE')
    # find the pGPU that wasn't used before we created the third
    # instance. It should have taken the previously available pGPU
    device = avail_devices[0]
    gpu_rp_uuid = self.placement_api.get(
        '/resource_providers?name=compute1_%s' % device).body[
        'resource_providers'][0]['uuid']
    gpu_usages = self.placement_api.get(
        '/resource_providers/%s/usages' % gpu_rp_uuid).body[
        'usages']
    self.assertEqual(1, gpu_usages['VGPU'])
    allocations = self.placement_api.get(
        '/allocations/%s' % server3['id']).body[
        'allocations']
    self.assertEqual(
        {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2},
        allocations[compute_rp_uuid]['resources'])
    self.assertEqual(
        {'VGPU': 1}, allocations[gpu_rp_uuid]['resources'])