def test_allocate(self): regions = self.gpu.regions # More than one region self.assertGreater(len(regions), 0) # Find kernel argument regions kernarg_regions = list() for r in regions: if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG): kernarg_regions.append(r) self.assertGreater(len(kernarg_regions), 0) # Test allocating at the kernel argument region kernarg_region = kernarg_regions[0] nelem = 10 ptr = kernarg_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem) self.assertNotEqual(ctypes.addressof(ptr), 0, "pointer must not be NULL") # Test writing to it src = np.random.random(nelem).astype(np.float32) ctypes.memmove(ptr, src.ctypes.data, src.nbytes) ref = (ctypes.c_float * nelem).from_address(ptr.value) for i in range(src.size): self.assertEqual(ref[i], src[i]) roc.hsa_memory_free(ptr)
def test_coarse_grained_kernel_execution(self): """ This tests the execution of a kernel on a dGPU using coarse memory regions for the buffers. NOTE: the code violates the HSA spec in that it uses a coarse region for kernargs, this is a performance hack. """ from numba.roc.hsadrv.driver import BrigModule, Program, hsa,\ Executable # get a brig file brig_file = get_brig_file() brig_module = BrigModule.from_file(brig_file) self.assertGreater(len(brig_module), 0) # use existing GPU regions for computation space gpu_regions = self.gpu.regions gpu_only_coarse_regions = list() gpu_host_accessible_coarse_regions = list() for r in gpu_regions: if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED): if r.host_accessible: gpu_host_accessible_coarse_regions.append(r) else: gpu_only_coarse_regions.append(r) # check we have 1+ coarse gpu region(s) of each type self.assertGreater(len(gpu_only_coarse_regions), 0) self.assertGreater(len(gpu_host_accessible_coarse_regions), 0) # Compilation phase: # FIXME: this is dubious, assume launching agent is indexed first agent = roc.components[0] prog = Program() prog.add_module(brig_module) # get kernel and load code = prog.finalize(agent.isa) ex = Executable() ex.load(agent, code) ex.freeze() # extract symbols sym = ex.get_symbol(agent, "&__vector_copy_kernel") self.assertNotEqual(sym.kernel_object, 0) self.assertGreater(sym.kernarg_segment_size, 0) # attempt kernel excution import ctypes import numpy as np # Do memory allocations # allocate and initialise memory nelem = 1024 * 1024 src = np.random.random(nelem).astype(np.float32) z0 = np.zeros_like(src) # alloc host accessible memory nbytes = ctypes.sizeof(ctypes.c_float) * nelem gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0] host_in_ptr = gpu_host_accessible_region.allocate(nbytes) self.assertNotEqual(host_in_ptr.value, None, "pointer must not be NULL") host_out_ptr = gpu_host_accessible_region.allocate(nbytes) self.assertNotEqual(host_out_ptr.value, None, "pointer must not be NULL") # init mem with data roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes) roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes) # alloc gpu only memory gpu_only_region = gpu_only_coarse_regions[0] gpu_in_ptr = gpu_only_region.allocate(nbytes) self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL") gpu_out_ptr = gpu_only_region.allocate(nbytes) self.assertNotEqual(gpu_out_ptr.value, None, "pointer must not be NULL") # copy memory from host accessible location to gpu only roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes) # Do kernargs # Find a coarse region (for better performance on dGPU) in which # to place kernargs. NOTE: This violates the HSA spec kernarg_regions = list() for r in gpu_host_accessible_coarse_regions: # NOTE: VIOLATION if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG): kernarg_regions.append(r) self.assertGreater(len(kernarg_regions), 0) # use first region for args kernarg_region = kernarg_regions[0] kernarg_ptr = kernarg_region.allocate( 2 * ctypes.sizeof(ctypes.c_void_p)) self.assertNotEqual(kernarg_ptr, None, "pointer must not be NULL") # wire in gpu memory argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value) argref[0] = gpu_in_ptr.value argref[1] = gpu_out_ptr.value # signal sig = roc.create_signal(1) # create queue and dispatch job queue = agent.create_queue_single(32) queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1), grid_size=(nelem, 1, 1),signal=None) # copy result back to host accessible memory to check roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes) # check the data is recovered ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value) np.testing.assert_equal(ref, src) # free roc.hsa_memory_free(host_in_ptr) roc.hsa_memory_free(host_out_ptr) roc.hsa_memory_free(gpu_in_ptr) roc.hsa_memory_free(gpu_out_ptr)
def test_coarse_grained_allocate(self): """ Tests the coarse grained allocation works on a dGPU. It performs a data copying round trip via: memory | HSA cpu memory | HSA dGPU host accessible memory <---| | | HSA dGPU memory --------------------| """ gpu_regions = self.gpu.regions gpu_only_coarse_regions = list() gpu_host_accessible_coarse_regions = list() for r in gpu_regions: if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED): if r.host_accessible: gpu_host_accessible_coarse_regions.append(r) else: gpu_only_coarse_regions.append(r) # check we have 1+ coarse gpu region(s) of each type self.assertGreater(len(gpu_only_coarse_regions), 0) self.assertGreater(len(gpu_host_accessible_coarse_regions), 0) cpu_regions = self.cpu.regions cpu_coarse_regions = list() for r in cpu_regions: if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED): cpu_coarse_regions.append(r) # check we have 1+ coarse cpu region(s) self.assertGreater(len(cpu_coarse_regions), 0) # ten elements of data used nelem = 10 # allocation cpu_region = cpu_coarse_regions[0] cpu_ptr = cpu_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem) self.assertNotEqual(ctypes.addressof(cpu_ptr), 0, "pointer must not be NULL") gpu_only_region = gpu_only_coarse_regions[0] gpu_only_ptr = gpu_only_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem) self.assertNotEqual(ctypes.addressof(gpu_only_ptr), 0, "pointer must not be NULL") gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0] gpu_host_accessible_ptr = gpu_host_accessible_region.allocate( ctypes.sizeof(ctypes.c_float) * nelem) self.assertNotEqual(ctypes.addressof(gpu_host_accessible_ptr), 0, "pointer must not be NULL") # Test writing to allocated area src = np.random.random(nelem).astype(np.float32) roc.hsa_memory_copy(cpu_ptr, src.ctypes.data, src.nbytes) roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, src.nbytes) roc.hsa_memory_copy(gpu_only_ptr, gpu_host_accessible_ptr, src.nbytes) # check write is correct cpu_ref = (ctypes.c_float * nelem).from_address(cpu_ptr.value) for i in range(src.size): self.assertEqual(cpu_ref[i], src[i]) gpu_ha_ref = (ctypes.c_float * nelem).\ from_address(gpu_host_accessible_ptr.value) for i in range(src.size): self.assertEqual(gpu_ha_ref[i], src[i]) # zero out host accessible GPU memory and CPU memory z0 = np.zeros(nelem).astype(np.float32) roc.hsa_memory_copy(cpu_ptr, z0.ctypes.data, z0.nbytes) roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, z0.nbytes) # check zeroing is correct for i in range(z0.size): self.assertEqual(cpu_ref[i], z0[i]) for i in range(z0.size): self.assertEqual(gpu_ha_ref[i], z0[i]) # copy back the data from the GPU roc.hsa_memory_copy(gpu_host_accessible_ptr, gpu_only_ptr, src.nbytes) # check the copy back is ok for i in range(src.size): self.assertEqual(gpu_ha_ref[i], src[i]) # free roc.hsa_memory_free(cpu_ptr) roc.hsa_memory_free(gpu_only_ptr) roc.hsa_memory_free(gpu_host_accessible_ptr)
def test_coarse_grained_kernel_execution(self): """ This tests the execution of a kernel on a dGPU using coarse memory regions for the buffers. NOTE: the code violates the HSA spec in that it uses a coarse region for kernargs, this is a performance hack. """ from numba.roc.hsadrv.driver import BrigModule, Program, hsa,\ Executable # get a brig file brig_file = get_brig_file() brig_module = BrigModule.from_file(brig_file) self.assertGreater(len(brig_module), 0) # use existing GPU regions for computation space gpu_regions = self.gpu.regions gpu_only_coarse_regions = list() gpu_host_accessible_coarse_regions = list() for r in gpu_regions: if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED): if r.host_accessible: gpu_host_accessible_coarse_regions.append(r) else: gpu_only_coarse_regions.append(r) # check we have 1+ coarse gpu region(s) of each type self.assertGreater(len(gpu_only_coarse_regions), 0) self.assertGreater(len(gpu_host_accessible_coarse_regions), 0) # Compilation phase: # FIXME: this is dubious, assume launching agent is indexed first agent = roc.components[0] prog = Program() prog.add_module(brig_module) # get kernel and load code = prog.finalize(agent.isa) ex = Executable() ex.load(agent, code) ex.freeze() # extract symbols sym = ex.get_symbol(agent, "&__vector_copy_kernel") self.assertNotEqual(sym.kernel_object, 0) self.assertGreater(sym.kernarg_segment_size, 0) # attempt kernel excution import ctypes import numpy as np # Do memory allocations # allocate and initialise memory nelem = 1024 * 1024 src = np.random.random(nelem).astype(np.float32) z0 = np.zeros_like(src) # alloc host accessible memory nbytes = ctypes.sizeof(ctypes.c_float) * nelem gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0] host_in_ptr = gpu_host_accessible_region.allocate(nbytes) self.assertNotEqual(host_in_ptr.value, None, "pointer must not be NULL") host_out_ptr = gpu_host_accessible_region.allocate(nbytes) self.assertNotEqual(host_out_ptr.value, None, "pointer must not be NULL") # init mem with data roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes) roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes) # alloc gpu only memory gpu_only_region = gpu_only_coarse_regions[0] gpu_in_ptr = gpu_only_region.allocate(nbytes) self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL") gpu_out_ptr = gpu_only_region.allocate(nbytes) self.assertNotEqual(gpu_out_ptr.value, None, "pointer must not be NULL") # copy memory from host accessible location to gpu only roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes) # Do kernargs # Find a coarse region (for better performance on dGPU) in which # to place kernargs. NOTE: This violates the HSA spec kernarg_regions = list() for r in gpu_host_accessible_coarse_regions: # NOTE: VIOLATION if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG): kernarg_regions.append(r) self.assertGreater(len(kernarg_regions), 0) # use first region for args kernarg_region = kernarg_regions[0] kernarg_ptr = kernarg_region.allocate(2 * ctypes.sizeof(ctypes.c_void_p)) self.assertNotEqual(kernarg_ptr, None, "pointer must not be NULL") # wire in gpu memory argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value) argref[0] = gpu_in_ptr.value argref[1] = gpu_out_ptr.value # signal sig = roc.create_signal(1) # create queue and dispatch job queue = agent.create_queue_single(32) queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1), grid_size=(nelem, 1, 1), signal=None) # copy result back to host accessible memory to check roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes) # check the data is recovered ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value) np.testing.assert_equal(ref, src) # free roc.hsa_memory_free(host_in_ptr) roc.hsa_memory_free(host_out_ptr) roc.hsa_memory_free(gpu_in_ptr) roc.hsa_memory_free(gpu_out_ptr)
def test_coarse_grained_allocate(self): """ Tests the coarse grained allocation works on a dGPU. It performs a data copying round trip via: memory | HSA cpu memory | HSA dGPU host accessible memory <---| | | HSA dGPU memory --------------------| """ gpu_regions = self.gpu.regions gpu_only_coarse_regions = list() gpu_host_accessible_coarse_regions = list() for r in gpu_regions: if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED): if r.host_accessible: gpu_host_accessible_coarse_regions.append(r) else: gpu_only_coarse_regions.append(r) # check we have 1+ coarse gpu region(s) of each type self.assertGreater(len(gpu_only_coarse_regions), 0) self.assertGreater(len(gpu_host_accessible_coarse_regions), 0) cpu_regions = self.cpu.regions cpu_coarse_regions = list() for r in cpu_regions: if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED): cpu_coarse_regions.append(r) # check we have 1+ coarse cpu region(s) self.assertGreater(len(cpu_coarse_regions), 0) # ten elements of data used nelem = 10 # allocation cpu_region = cpu_coarse_regions[0] cpu_ptr = cpu_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem) self.assertNotEqual(ctypes.addressof(cpu_ptr), 0, "pointer must not be NULL") gpu_only_region = gpu_only_coarse_regions[0] gpu_only_ptr = gpu_only_region.allocate( ctypes.sizeof(ctypes.c_float) * nelem) self.assertNotEqual(ctypes.addressof(gpu_only_ptr), 0, "pointer must not be NULL") gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0] gpu_host_accessible_ptr = gpu_host_accessible_region.allocate( ctypes.sizeof(ctypes.c_float) * nelem) self.assertNotEqual(ctypes.addressof(gpu_host_accessible_ptr), 0, "pointer must not be NULL") # Test writing to allocated area src = np.random.random(nelem).astype(np.float32) roc.hsa_memory_copy(cpu_ptr, src.ctypes.data, src.nbytes) roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, src.nbytes) roc.hsa_memory_copy(gpu_only_ptr, gpu_host_accessible_ptr, src.nbytes) # check write is correct cpu_ref = (ctypes.c_float * nelem).from_address(cpu_ptr.value) for i in range(src.size): self.assertEqual(cpu_ref[i], src[i]) gpu_ha_ref = (ctypes.c_float * nelem).\ from_address(gpu_host_accessible_ptr.value) for i in range(src.size): self.assertEqual(gpu_ha_ref[i], src[i]) # zero out host accessible GPU memory and CPU memory z0 = np.zeros(nelem).astype(np.float32) roc.hsa_memory_copy(cpu_ptr, z0.ctypes.data, z0.nbytes) roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, z0.nbytes) # check zeroing is correct for i in range(z0.size): self.assertEqual(cpu_ref[i], z0[i]) for i in range(z0.size): self.assertEqual(gpu_ha_ref[i], z0[i]) # copy back the data from the GPU roc.hsa_memory_copy(gpu_host_accessible_ptr, gpu_only_ptr, src.nbytes) # check the copy back is ok for i in range(src.size): self.assertEqual(gpu_ha_ref[i], src[i]) # free roc.hsa_memory_free(cpu_ptr) roc.hsa_memory_free(gpu_only_ptr) roc.hsa_memory_free(gpu_host_accessible_ptr)