Exemplos de hsa_memory_copy em Python, exemplos de numba.roc.hsadrv.driver.hsa.hsa_memory_copy em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: test_driver.py Projeto: esc/numba

    def test_mempool(self):
        n = 10 # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        dGPU_agent = self.gpu
        CPU_agent = self.cpu

        # allocate a GPU memory pool
        gpu_ctx = Context(dGPU_agent)
        gpu_only_mem = gpu_ctx.mempoolalloc(nbytes)

        # allocate a CPU memory pool, allow the GPU access to it
        cpu_ctx = Context(CPU_agent)
        cpu_mem = cpu_ctx.mempoolalloc(nbytes, allow_access_to=[gpu_ctx.agent])

        ## Test writing to allocated area
        src = np.random.random(n).astype(np.float64)
        roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(gpu_only_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)


        # clear
        z0 = np.zeros_like(src)
        roc.hsa_memory_copy(cpu_mem.device_pointer, z0.ctypes.data, z0.nbytes)
        ref = (n * ctypes.c_double).from_address(cpu_mem.device_pointer.value)
        for k in range(n):
            self.assertEqual(ref[k], 0)

        # copy back from dGPU
        roc.hsa_memory_copy(cpu_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
        for k in range(n):
            self.assertEqual(ref[k], src[k])

Exemplo n.º 2

0

Exibir arquivo

    def test_mempool(self):
        n = 10  # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        dGPU_agent = self.gpu
        CPU_agent = self.cpu

        # allocate a GPU memory pool
        gpu_ctx = Context(dGPU_agent)
        gpu_only_mem = gpu_ctx.mempoolalloc(nbytes)

        # allocate a CPU memory pool, allow the GPU access to it
        cpu_ctx = Context(CPU_agent)
        cpu_mem = cpu_ctx.mempoolalloc(nbytes, allow_access_to=[gpu_ctx.agent])

        ## Test writing to allocated area
        src = np.random.random(n).astype(np.float64)
        roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data,
                            src.nbytes)
        roc.hsa_memory_copy(gpu_only_mem.device_pointer,
                            cpu_mem.device_pointer, src.nbytes)

        # clear
        z0 = np.zeros_like(src)
        roc.hsa_memory_copy(cpu_mem.device_pointer, z0.ctypes.data, z0.nbytes)
        ref = (n * ctypes.c_double).from_address(cpu_mem.device_pointer.value)
        for k in range(n):
            self.assertEqual(ref[k], 0)

        # copy back from dGPU
        roc.hsa_memory_copy(cpu_mem.device_pointer,
                            gpu_only_mem.device_pointer, src.nbytes)
        for k in range(n):
            self.assertEqual(ref[k], src[k])

Exemplo n.º 3

0

Exibir arquivo

    def test_memalloc(self):
        """
            Tests Context.memalloc() for a given, in the parlance of HSA,\
            `component`. Testing includes specialisations for the supported
            components of dGPUs and APUs.
        """
        n = 10  # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        # run if a dGPU is present
        if dgpu_present:
            # find a host accessible region
            dGPU_agent = self.gpu
            CPU_agent = self.cpu
            gpu_ctx = Context(dGPU_agent)
            gpu_only_mem = gpu_ctx.memalloc(nbytes, hostAccessible=False)
            ha_mem = gpu_ctx.memalloc(nbytes, hostAccessible=True)

            # on dGPU systems, all host mem is host accessible
            cpu_ctx = Context(CPU_agent)
            cpu_mem = cpu_ctx.memalloc(nbytes, hostAccessible=True)

            # Test writing to allocated area
            src = np.random.random(n).astype(np.float64)
            roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data,
                                src.nbytes)
            roc.hsa_memory_copy(ha_mem.device_pointer, cpu_mem.device_pointer,
                                src.nbytes)
            roc.hsa_memory_copy(gpu_only_mem.device_pointer,
                                ha_mem.device_pointer, src.nbytes)

            # clear
            z0 = np.zeros_like(src)
            roc.hsa_memory_copy(ha_mem.device_pointer, z0.ctypes.data,
                                z0.nbytes)
            ref = (n * ctypes.c_double).from_address(
                ha_mem.device_pointer.value)
            for k in range(n):
                self.assertEqual(ref[k], 0)

            # copy back from dGPU
            roc.hsa_memory_copy(ha_mem.device_pointer,
                                gpu_only_mem.device_pointer, src.nbytes)
            for k in range(n):
                self.assertEqual(ref[k], src[k])

        else:  #TODO: write APU variant
            pass

Exemplo n.º 4

0

Exibir arquivo

Arquivo: test_driver.py Projeto: esc/numba

    def test_memalloc(self):
        """
            Tests Context.memalloc() for a given, in the parlance of HSA,\
            `component`. Testing includes specialisations for the supported
            components of dGPUs and APUs.
        """
        n = 10 # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        # run if a dGPU is present
        if dgpu_present:
            # find a host accessible region
            dGPU_agent = self.gpu
            CPU_agent = self.cpu
            gpu_ctx = Context(dGPU_agent)
            gpu_only_mem = gpu_ctx.memalloc(nbytes, hostAccessible=False)
            ha_mem = gpu_ctx.memalloc(nbytes, hostAccessible=True)

            # on dGPU systems, all host mem is host accessible
            cpu_ctx = Context(CPU_agent)
            cpu_mem = cpu_ctx.memalloc(nbytes, hostAccessible=True)

            # Test writing to allocated area
            src = np.random.random(n).astype(np.float64)
            roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
            roc.hsa_memory_copy(ha_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)
            roc.hsa_memory_copy(gpu_only_mem.device_pointer, ha_mem.device_pointer, src.nbytes)

            # clear
            z0 = np.zeros_like(src)
            roc.hsa_memory_copy(ha_mem.device_pointer, z0.ctypes.data, z0.nbytes)
            ref = (n * ctypes.c_double).from_address(ha_mem.device_pointer.value)
            for k in range(n):
                self.assertEqual(ref[k], 0)

            # copy back from dGPU
            roc.hsa_memory_copy(ha_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
            for k in range(n):
                self.assertEqual(ref[k], src[k])

        else: #TODO: write APU variant
            pass

Exemplo n.º 5

0

Exibir arquivo

Arquivo: test_driver.py Projeto: esc/numba

    def test_coarse_grained_kernel_execution(self):
        """
        This tests the execution of a kernel on a dGPU using coarse memory
        regions for the buffers.
        NOTE: the code violates the HSA spec in that it uses a coarse region
        for kernargs, this is a performance hack.
        """

        from numba.roc.hsadrv.driver import BrigModule, Program, hsa,\
                Executable

        # get a brig file
        brig_file = get_brig_file()
        brig_module = BrigModule.from_file(brig_file)
        self.assertGreater(len(brig_module), 0)

        # use existing GPU regions for computation space
        gpu_regions = self.gpu.regions
        gpu_only_coarse_regions = list()
        gpu_host_accessible_coarse_regions = list()
        for r in gpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                if r.host_accessible:
                    gpu_host_accessible_coarse_regions.append(r)
                else:
                    gpu_only_coarse_regions.append(r)

        # check we have 1+ coarse gpu region(s) of each type
        self.assertGreater(len(gpu_only_coarse_regions), 0)
        self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

        # Compilation phase:

        # FIXME: this is dubious, assume launching agent is indexed first
        agent = roc.components[0]

        prog = Program()
        prog.add_module(brig_module)

        # get kernel and load
        code = prog.finalize(agent.isa)

        ex = Executable()
        ex.load(agent, code)
        ex.freeze()

        # extract symbols
        sym = ex.get_symbol(agent, "&__vector_copy_kernel")
        self.assertNotEqual(sym.kernel_object, 0)
        self.assertGreater(sym.kernarg_segment_size, 0)

        # attempt kernel excution
        import ctypes
        import numpy as np

        # Do memory allocations

        # allocate and initialise memory
        nelem = 1024 * 1024

        src = np.random.random(nelem).astype(np.float32)
        z0 = np.zeros_like(src)

        # alloc host accessible memory
        nbytes = ctypes.sizeof(ctypes.c_float) * nelem
        gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
        host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_in_ptr.value, None,
                "pointer must not be NULL")
        host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_out_ptr.value, None,
                "pointer must not be NULL")

        # init mem with data
        roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)

        # alloc gpu only memory
        gpu_only_region = gpu_only_coarse_regions[0]
        gpu_in_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL")
        gpu_out_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_out_ptr.value, None,
            "pointer must not be NULL")

        # copy memory from host accessible location to gpu only
        roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)

        # Do kernargs

        # Find a coarse region (for better performance on dGPU) in which
        # to place kernargs. NOTE: This violates the HSA spec
        kernarg_regions = list()
        for r in gpu_host_accessible_coarse_regions:
           # NOTE: VIOLATION
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
                kernarg_regions.append(r)
        self.assertGreater(len(kernarg_regions), 0)

        # use first region for args
        kernarg_region = kernarg_regions[0]

        kernarg_ptr = kernarg_region.allocate(
                2 * ctypes.sizeof(ctypes.c_void_p))

        self.assertNotEqual(kernarg_ptr, None, "pointer must not be NULL")

        # wire in gpu memory
        argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
        argref[0] = gpu_in_ptr.value
        argref[1] = gpu_out_ptr.value

        # signal
        sig = roc.create_signal(1)

        # create queue and dispatch job

        queue = agent.create_queue_single(32)
        queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1),
                           grid_size=(nelem, 1, 1),signal=None)

        # copy result back to host accessible memory to check
        roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)

        # check the data is recovered
        ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
        np.testing.assert_equal(ref, src)

        # free
        roc.hsa_memory_free(host_in_ptr)
        roc.hsa_memory_free(host_out_ptr)
        roc.hsa_memory_free(gpu_in_ptr)
        roc.hsa_memory_free(gpu_out_ptr)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: test_driver.py Projeto: esc/numba

    def test_coarse_grained_allocate(self):
        """
        Tests the coarse grained allocation works on a dGPU.
        It performs a data copying round trip via:
        memory
          |
        HSA cpu memory
          |
        HSA dGPU host accessible memory <---|
          |                                 |
        HSA dGPU memory --------------------|
        """
        gpu_regions = self.gpu.regions
        gpu_only_coarse_regions = list()
        gpu_host_accessible_coarse_regions = list()
        for r in gpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                if r.host_accessible:
                    gpu_host_accessible_coarse_regions.append(r)
                else:
                    gpu_only_coarse_regions.append(r)

        # check we have 1+ coarse gpu region(s) of each type
        self.assertGreater(len(gpu_only_coarse_regions), 0)
        self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

        cpu_regions = self.cpu.regions
        cpu_coarse_regions = list()
        for r in cpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                cpu_coarse_regions.append(r)
        # check we have 1+ coarse cpu region(s)
        self.assertGreater(len(cpu_coarse_regions), 0)

        # ten elements of data used
        nelem = 10

        # allocation
        cpu_region = cpu_coarse_regions[0]
        cpu_ptr = cpu_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem)
        self.assertNotEqual(ctypes.addressof(cpu_ptr), 0,
                "pointer must not be NULL")

        gpu_only_region = gpu_only_coarse_regions[0]
        gpu_only_ptr = gpu_only_region.allocate(ctypes.sizeof(ctypes.c_float) *
                nelem)
        self.assertNotEqual(ctypes.addressof(gpu_only_ptr), 0,
                "pointer must not be NULL")

        gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
        gpu_host_accessible_ptr = gpu_host_accessible_region.allocate(
                ctypes.sizeof(ctypes.c_float) * nelem)
        self.assertNotEqual(ctypes.addressof(gpu_host_accessible_ptr), 0,
                "pointer must not be NULL")

        # Test writing to allocated area
        src = np.random.random(nelem).astype(np.float32)
        roc.hsa_memory_copy(cpu_ptr, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, src.nbytes)
        roc.hsa_memory_copy(gpu_only_ptr, gpu_host_accessible_ptr, src.nbytes)

        # check write is correct
        cpu_ref = (ctypes.c_float * nelem).from_address(cpu_ptr.value)
        for i in range(src.size):
            self.assertEqual(cpu_ref[i], src[i])

        gpu_ha_ref = (ctypes.c_float * nelem).\
            from_address(gpu_host_accessible_ptr.value)
        for i in range(src.size):
            self.assertEqual(gpu_ha_ref[i], src[i])

        # zero out host accessible GPU memory and CPU memory
        z0 = np.zeros(nelem).astype(np.float32)
        roc.hsa_memory_copy(cpu_ptr, z0.ctypes.data, z0.nbytes)
        roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, z0.nbytes)

        # check zeroing is correct
        for i in range(z0.size):
            self.assertEqual(cpu_ref[i], z0[i])

        for i in range(z0.size):
            self.assertEqual(gpu_ha_ref[i], z0[i])

        # copy back the data from the GPU
        roc.hsa_memory_copy(gpu_host_accessible_ptr, gpu_only_ptr, src.nbytes)

        # check the copy back is ok
        for i in range(src.size):
            self.assertEqual(gpu_ha_ref[i], src[i])

        # free
        roc.hsa_memory_free(cpu_ptr)
        roc.hsa_memory_free(gpu_only_ptr)
        roc.hsa_memory_free(gpu_host_accessible_ptr)

Exemplo n.º 7

0

Exibir arquivo

    def test_coarse_grained_kernel_execution(self):
        """
        This tests the execution of a kernel on a dGPU using coarse memory
        regions for the buffers.
        NOTE: the code violates the HSA spec in that it uses a coarse region
        for kernargs, this is a performance hack.
        """

        from numba.roc.hsadrv.driver import BrigModule, Program, hsa,\
                Executable

        # get a brig file
        brig_file = get_brig_file()
        brig_module = BrigModule.from_file(brig_file)
        self.assertGreater(len(brig_module), 0)

        # use existing GPU regions for computation space
        gpu_regions = self.gpu.regions
        gpu_only_coarse_regions = list()
        gpu_host_accessible_coarse_regions = list()
        for r in gpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                if r.host_accessible:
                    gpu_host_accessible_coarse_regions.append(r)
                else:
                    gpu_only_coarse_regions.append(r)

        # check we have 1+ coarse gpu region(s) of each type
        self.assertGreater(len(gpu_only_coarse_regions), 0)
        self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

        # Compilation phase:

        # FIXME: this is dubious, assume launching agent is indexed first
        agent = roc.components[0]

        prog = Program()
        prog.add_module(brig_module)

        # get kernel and load
        code = prog.finalize(agent.isa)

        ex = Executable()
        ex.load(agent, code)
        ex.freeze()

        # extract symbols
        sym = ex.get_symbol(agent, "&__vector_copy_kernel")
        self.assertNotEqual(sym.kernel_object, 0)
        self.assertGreater(sym.kernarg_segment_size, 0)

        # attempt kernel excution
        import ctypes
        import numpy as np

        # Do memory allocations

        # allocate and initialise memory
        nelem = 1024 * 1024

        src = np.random.random(nelem).astype(np.float32)
        z0 = np.zeros_like(src)

        # alloc host accessible memory
        nbytes = ctypes.sizeof(ctypes.c_float) * nelem
        gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
        host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_in_ptr.value, None,
                            "pointer must not be NULL")
        host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_out_ptr.value, None,
                            "pointer must not be NULL")

        # init mem with data
        roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)

        # alloc gpu only memory
        gpu_only_region = gpu_only_coarse_regions[0]
        gpu_in_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL")
        gpu_out_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_out_ptr.value, None,
                            "pointer must not be NULL")

        # copy memory from host accessible location to gpu only
        roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)

        # Do kernargs

        # Find a coarse region (for better performance on dGPU) in which
        # to place kernargs. NOTE: This violates the HSA spec
        kernarg_regions = list()
        for r in gpu_host_accessible_coarse_regions:
            # NOTE: VIOLATION
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
                kernarg_regions.append(r)
        self.assertGreater(len(kernarg_regions), 0)

        # use first region for args
        kernarg_region = kernarg_regions[0]

        kernarg_ptr = kernarg_region.allocate(2 *
                                              ctypes.sizeof(ctypes.c_void_p))

        self.assertNotEqual(kernarg_ptr, None, "pointer must not be NULL")

        # wire in gpu memory
        argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
        argref[0] = gpu_in_ptr.value
        argref[1] = gpu_out_ptr.value

        # signal
        sig = roc.create_signal(1)

        # create queue and dispatch job

        queue = agent.create_queue_single(32)
        queue.dispatch(sym,
                       kernarg_ptr,
                       workgroup_size=(256, 1, 1),
                       grid_size=(nelem, 1, 1),
                       signal=None)

        # copy result back to host accessible memory to check
        roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)

        # check the data is recovered
        ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
        np.testing.assert_equal(ref, src)

        # free
        roc.hsa_memory_free(host_in_ptr)
        roc.hsa_memory_free(host_out_ptr)
        roc.hsa_memory_free(gpu_in_ptr)
        roc.hsa_memory_free(gpu_out_ptr)

Exemplo n.º 8

0

Exibir arquivo

    def test_coarse_grained_allocate(self):
        """
        Tests the coarse grained allocation works on a dGPU.
        It performs a data copying round trip via:
        memory
          |
        HSA cpu memory
          |
        HSA dGPU host accessible memory <---|
          |                                 |
        HSA dGPU memory --------------------|
        """
        gpu_regions = self.gpu.regions
        gpu_only_coarse_regions = list()
        gpu_host_accessible_coarse_regions = list()
        for r in gpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                if r.host_accessible:
                    gpu_host_accessible_coarse_regions.append(r)
                else:
                    gpu_only_coarse_regions.append(r)

        # check we have 1+ coarse gpu region(s) of each type
        self.assertGreater(len(gpu_only_coarse_regions), 0)
        self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

        cpu_regions = self.cpu.regions
        cpu_coarse_regions = list()
        for r in cpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                cpu_coarse_regions.append(r)
        # check we have 1+ coarse cpu region(s)
        self.assertGreater(len(cpu_coarse_regions), 0)

        # ten elements of data used
        nelem = 10

        # allocation
        cpu_region = cpu_coarse_regions[0]
        cpu_ptr = cpu_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem)
        self.assertNotEqual(ctypes.addressof(cpu_ptr), 0,
                            "pointer must not be NULL")

        gpu_only_region = gpu_only_coarse_regions[0]
        gpu_only_ptr = gpu_only_region.allocate(
            ctypes.sizeof(ctypes.c_float) * nelem)
        self.assertNotEqual(ctypes.addressof(gpu_only_ptr), 0,
                            "pointer must not be NULL")

        gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
        gpu_host_accessible_ptr = gpu_host_accessible_region.allocate(
            ctypes.sizeof(ctypes.c_float) * nelem)
        self.assertNotEqual(ctypes.addressof(gpu_host_accessible_ptr), 0,
                            "pointer must not be NULL")

        # Test writing to allocated area
        src = np.random.random(nelem).astype(np.float32)
        roc.hsa_memory_copy(cpu_ptr, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, src.nbytes)
        roc.hsa_memory_copy(gpu_only_ptr, gpu_host_accessible_ptr, src.nbytes)

        # check write is correct
        cpu_ref = (ctypes.c_float * nelem).from_address(cpu_ptr.value)
        for i in range(src.size):
            self.assertEqual(cpu_ref[i], src[i])

        gpu_ha_ref = (ctypes.c_float * nelem).\
            from_address(gpu_host_accessible_ptr.value)
        for i in range(src.size):
            self.assertEqual(gpu_ha_ref[i], src[i])

        # zero out host accessible GPU memory and CPU memory
        z0 = np.zeros(nelem).astype(np.float32)
        roc.hsa_memory_copy(cpu_ptr, z0.ctypes.data, z0.nbytes)
        roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, z0.nbytes)

        # check zeroing is correct
        for i in range(z0.size):
            self.assertEqual(cpu_ref[i], z0[i])

        for i in range(z0.size):
            self.assertEqual(gpu_ha_ref[i], z0[i])

        # copy back the data from the GPU
        roc.hsa_memory_copy(gpu_host_accessible_ptr, gpu_only_ptr, src.nbytes)

        # check the copy back is ok
        for i in range(src.size):
            self.assertEqual(gpu_ha_ref[i], src[i])

        # free
        roc.hsa_memory_free(cpu_ptr)
        roc.hsa_memory_free(gpu_only_ptr)
        roc.hsa_memory_free(gpu_host_accessible_ptr)