def test_realign_numpy_array(self):
    """Check that cl.realign_array returns arrays at the requested alignment.

    For each (element count, alignment) pair a float32 array is allocated,
    passed through cl.realign_array, and the address reported by
    __array_interface__ must be an exact multiple of the alignment.
    """
    import numpy

    cases = ((1000, 1056), (1024, 4096))
    for count, alignment in cases:
        aligned = cl.realign_array(
            numpy.empty(count, dtype=numpy.float32), alignment, numpy)
        # First item of the "data" entry is the buffer address.
        address = aligned.__array_interface__["data"][0]
        self.assertEqual(address % alignment, 0)
def ocl_realign_mem(self):
    """Realign the host array for use with CL_MEM_USE_HOST_PTR.

    The memory should be PAGE-aligned, so the alignment is never smaller
    than 4096; a larger value reported by the device info is honoured.
    """
    page_size = 4096
    # isinstance() is tested first so that device_info is never touched
    # on a NumpyDevice.
    page_is_enough = (
        isinstance(self.device, NumpyDevice) or
        self.device.device_info.memalign <= page_size)
    memalign = (page_size if page_is_enough
                else self.device.device_info.memalign)
    self.mem = cl.realign_array(self._mem, memalign, numpy)
def test_realign_numpy_array(self):
    """Check that cl.realign_array honours a 1056-byte alignment request.

    numpy is imported locally, falling back to numpypy (for PyPy) when the
    regular package is unavailable.
    """
    try:
        import numpy
    except ImportError:
        # Fallback used on PyPy.
        try:
            import numpypy as numpy
        except ImportError:
            raise ImportError("Could not import numpy")

    alignment = 1056
    source = numpy.empty(1000, dtype=numpy.float32)
    aligned = cl.realign_array(source, alignment, numpy)
    # First item of the "data" entry is the buffer address.
    address = aligned.__array_interface__["data"][0]
    self.assertEqual(address % alignment, 0)
def test():
    """Run the "test" kernel from test.cl and print the result residuals.

    The kernel is executed twice on host-pointer buffers. After each run
    the data is fetched both through map_buffer and through read_buffer,
    and the difference from the host-side reference (a + b) * c is printed.
    Before the second run the buffer is refilled with write_buffer.
    """
    print(os.environ.get("PYOPENCL_CTX"))
    os.environ["PYOPENCL_CTX"] = "0:0"

    # Platform, context, queue, program and kernel.
    platforms = cl.Platforms()
    print("OpenCL devices:\n%s" % platforms.dump_devices())
    ctx = platforms.create_some_context()
    queue = ctx.create_queue(ctx.devices[0], cl.CL_QUEUE_PROFILING_ENABLE)
    # Disabled alternative that builds the program from an inline source:
    #
    #   prg = ctx.create_program(
    #       """
    #       __kernel void test(
    #           __global float *a, __global float *b, const float c) {
    #         size_t i = get_global_id(0);
    #         a[i] = (a[i] + b[i]) * c;
    #       }
    #       """)
    prg = ctx.create_program(testopencl.readoclfile("test.cl"))
    krn = prg.get_kernel("test")

    # Input data.
    lhs = np.arange(100000, dtype=np.float32)
    rhs = np.cos(lhs)
    lhs = np.sin(lhs)
    lhs_backup = lhs.copy()

    # Realign the arrays so they can be used with map_buffer.
    alignment = queue.device.memalign
    lhs = cl.realign_array(lhs, alignment, np)
    rhs = cl.realign_array(rhs, alignment, np)
    scale = np.array([0.1], dtype=np.float32)
    expected = (lhs + rhs) * scale[0]

    # Device buffers backed by the host arrays.
    host_flags = cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR
    lhs_buf = ctx.create_buffer(host_flags, lhs)
    rhs_buf = ctx.create_buffer(host_flags, rhs)

    krn.set_args(lhs_buf, rhs_buf, scale[0:1])

    # First run, no event requested.
    global_size = [lhs.size]
    local_size = None
    queue.execute_kernel(krn, global_size, local_size, need_event=False)

    # Fetch the result via map_buffer ...
    ev, ptr = queue.map_buffer(lhs_buf, cl.CL_MAP_READ, lhs.nbytes)
    del ev
    queue.unmap_buffer(lhs_buf, ptr).wait()
    print(lhs - expected)

    # ... and via read_buffer.
    first_readback = np.zeros(lhs.shape, dtype=lhs.dtype)
    queue.read_buffer(lhs_buf, first_readback)
    print(first_readback - expected)

    # Restore the original input with a non-blocking write_buffer.
    ev = queue.write_buffer(
        lhs_buf, lhs_backup, blocking=False, need_event=True)

    # Second run, chained on the write event.
    ev = queue.execute_kernel(
        krn, global_size, local_size, wait_for=(ev, ))

    # Fetch the result via map_buffer, chained on the kernel event.
    ev, ptr = queue.map_buffer(
        lhs_buf, cl.CL_MAP_READ, lhs.nbytes,
        wait_for=(ev, ), need_event=True)
    ev.wait()
    queue.unmap_buffer(lhs_buf, ptr).wait()
    print(lhs - expected)

    second_readback = np.zeros(lhs.shape, dtype=lhs.dtype)
    queue.read_buffer(lhs_buf, second_readback)
    print(second_readback - expected)

    # Drop the local references in the original order, then collect.
    del queue, ctx, krn, prg
    gc.collect()
def cuda_realign_mem(self):
    """Realign the backing array to a 4096-byte boundary.

    A numpy array with contiguous memory layout is expected, hence the
    realignment; PAGE-boundary alignment may also increase speed.
    """
    page_alignment = 4096
    self.mem = cl.realign_array(self._mem, page_alignment, numpy)
def test_create_sub_buffer(self):
    """Run the "test" kernel on sub-buffers of two host-backed buffers.

    Verifies the _n_refs bookkeeping when sub-buffers are created and
    deleted, and that the kernel result seen through map_buffer and
    read_buffer matches a + b * c on the corresponding array windows.
    """
    import numpy

    # Platform, context, program, kernel and queue.
    platforms = cl.Platforms()
    ctx = platforms.create_some_context()
    prg = ctx.create_program(self.src_test, self.include_dirs)
    krn = prg.get_kernel("test")
    queue = ctx.create_queue(ctx.devices[0])

    # Input data.
    a = numpy.arange(100000, dtype=numpy.float32)
    b = numpy.cos(a)
    a = numpy.sin(a)

    # Realign the arrays so they can be used with map_buffer.
    a = cl.realign_array(a, queue.device.memalign, numpy)
    b = cl.realign_array(b, queue.device.memalign, numpy)
    c = numpy.array([1.2345], dtype=numpy.float32)

    # Element windows compared against the sub-buffers created below.
    a_window = slice(1024, 1024 + 4096)
    b_window = slice(2048, 2048 + 4096)
    d = a[a_window] + b[b_window] * c[0]

    # Parent buffers and their sub-buffers, checking reference counts.
    # create_sub_buffer arguments are presumably (byte offset, byte size)
    # -- TODO confirm.
    host_flags = cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR
    a_parent_ = ctx.create_buffer(host_flags, a)
    self.assertEqual(a_parent_._n_refs, 1)
    a_ = a_parent_.create_sub_buffer(4096, 16384)
    self.assertEqual(a_parent_._n_refs, 2)
    self.assertEqual(a_._n_refs, 1)

    b_parent_ = ctx.create_buffer(host_flags, b)
    self.assertEqual(b_parent_._n_refs, 1)
    b_ = b_parent_.create_sub_buffer(8192, 16384)
    self.assertEqual(b_parent_._n_refs, 2)
    self.assertEqual(b_._n_refs, 1)

    krn.set_args(a_, b_, c[0:1])

    # Run the kernel over the 4096 elements of the sub-buffer.
    global_size = [4096]
    local_size = None
    queue.execute_kernel(krn, global_size, local_size, need_event=False)

    tolerance = 0.0001

    # Result as seen through map_buffer (host array updated in place).
    ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a_.size)
    del ev
    queue.unmap_buffer(a_, ptr).wait()
    map_error = numpy.fabs(a[a_window] - d).max()
    self.assertLess(
        map_error, tolerance, "Incorrect result after map_buffer")

    # Result as seen through read_buffer.
    aa = numpy.zeros(4096, dtype=numpy.float32)
    queue.read_buffer(a_, aa)
    read_error = numpy.fabs(aa - d).max()
    self.assertLess(
        read_error, tolerance, "Incorrect result after read_buffer")

    # Deleting a sub-buffer may or may not release the parent reference
    # immediately, so both counts are accepted.
    del b_
    self.assertIn(b_parent_._n_refs, (1, 2))
    logging.info(
        "test_create_sub_buffer: b_parent_._n_refs = %d (expected 1 or 2)",
        b_parent_._n_refs)

    del a_
    self.assertIn(a_parent_._n_refs, (1, 2))
    logging.info(
        "test_create_sub_buffer: a_parent_._n_refs = %d (expected 1 or 2)",
        a_parent_._n_refs)
def test_api_numpy(self):
    """Exercise kernel execution and buffer transfers with numpy arrays.

    The "test" kernel is run three times on host-pointer buffers; each
    time the result must match a + b * c within 0.0001. Between runs the
    input is restored first through a write-mapping of the buffer and
    then through write_buffer.
    """
    import numpy

    # Platform, context, program, kernel and queue.
    platforms = cl.Platforms()
    ctx = platforms.create_some_context()
    prg = ctx.create_program(self.src_test, self.include_dirs)
    krn = prg.get_kernel("test")
    queue = ctx.create_queue(ctx.devices[0])

    # Input data, plus a pristine copy used to refill the buffer.
    a = numpy.arange(100000, dtype=numpy.float32)
    b = numpy.cos(a)
    a = numpy.sin(a)
    a_copy = a.copy()

    # Realign the arrays so they can be used with map_buffer.
    a = cl.realign_array(a, queue.device.memalign, numpy)
    b = cl.realign_array(b, queue.device.memalign, numpy)
    c = numpy.array([1.2345], dtype=numpy.float32)
    d = a + b * c[0]

    # Device buffers backed by the host arrays.
    host_flags = cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR
    a_ = ctx.create_buffer(host_flags, a)
    b_ = ctx.create_buffer(host_flags, b)

    krn.set_args(a_, b_, c[0:1])

    tolerance = 0.0001
    global_size = [a.size]
    local_size = None

    # --- Run 1: no events -------------------------------------------
    queue.execute_kernel(krn, global_size, local_size, need_event=False)

    # Result through map_buffer (host array updated in place).
    ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes)
    del ev
    queue.unmap_buffer(a_, ptr).wait()
    self.assertLess(
        numpy.fabs(a - d).max(), tolerance,
        "Incorrect result after map_buffer")

    # Result through read_buffer.
    aa = numpy.zeros(a.shape, dtype=a.dtype)
    queue.read_buffer(a_, aa)
    self.assertLess(
        numpy.fabs(aa - d).max(), tolerance,
        "Incorrect result after read_buffer")

    # --- Run 2: refill through a write-mapping -----------------------
    # The invalidate-region flag is only used when the device version
    # is not below the 1.1999 threshold.
    if queue.device.version < 1.1999:
        write_flag = cl.CL_MAP_WRITE
    else:
        write_flag = cl.CL_MAP_WRITE_INVALIDATE_REGION
    ev, ptr = queue.map_buffer(
        a_, write_flag, a.nbytes, blocking=False, need_event=True)
    ev.wait()
    a[:] = a_copy[:]
    ev = queue.unmap_buffer(a_, ptr)

    ev = queue.execute_kernel(
        krn, global_size, local_size, wait_for=(ev, ))

    ev, ptr = queue.map_buffer(
        a_, cl.CL_MAP_READ, a.nbytes, wait_for=(ev, ), need_event=True)
    ev.wait()
    queue.unmap_buffer(a_, ptr).wait()
    self.assertLess(
        numpy.fabs(a - d).max(), tolerance,
        "Incorrect result after map_buffer")

    # --- Run 3: refill through write_buffer --------------------------
    ev = queue.write_buffer(a_, a_copy, blocking=False, need_event=True)

    ev = queue.execute_kernel(
        krn, global_size, local_size, wait_for=(ev, ))

    ev, ptr = queue.map_buffer(
        a_, cl.CL_MAP_READ, a.nbytes, wait_for=(ev, ), need_event=True)
    ev.wait()
    queue.unmap_buffer(a_, ptr).wait()
    self.assertLess(
        numpy.fabs(a - d).max(), tolerance,
        "Incorrect result after map_buffer")
def test_create_sub_buffer(self):
    """Check sub-buffer creation, kernel execution on it and ref counts.

    Two parent buffers are created over realigned host arrays, a
    sub-buffer is taken from each, and the "test" kernel is run on the
    sub-buffers. The output must equal a + b * c on the matching array
    ranges, and the parents' _n_refs must follow sub-buffer creation and
    deletion.
    """
    import numpy

    # Set up platform, context, program, kernel and queue.
    platforms = cl.Platforms()
    ctx = platforms.create_some_context()
    prg = ctx.create_program(self.src_test, self.include_dirs)
    krn = prg.get_kernel("test")
    queue = ctx.create_queue(ctx.devices[0])

    # Test values.
    a = numpy.arange(100000, dtype=numpy.float32)
    b = numpy.cos(a)
    a = numpy.sin(a)

    # map_buffer needs realigned arrays.
    a = cl.realign_array(a, queue.device.memalign, numpy)
    b = cl.realign_array(b, queue.device.memalign, numpy)
    c = numpy.array([1.2345], dtype=numpy.float32)

    n_items = 4096
    a_first, b_first = 1024, 2048
    d = (a[a_first:a_first + n_items] +
         b[b_first:b_first + n_items] * c[0])

    # Buffers: each parent gains one reference per sub-buffer.
    mem_flags = cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR

    a_parent_ = ctx.create_buffer(mem_flags, a)
    self.assertEqual(a_parent_._n_refs, 1)
    a_ = a_parent_.create_sub_buffer(4096, 16384)
    self.assertEqual(a_parent_._n_refs, 2)
    self.assertEqual(a_._n_refs, 1)

    b_parent_ = ctx.create_buffer(mem_flags, b)
    self.assertEqual(b_parent_._n_refs, 1)
    b_ = b_parent_.create_sub_buffer(8192, 16384)
    self.assertEqual(b_parent_._n_refs, 2)
    self.assertEqual(b_._n_refs, 1)

    # Kernel arguments and launch.
    krn.set_args(a_, b_, c[0:1])
    global_size = [n_items]
    local_size = None
    queue.execute_kernel(krn, global_size, local_size, need_event=False)

    # Read back by mapping the sub-buffer; the host array holds the data.
    ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a_.size)
    del ev
    queue.unmap_buffer(a_, ptr).wait()
    mapped = a[a_first:a_first + n_items]
    self.assertLess(numpy.fabs(mapped - d).max(), 0.0001,
                    "Incorrect result after map_buffer")

    # Read back by an explicit copy into a fresh array.
    aa = numpy.zeros(n_items, dtype=numpy.float32)
    queue.read_buffer(a_, aa)
    self.assertLess(numpy.fabs(aa - d).max(), 0.0001,
                    "Incorrect result after read_buffer")

    # After dropping a sub-buffer the parent count may be 1 or 2.
    del b_
    self.assertIn(b_parent_._n_refs, (1, 2))
    logging.info("test_create_sub_buffer: "
                 "b_parent_._n_refs = %d (expected 1 or 2)",
                 b_parent_._n_refs)

    del a_
    self.assertIn(a_parent_._n_refs, (1, 2))
    logging.info("test_create_sub_buffer: "
                 "a_parent_._n_refs = %d (expected 1 or 2)",
                 a_parent_._n_refs)
def test_api_numpy(self):
    """Check kernel runs and map/read/write buffer round trips with numpy.

    Runs the "test" kernel on buffers that use the host arrays directly
    and asserts the output equals a + b * c within 0.0001, reading it
    back with map_buffer and read_buffer. The input is then restored
    (once via a mapped write, once via write_buffer) and the kernel is
    re-run with event chaining, asserting the same result each time.
    """
    import numpy

    # Set up platform, context, program, kernel and queue.
    platforms = cl.Platforms()
    ctx = platforms.create_some_context()
    prg = ctx.create_program(self.src_test, self.include_dirs)
    krn = prg.get_kernel("test")
    queue = ctx.create_queue(ctx.devices[0])

    # Test values; a_copy keeps the untouched input for later refills.
    a = numpy.arange(100000, dtype=numpy.float32)
    b = numpy.cos(a)
    a = numpy.sin(a)
    a_copy = a.copy()

    # map_buffer needs realigned arrays.
    a = cl.realign_array(a, queue.device.memalign, numpy)
    b = cl.realign_array(b, queue.device.memalign, numpy)
    c = numpy.array([1.2345], dtype=numpy.float32)
    d = a + b * c[0]

    # Buffers share memory with the host arrays.
    mem_flags = cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR
    a_ = ctx.create_buffer(mem_flags, a)
    b_ = ctx.create_buffer(mem_flags, b)

    # Kernel arguments and first launch (no event).
    krn.set_args(a_, b_, c[0:1])
    global_size = [a.size]
    local_size = None
    n_bytes = a.nbytes
    queue.execute_kernel(krn, global_size, local_size, need_event=False)

    # Read back by mapping; the host array holds the data.
    ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, n_bytes)
    del ev
    queue.unmap_buffer(a_, ptr).wait()
    self.assertLess(numpy.fabs(a - d).max(), 0.0001,
                    "Incorrect result after map_buffer")

    # Read back by an explicit copy into a fresh array.
    aa = numpy.zeros(a.shape, dtype=a.dtype)
    queue.read_buffer(a_, aa)
    self.assertLess(numpy.fabs(aa - d).max(), 0.0001,
                    "Incorrect result after read_buffer")

    # Restore the input through a non-blocking write mapping.
    if queue.device.version < 1.1999:
        map_mode = cl.CL_MAP_WRITE
    else:
        map_mode = cl.CL_MAP_WRITE_INVALIDATE_REGION
    ev, ptr = queue.map_buffer(a_, map_mode, n_bytes,
                               blocking=False, need_event=True)
    ev.wait()
    a[:] = a_copy[:]
    ev = queue.unmap_buffer(a_, ptr)

    # Launch after the unmap has completed.
    ev = queue.execute_kernel(krn, global_size, local_size,
                              wait_for=(ev,))

    # Map for reading after the kernel has completed.
    ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, n_bytes,
                               wait_for=(ev,), need_event=True)
    ev.wait()
    queue.unmap_buffer(a_, ptr).wait()
    self.assertLess(numpy.fabs(a - d).max(), 0.0001,
                    "Incorrect result after map_buffer")

    # Restore the input through a non-blocking write_buffer.
    ev = queue.write_buffer(a_, a_copy, blocking=False, need_event=True)

    # Launch after the write has completed.
    ev = queue.execute_kernel(krn, global_size, local_size,
                              wait_for=(ev,))

    # Map for reading after the kernel has completed.
    ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, n_bytes,
                               wait_for=(ev,), need_event=True)
    ev.wait()
    queue.unmap_buffer(a_, ptr).wait()
    self.assertLess(numpy.fabs(a - d).max(), 0.0001,
                    "Incorrect result after map_buffer")