def typeof(ctx, obj):
    """Return the kernel argument type descriptor to use for ``obj``.

    The checks run in order and the first match wins:

    * ``cl.MemoryObject`` -> ``cl.global_memory`` built from the object's
      ``ctype`` and ``shape``.
    * ``cl.local_memory`` instances, and objects for which ``isfunction``
      is true -> returned unchanged.
    * ``int`` -> ``ctypes.c_int``; ``float`` -> ``ctypes.c_float``.
    * ``ctypes.Structure`` instances -> ``cl.constant_memory`` of the
      structure's class.
    * anything ``memoryview`` accepts -> ``cl.global_memory`` built from
      the view's ``format`` and ``shape``.
    * everything else -> ``type(obj)``.
    """
    if isinstance(obj, cl.MemoryObject):
        return cl.global_memory(obj.ctype, ndim=len(obj.shape),
                                shape=obj.shape, context=ctx)
    if isinstance(obj, cl.local_memory) or isfunction(obj):
        return obj
    if isinstance(obj, int):
        return ctypes.c_int
    if isinstance(obj, float):
        return ctypes.c_float
    if isinstance(obj, ctypes.Structure):
        return cl.constant_memory(type(obj), 0, (), context=ctx)

    try:
        view = memoryview(obj)
        return cl.global_memory(view.format, ndim=len(view.shape),
                                shape=view.shape, context=ctx)
    except TypeError:
        # Any TypeError raised in the block above (by memoryview or by
        # cl.global_memory) falls back to the plain Python type.
        return type(obj)
def setslice(context, arr, value):
    """Compile and run the ``setslice`` kernel with ``arr`` and ``value``.

    ``value`` is converted with ``context.asarray`` unless it already is a
    ``cl.DeviceMemoryView``, then broadcast to ``arr.shape``.  When the
    value's queue differs from ``arr.queue``, ``arr.queue`` is told to wait
    on a marker from the value's queue before the kernel is enqueued.

    Returns whatever the compiled kernel call returns.
    """
    if isinstance(value, cl.DeviceMemoryView):
        source = value
    else:
        source = context.asarray(value)

    queues_differ = source.queue != arr.queue
    if queues_differ:
        arr.queue.enqueue_wait_for_events(source.queue.marker())

    source = cl.broadcast(source, arr.shape)

    arr_type = cl.global_memory(arr.format, flat=True)
    value_type = cl.global_memory(source.format, flat=True)
    kernel = setslice_kernel.compile(context, arr=arr_type, value=value_type,
                                     cly_meta='setslice')

    return kernel(arr.queue, arr, arr.array_info, source, source.array_info)
def create_grid(cls, nx=500, ny=500):
    """Create a ``Grid`` with a compiled ``lp2dstep`` kernel and a queue.

    A ``CLArrayContext`` is created for ``cls.DEVICE_TYPE`` and handed to
    ``Grid(ctx, nx, ny)``.  The kernel source is a template filled with
    ``ny`` (twice: as the row stride and as the ``j`` loop bound), ``dy**2``,
    ``dx**2`` and ``0.5 / (dx**2 + dy**2)``.

    The following attributes are set on the returned grid: ``prg`` (the
    built program), ``lp2dstep`` (the kernel, with ``argnames``,
    ``argtypes`` and a global work size of ``[nx - 2]``) and ``queue``.
    """
    array_ctx = CLArrayContext(device_type=cls.DEVICE_TYPE)
    grid = Grid(array_ctx, nx, ny)

    dx2 = grid.dx ** 2
    dy2 = grid.dy ** 2
    dnr_inv = 0.5 / (dx2 + dy2)

    # Each work item handles one ``i`` and walks every second ``j``; the
    # start of the walk depends on the parity of ``i + stidx``.
    # NOTE(review): the work size is derived from nx while the stride and
    # the j bound both use ny -- looks consistent only when nx == ny;
    # confirm against Grid's memory layout.
    kernel_template = (
        " __kernel void lp2dstep( __global float *u, const uint stidx ) "
        "{ int i = get_global_id(0) + 1; int ny = %d; "
        "for ( int j = 1 + ( ( i + stidx ) %% 2 ); j<( %d-1 ); j+=2 ) "
        "{ u[ny*j + i] = ((u[ny*(j-1) + i] + u[ny*(j+1) + i])*%g + "
        "(u[ny*j + i-1] + u[ny*j + i + 1])*%g)*%g; } }"
    )
    kernel_source = kernel_template % (ny, ny, dy2, dx2, dnr_inv)

    grid.prg = cl.Program(array_ctx, kernel_source)
    grid.prg.build()

    grid.lp2dstep = grid.prg.lp2dstep
    grid.lp2dstep.argnames = ('u', 'stidx')
    grid.lp2dstep.argtypes = (cl.global_memory(ctype='f'), cl.cl_uint)
    grid.lp2dstep.global_work_size = [nx - 2]

    grid.queue = cl.Queue(array_ctx)
    return grid
def test_call(self):
    """Calling a kernel needs a global work size, given either way.

    Without any work size the call must raise ``TypeError``.  The kernel is
    then run once with an explicit ``global_work_size`` keyword and once
    with a ``global_work_size`` callable attached to the kernel; both runs
    are checked against the same expected x/y values.
    """
    expected = np.zeros([10], dtype=[('x', np.float32), ('y', np.float32)])
    expected['x'] = np.arange(10)
    expected['y'] = np.sin(expected['x'] / 10)

    program = Program(ctx, source=source)
    program.build()

    generate_sin = program.kernel('generate_sin')
    generate_sin.argtypes = [global_memory(), ctypes.c_float]

    buf = empty(ctx, [10], ctype=cl.cl_float2)
    queue = Queue(ctx, ctx.devices[0])
    size = [buf.size]

    def verify_buffer():
        # Column 0 holds x, column 1 holds y of each float2 element.
        with buf.map(queue) as host:
            x_matches = np.all(expected['x'] == np.asarray(host)[:, 0])
            self.assertTrue(x_matches)
            y_matches = np.allclose(expected['y'], np.asarray(host)[:, 1])
            self.assertTrue(y_matches)

    # No work size known yet: the call has to be rejected.
    with self.assertRaises(TypeError):
        generate_sin(queue, buf, 1.0)

    generate_sin(queue, buf, 1.0, global_work_size=size)
    verify_buffer()

    generate_sin.global_work_size = lambda a, scale: [a.size]
    generate_sin(queue, buf, 1.0)
    verify_buffer()
def __call__(self, context, x, y, out=None, queue=None):
    """Apply ``self.device_func`` element-wise to ``x`` and ``y``.

    :param context: provides ``asarray``, ``empty`` and a default ``queue``.
    :param x: first operand; converted with ``context.asarray`` unless it
        is a ``cl.DeviceMemoryView``.
    :param y: second operand; converted the same way.
    :param out: optional output buffer; when omitted one is allocated with
        the broadcast shape and ``x``'s format.
    :param queue: queue to run on; defaults to ``x.queue``, then
        ``y.queue``, then ``context.queue``.
    :return: ``out`` viewed as a ``CLArray``.
    """
    if queue is None:
        # The first operand that carries a queue decides where to run.
        for operand in (x, y):
            if hasattr(operand, 'queue'):
                queue = operand.queue
                break
        else:
            queue = context.queue

    if not isinstance(x, cl.DeviceMemoryView):
        x = context.asarray(x)
    if not isinstance(y, cl.DeviceMemoryView):
        y = context.asarray(y)

    # Operands that live on another queue must finish there first.
    for operand in (y, x):
        if operand.queue != queue:
            queue.enqueue_wait_for_events(operand.queue.marker())

    new_shape = broadcast_shape(x.shape, y.shape)
    a = cl.broadcast(x, new_shape)
    b = cl.broadcast(y, new_shape)

    if out is None:
        out = context.empty(shape=new_shape, ctype=x.format, queue=queue)

    kernel = ufunc_kernel.compile(
        context,
        function=self.device_func,
        a=cl.global_memory(a.format, flat=True),
        b=cl.global_memory(b.format, flat=True),
        out=cl.global_memory(out.format, flat=True),
        cly_meta=self.device_func.func_name,
    )
    kernel(queue, a, a.array_info, b, b.array_info, out, out.array_info)

    result = CLArray._view_as_this(out)
    result.__array_init__(context, queue)
    return result
def __call__(self, context, x, y, out=None, queue=None):
    """Apply ``self.device_func`` element-wise to ``x`` and ``y``.

    :param context: provides ``asarray``, ``empty`` and a default ``queue``.
    :param x: first operand; converted with ``context.asarray`` unless it
        is a ``cl.DeviceMemoryView``.
    :param y: second operand; converted the same way.
    :param out: optional output buffer; when omitted one is allocated with
        the broadcast shape and ``x``'s format.
    :param queue: queue to run on; defaults to ``x.queue``, then
        ``y.queue``, then ``context.queue``.
    :return: ``out`` viewed as a ``CLArray``.
    """
    if queue is None:
        # The first operand that carries a queue decides where to run.
        for operand in (x, y):
            if hasattr(operand, 'queue'):
                queue = operand.queue
                break
        else:
            queue = context.queue

    if not isinstance(x, cl.DeviceMemoryView):
        x = context.asarray(x)
    if not isinstance(y, cl.DeviceMemoryView):
        y = context.asarray(y)

    # Operands that live on another queue must finish there first.
    for operand in (y, x):
        if operand.queue != queue:
            queue.enqueue_wait_for_events(operand.queue.marker())

    new_shape = broadcast_shape(x.shape, y.shape)
    a = cl.broadcast(x, new_shape)
    b = cl.broadcast(y, new_shape)

    if out is None:
        out = context.empty(shape=new_shape, ctype=x.format, queue=queue)

    kernel = ufunc_kernel.compile(
        context,
        function=self.device_func,
        a=cl.global_memory(a.format, flat=True),
        b=cl.global_memory(b.format, flat=True),
        out=cl.global_memory(out.format, flat=True),
        cly_meta=self.device_func.func_name,
    )
    kernel(queue, a, a.array_info, b, b.array_info, out, out.array_info)

    result = CLArray._view_as_this(out)
    result.__array_init__(context, queue)
    return result
def reduce(self, context, x, out=None, initial=0.0, queue=None):
    """Reduce ``x`` with ``self.device_func`` using the reduce kernel.

    :param context: passed to ``__array_init__`` of the result; also
        supplies the fallback ``queue``.
    :param x: input; copied to the device with ``cl.from_host`` unless it
        is a ``cl.DeviceMemoryView``.
    :param out: optional one-element output buffer; allocated with ``x``'s
        format when omitted.
    :param initial: accepted for interface compatibility; not read by
        this implementation.
    :param queue: queue to run on; defaults to ``x.queue`` and, when ``x``
        has no queue (e.g. a host array), to ``context.queue``.
    :return: ``out`` viewed as a ``CLArray``.
    """
    if queue is None:
        # Host arrays are accepted below but carry no queue, so reading
        # x.queue unconditionally raised AttributeError for them.
        queue = x.queue if hasattr(x, 'queue') else context.queue

    if not isinstance(x, cl.DeviceMemoryView):
        x = cl.from_host(queue.context, x)

    size = x.size
    shared = cl.local_memory(x.ctype, ndim=1, shape=[size])

    # Largest power of two not above size // 2, at least 1, at most 512.
    half = size // 2
    group_size = 512
    for limit in (2, 4, 8, 16, 32, 64, 128, 256, 512):
        if half < limit:
            group_size = limit // 2
            break

    if out is None:
        out = cl.empty(queue.context, [1], x.format)

    kernel = reduce_kernel.compile(
        queue.context,
        function=self.device_func,
        output=cl.global_memory(out.ctype, flat=True),
        array=cl.global_memory(x.ctype, flat=True),
        shared=shared,
        group_size=cl.cl_uint,
        cly_meta=self.device_func.func_name,
    )

    # Never ask for more work items than the kernel reports for the device.
    max_wgsize = kernel.work_group_size(queue.device)
    group_size = min(max_wgsize, group_size)

    kernel(queue, out, out.array_info, x, x.array_info,
           shared, shared.local_info, group_size)

    array = CLArray._view_as_this(out)
    array.__array_init__(context, queue)
    return array
def reduce(self, context, x, out=None, initial=0.0, queue=None):
    """Reduce ``x`` with ``self.device_func`` using the reduce kernel.

    :param context: passed to ``__array_init__`` of the result; also
        supplies the fallback ``queue``.
    :param x: input; copied to the device with ``cl.from_host`` unless it
        is a ``cl.DeviceMemoryView``.
    :param out: optional one-element output buffer; allocated with ``x``'s
        format when omitted.
    :param initial: accepted for interface compatibility; not read by
        this implementation.
    :param queue: queue to run on; defaults to ``x.queue`` and, when ``x``
        has no queue (e.g. a host array), to ``context.queue``.
    :return: ``out`` viewed as a ``CLArray``.
    """
    if queue is None:
        # Host arrays are accepted below but carry no queue, so reading
        # x.queue unconditionally raised AttributeError for them.
        queue = x.queue if hasattr(x, 'queue') else context.queue

    if not isinstance(x, cl.DeviceMemoryView):
        x = cl.from_host(queue.context, x)

    size = x.size
    shared = cl.local_memory(x.ctype, ndim=1, shape=[size])

    # Largest power of two not above size // 2, at least 1, at most 512.
    half = size // 2
    group_size = 512
    for limit in (2, 4, 8, 16, 32, 64, 128, 256, 512):
        if half < limit:
            group_size = limit // 2
            break

    if out is None:
        out = cl.empty(queue.context, [1], x.format)

    kernel = reduce_kernel.compile(
        queue.context,
        function=self.device_func,
        output=cl.global_memory(out.ctype, flat=True),
        array=cl.global_memory(x.ctype, flat=True),
        shared=shared,
        group_size=cl.cl_uint,
        cly_meta=self.device_func.func_name,
    )

    # Never ask for more work items than the kernel reports for the device.
    max_wgsize = kernel.work_group_size(queue.device)
    group_size = min(max_wgsize, group_size)

    kernel(queue, out, out.array_info, x, x.array_info,
           shared, shared.local_info, group_size)

    array = CLArray._view_as_this(out)
    array.__array_init__(context, queue)
    return array
def main():
    """Add two random float vectors on the device and print the error.

    Two host arrays of 10 random floats are copied to the device, an
    ``add`` kernel writes their element-wise sum into a third buffer, and
    the sum of the differences against the host-side result is printed.
    """
    size = 10
    a = np.random.rand(size).astype('f')
    b = np.random.rand(size).astype('f')

    ctx = cl.Context()
    queue = cl.Queue(ctx)

    cla = cl.from_host(ctx, a, copy=True)
    clb = cl.from_host(ctx, b, copy=True)
    clc = cl.empty(ctx, [size], ctype='f')

    # Adjacent literals: the source text handed to OpenCL is unchanged.
    kernel_source = (
        " __kernel void add(__global const float *a, __global const float *b, "
        "__global float *c) { int gid = get_global_id(0); "
        "c[gid] = a[gid] + b[gid]; } "
    )
    prg = cl.Program(ctx, kernel_source).build()

    add = prg.add
    add.argtypes = (cl.global_memory('f'), cl.global_memory('f'),
                    cl.global_memory('f'))
    add.argnames = ('a', 'b', 'c')
    # The work size is taken from the shape of the argument named ``a``.
    add.global_work_size = lambda a: a.shape

    add(queue, a=cla, b=clb, c=clc)

    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # bare print statement used before is a SyntaxError on Python 3.
    with clc.map(queue) as view:
        label = "view is a python memoryview object"
        print("%s %s" % (label, view))
        arr = np.asarray(view)
        print("Answer should be zero:")
        print((arr - (a + b)).sum())
def typeof(ctx, obj):
    """Return the kernel argument type descriptor to use for ``obj``.

    The checks run in order and the first match wins:

    * ``cl.MemoryObject`` -> ``cl.global_memory`` built from the object's
      ``ctype`` and ``shape``.
    * ``cl.local_memory`` instances, and objects for which ``isfunction``
      is true -> returned unchanged.
    * ``int`` -> ``ctypes.c_int``; ``float`` -> ``ctypes.c_float``.
    * ``ctypes.Structure`` instances -> ``cl.constant_memory`` of the
      structure's class.
    * anything ``memoryview`` accepts -> ``cl.global_memory`` built from
      the view's ``format`` and ``shape``.
    * everything else -> ``type(obj)``.
    """
    if isinstance(obj, cl.MemoryObject):
        return cl.global_memory(obj.ctype, ndim=len(obj.shape),
                                shape=obj.shape, context=ctx)
    if isinstance(obj, cl.local_memory) or isfunction(obj):
        return obj
    if isinstance(obj, int):
        return ctypes.c_int
    if isinstance(obj, float):
        return ctypes.c_float
    if isinstance(obj, ctypes.Structure):
        return cl.constant_memory(type(obj), 0, (), context=ctx)

    try:
        view = memoryview(obj)
        return cl.global_memory(view.format, ndim=len(view.shape),
                                shape=view.shape, context=ctx)
    except TypeError:
        # Any TypeError raised in the block above (by memoryview or by
        # cl.global_memory) falls back to the plain Python type.
        return type(obj)
def main():
    """Print the generated ``setslice`` source and a 16-element buffer.

    A GPU context, a 16-element ``"l"`` buffer and a queue are created; the
    source produced by ``setslice.compile(..., source_only=True)`` is
    printed, followed by the mapped contents of the buffer.
    """
    ctx = cl.Context(device_type=cl.Device.GPU)
    ret = cl.empty(ctx, [16], "l")
    queue = cl.Queue(ctx)

    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # bare print statement used before is a SyntaxError on Python 3.
    print(setslice.compile(ctx, a=cl.global_memory("l"), value=c_int,
                           source_only=True))

    # Disabled in the original; kept for reference.
    # NOTE(review): with these off nothing visible here writes to ``ret``
    # before it is printed -- confirm that is intended.
    # print(setslice(queue, ret[::2], c_int(6)))
    # print(setslice(queue, ret[1::2], c_int(5)))

    with ret.map(queue) as foo:
        print(np.asarray(foo))
def main():
    """Add two random float vectors on the device and print the error.

    Two host arrays of 10 random floats are copied to the device, an
    ``add`` kernel writes their element-wise sum into a third buffer, and
    the sum of the differences against the host-side result is printed.
    """
    size = 10
    a = np.random.rand(size).astype('f')
    b = np.random.rand(size).astype('f')

    ctx = cl.Context()
    queue = cl.Queue(ctx)

    cla = cl.from_host(ctx, a, copy=True)
    clb = cl.from_host(ctx, b, copy=True)
    clc = cl.empty(ctx, [size], ctype='f')

    # Adjacent literals: the source text handed to OpenCL is unchanged.
    kernel_source = (
        " __kernel void add(__global const float *a, __global const float *b, "
        "__global float *c) { int gid = get_global_id(0); "
        "c[gid] = a[gid] + b[gid]; } "
    )
    prg = cl.Program(ctx, kernel_source).build()

    add = prg.add
    add.argtypes = (cl.global_memory('f'), cl.global_memory('f'),
                    cl.global_memory('f'))
    add.argnames = ('a', 'b', 'c')
    # The work size is taken from the shape of the argument named ``a``.
    add.global_work_size = lambda a: a.shape

    add(queue, a=cla, b=clb, c=clc)

    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # bare print statement used before is a SyntaxError on Python 3.
    with clc.map(queue) as view:
        label = "view is a python memoryview object"
        print("%s %s" % (label, view))
        arr = np.asarray(view)
        print("Answer should be zero:")
        print((arr - (a + b)).sum())
def main():
    """Print the generated ``setslice`` source and a 16-element buffer.

    A GPU context, a 16-element ``'l'`` buffer and a queue are created; the
    source produced by ``setslice.compile(..., source_only=True)`` is
    printed, followed by the mapped contents of the buffer.
    """
    ctx = cl.Context(device_type=cl.Device.GPU)
    ret = cl.empty(ctx, [16], 'l')
    queue = cl.Queue(ctx)

    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # bare print statement used before is a SyntaxError on Python 3.
    print(setslice.compile(ctx, a=cl.global_memory('l'), value=c_int,
                           source_only=True))

    # Disabled in the original; kept for reference.
    # NOTE(review): with these off nothing visible here writes to ``ret``
    # before it is printed -- confirm that is intended.
    # print(setslice(queue, ret[::2], c_int(6)))
    # print(setslice(queue, ret[1::2], c_int(5)))

    with ret.map(queue) as foo:
        print(np.asarray(foo))
def test_set_args(self):
    """``set_args`` accepts positional, keyword and defaulted arguments.

    The kernel is run three times -- after positional ``set_args``, after
    keyword ``set_args`` (once ``argnames`` is set) and after relying on
    ``__defaults__`` for ``scale`` -- and the buffer is checked each time.
    Omitting ``scale`` before a default exists must raise ``TypeError``.
    """
    program = Program(ctx, source=source)
    program.build()

    generate_sin = program.kernel('generate_sin')
    generate_sin.argtypes = [global_memory(), ctypes.c_float]

    buf = empty(ctx, [10], ctype=cl.cl_float2)
    queue = Queue(ctx, ctx.devices[0])

    def run_and_verify(expected):
        # Enqueue with the currently bound arguments, then compare the
        # x (column 0) and y (column 1) components with the reference.
        queue.enqueue_nd_range_kernel(generate_sin, 1,
                                      global_work_size=[buf.size])
        with buf.map(queue) as host:
            x_matches = np.all(expected['x'] == np.asarray(host)[:, 0])
            self.assertTrue(x_matches)
            y_matches = np.allclose(expected['y'], np.asarray(host)[:, 1])
            self.assertTrue(y_matches)

    # Positional arguments.  The original enqueues the kernel before the
    # reference values are built, so that order is kept here.
    generate_sin.set_args(buf, 1.0)
    queue.enqueue_nd_range_kernel(generate_sin, 1,
                                  global_work_size=[buf.size])

    expected = np.zeros([10], dtype=[('x', np.float32), ('y', np.float32)])
    expected['x'] = np.arange(10)
    expected['y'] = np.sin(expected['x'] / 10)

    with buf.map(queue) as host:
        self.assertTrue(np.all(expected['x'] == np.asarray(host)[:, 0]))
        self.assertTrue(np.allclose(expected['y'], np.asarray(host)[:, 1]))

    # Keyword arguments become available once argnames is set.
    generate_sin.argnames = ['a', 'scale']
    generate_sin.set_args(a=buf, scale=1.0)
    run_and_verify(expected)

    # ``scale`` has no default yet, so leaving it out is an error.
    with self.assertRaises(TypeError):
        generate_sin.set_args(a=buf)

    generate_sin.__defaults__ = [1.0]
    generate_sin.set_args(a=buf)
    run_and_verify(expected)
def initialize():
    """Set up the kernel, the shared coordinate buffer and GL state.

    Builds ``generate_sin`` from ``generate_sin_source`` the first time it
    is needed, allocates ``coords_dev`` with ``cl.gl.empty_gl`` for
    ``n_vertices`` ``float2`` elements, runs the kernel on it while the
    buffer is acquired through ``cl.gl.acquire``, and enables the GL
    vertex array client state.
    """
    global generate_sin, coords_dev, n_vertices

    gl_ctx = cl.gl.context()

    if generate_sin is None:
        # One-time kernel build; later calls reuse the module-level kernel.
        generate_sin = cl.Program(gl_ctx, generate_sin_source).build().generate_sin
        generate_sin.argnames = ('a',)
        generate_sin.argtypes = (cl.global_memory(cl.cl_float2),)
        generate_sin.global_work_size = lambda a: a.shape

    coords_dev = cl.gl.empty_gl(gl_ctx, [n_vertices], ctype=cl.cl_float2)

    glClearColor(1, 1, 1, 1)
    glColor(0, 0, 1)

    gl_queue = cl.Queue(gl_ctx)

    # The kernel runs inside the acquire block on the GL-allocated buffer.
    with cl.gl.acquire(gl_queue, coords_dev):
        generate_sin(gl_queue, coords_dev)

    glEnableClientState(GL_VERTEX_ARRAY)
def initialize():
    """Set up the kernel, the shared coordinate buffer and GL state.

    Builds ``generate_sin`` from ``generate_sin_source`` the first time it
    is needed, allocates ``coords_dev`` with ``cl.gl.empty_gl`` for
    ``n_vertices`` ``float2`` elements, runs the kernel on it while the
    buffer is acquired through ``cl.gl.acquire``, and enables the GL
    vertex array client state.
    """
    global generate_sin, coords_dev, n_vertices

    gl_ctx = cl.gl.context()

    if generate_sin is None:
        # One-time kernel build; later calls reuse the module-level kernel.
        generate_sin = cl.Program(gl_ctx, generate_sin_source).build().generate_sin
        generate_sin.argnames = ('a',)
        generate_sin.argtypes = (cl.global_memory(cl.cl_float2),)
        generate_sin.global_work_size = lambda a: a.shape

    coords_dev = cl.gl.empty_gl(gl_ctx, [n_vertices], ctype=cl.cl_float2)

    glClearColor(1, 1, 1, 1)
    glColor(0, 0, 1)

    gl_queue = cl.Queue(gl_ctx)

    # The kernel runs inside the acquire block on the GL-allocated buffer.
    with cl.gl.acquire(gl_queue, coords_dev):
        generate_sin(gl_queue, coords_dev)

    glEnableClientState(GL_VERTEX_ARRAY)
import clyther as cly
import opencl as cl
import clyther.runtime as clrt
import numpy as np


# Kernel written in Python: every work item stores x + y * 100 at a[x, y].
# The global work size is taken from the shape of ``a``.
@cly.global_work_size(lambda a: a.shape)
@cly.kernel
def foo(a):
    x = clrt.get_global_id(0)
    y = clrt.get_global_id(1)
    a[x, y] = x + y * 100


ctx = cl.Context(device_type=cl.Device.CPU)
queue = cl.Queue(ctx)

a = cl.empty(ctx, [4, 4], 'f')

foo(queue, a)

# Single-argument print(...) behaves the same on Python 2 and 3; the bare
# print statement used before is a SyntaxError on Python 3.
# Show the source generated for the kernel.
print(foo._compile(ctx, a=cl.global_memory('f'), source_only=True))

# Map the buffer to the host and show what the kernel wrote.
with a.map(queue) as view:
    print(np.asarray(view))
import clyther as cly
import opencl as cl
import clyther.runtime as clrt
import numpy as np


# Kernel written in Python: every work item stores x + y * 100 at a[x, y].
# The global work size is taken from the shape of ``a``.
@cly.global_work_size(lambda a: a.shape)
@cly.kernel
def foo(a):
    x = clrt.get_global_id(0)
    y = clrt.get_global_id(1)
    a[x, y] = x + y * 100


ctx = cl.Context(device_type=cl.Device.CPU)
queue = cl.Queue(ctx)

a = cl.empty(ctx, [4, 4], 'f')

foo(queue, a)

# Single-argument print(...) behaves the same on Python 2 and 3; the bare
# print statement used before is a SyntaxError on Python 3.
# Show the source generated for the kernel.
print(foo._compile(ctx, a=cl.global_memory('f'), source_only=True))

# Map the buffer to the host and show what the kernel wrote.
with a.map(queue) as view:
    print(np.asarray(view))
# A context always has to be created first.
ctx = cl.Context()


# Kernel written in Python: fills a buffer of float2 elements with the
# points of a sine wave.  The global work size is the element count of ``a``.
# (Documented with comments only; no docstring is placed in the body that
# clyther translates.)
@cly.global_work_size(lambda a: [a.size])
@cly.kernel
def generate_sin(a):
    gid = clrt.get_global_id(0)
    n = clrt.get_global_size(0)
    # r is this work item's relative position, gid / n
    r = cl.cl_float(gid) / cl.cl_float(n)
    # sin wave with 8 peaks
    y = r * cl.cl_float(16.0 * 3.1415)
    # x is a range from -1 to 1
    a[gid].x = r * 2.0 - 1.0
    # y is sin wave
    a[gid].y = clrt.native_sin(y)


#===============================================================================
# Compile to openCL code
#===============================================================================
# Single-argument print(...) behaves the same on Python 2 and 3; the bare
# print statement used before is a SyntaxError on Python 3.
print(generate_sin.compile(ctx, a=cl.global_memory(cl.cl_float2),
                           source_only=True))
# Single-argument print(...) behaves the same on Python 2 and 3; the bare
# print statement used before is a SyntaxError on Python 3.
with a.map(queue) as view:
    print(np.asarray(view))

#===============================================================================
# From here I can keep boiling down until I get to the bare openCL C framework
#===============================================================================

#===============================================================================
# Plotting
#===============================================================================
from maka import roo

ctx = roo.start()
queue = cl.Queue(ctx)

a = cl.gl.empty_gl(ctx, [200], cly.float2)

# Wait for the kernel to finish before the buffer is handed to the plot.
event = generate_sin(queue, a)
event.wait()

roo.plot(a)
roo.show()

#===============================================================================
# Compile to openCL code
#===============================================================================
# NOTE(review): the buffer above is float2 but this compiles for 'f' --
# confirm which element type generate_sin expects.
print(generate_sin.compile(ctx, a=cl.global_memory('f'), source_only=True))