def get_elwise_program(context, arguments, operation,
        name="elwise_kernel", options=None,
        preamble="", loop_prep="", after_loop=""):
    """Generate and build an OpenCL program containing one element-wise kernel.

    The kernel source is assembled from a fixed template: every work item
    starts at ``get_local_size(0)*get_group_id(0) + get_local_id(0)`` and
    advances by ``get_global_size(0)`` while ``i < n``, executing
    *operation* on each iteration.  The template itself does not declare
    ``n``, so it has to be supplied through *arguments* (or *preamble*).

    :param context: passed as first argument to ``pyopencl.Program``.
    :param arguments: iterable of objects exposing ``declarator()``; the
        results are joined with ``", "`` to form the kernel parameter list.
    :param operation: OpenCL C statement(s) placed inside the loop body.
    :param name: name of the generated ``__kernel`` function.
    :param options: build options handed to ``Program.build``; ``None``
        (the default) means an empty list.
    :param preamble: OpenCL C text emitted before the kernel definition.
    :param loop_prep: OpenCL C text emitted right before the loop.
    :param after_loop: OpenCL C text emitted right after the loop.
    :returns: whatever ``Program(context, source).build(options)`` returns.
    """
    from pyopencl import Program

    # A ``None`` sentinel avoids sharing one mutable default list between
    # calls; ``build`` still receives a list exactly as before.
    if options is None:
        options = []

    source = ("""
        %(preamble)s

        __kernel void %(name)s(%(arguments)s)
        {
          unsigned lid = get_local_id(0);
          unsigned gsize = get_global_size(0);
          unsigned work_item_start = get_local_size(0)*get_group_id(0);
          unsigned i;

          %(loop_prep)s;

          for (i = work_item_start + lid; i < n; i += gsize)
          {
            %(operation)s;
          }

          %(after_loop)s;
        }
        """ % {
            "arguments": ", ".join(arg.declarator() for arg in arguments),
            "operation": operation,
            "name": name,
            "preamble": preamble,
            "loop_prep": loop_prep,
            "after_loop": after_loop,
            })

    return Program(context, source).build(options)
def main():
    """Run the ``medianFilter`` OpenCL kernel on a noisy image and save the result.

    Builds ``median_filter.cl``, loads ``../data/noisyImage.jpg``, runs the
    kernel over the whole image, logs the elapsed time in milliseconds and
    writes the output to ``../data/medianFilter-OpenCL.jpg``.  When no
    device is available the error is logged and the function returns
    without doing any work.
    """
    devices = get_devices()
    try:
        debug(CL_DEVICE_TYPE(devices[0].type))
    except IndexError as ie:
        # Empty device list: nothing can be executed, so stop here instead
        # of carrying on and trying to create a context without devices.
        exception(ie)
        return

    context = Context(devices)
    # Queue on which the kernel launch below is enqueued.
    queue = CommandQueue(context)

    # Compile the kernel source into a program.
    source = read_kernel_source("median_filter.cl")
    program = Program(context, source).build()

    # Read in the image as float32 and display the noisy input.
    image = imread('../data/noisyImage.jpg', flatten=True).astype(float32)
    imshow(image)

    # perf_counter() values are in seconds; converted to ms when logged.
    start_time = perf_counter()
    args = allocate_variables(context, image)

    # Call the kernel by its explicit name.  The local size is None, so the
    # block/grid distribution is taken care of automatically.
    program.medianFilter(queue, image.shape, None, *args)

    # Copy the result back from the buffer.
    # NOTE(review): args[1] is presumably the output buffer -- confirm
    # against allocate_variables.
    result = copy_from_buffer(queue, args[1], image.shape, image.dtype)
    debug("%g milliseconds" % (1e3 * (perf_counter() - start_time)))

    # Show the filtered image, then store it on disk.
    imshow(result)
    imsave('../data/medianFilter-OpenCL.jpg', result)
def init_grid_methods(self):
    """Compile the grid OpenCL sources and bind their kernels to ``self``.

    Runs ``init_generic_methods`` and ``set_global_working_group_size``
    first, then builds one program from ``self.block_def_str`` followed by
    ``grid_generic.cl`` and ``grid_deposit_m<M>.cl`` (``<M>`` being
    ``self.Args['M']``), both located under ``src_path``.  Each kernel is
    stored in a ``_*_knl`` attribute.  Finally ``self.Args['vec_comps']``
    falls back to ``self.Args['default_vec_comps']`` when it is not set.
    """
    self.init_generic_methods()
    self.set_global_working_group_size()

    grid_sources = []

    # Context managers close the source files again; the bare ``open``
    # calls used previously left the handles to the garbage collector.
    with open(src_path+"grid_generic.cl") as src_file:
        grid_sources.append(src_file.read())

    with open(src_path+"grid_deposit_m"+str(self.Args['M'])+".cl") as src_file:
        grid_sources.append(src_file.read())

    # The block definitions must precede the kernel code that uses them.
    grid_sources = self.block_def_str + ''.join(grid_sources)

    prg = Program(self.ctx, grid_sources).build(options=compiler_options)

    self._divide_by_dv_d_knl = prg.divide_by_dv_d
    self._divide_by_dv_c_knl = prg.divide_by_dv_c
    self._treat_axis_d_knl = prg.treat_axis_d
    self._treat_axis_c_knl = prg.treat_axis_c
    self._warp_axis_m0_d_knl = prg.warp_axis_m0_d
    self._warp_axis_m1plus_c_knl = prg.warp_axis_m1plus_c
    self._depose_scalar_knl = prg.depose_scalar
    self._depose_vector_knl = prg.depose_vector
    self._gather_and_push_knl = prg.gather_and_push

    if 'vec_comps' not in self.Args:
        self.Args['vec_comps'] = self.Args['default_vec_comps']
def __init__(self):
    """Set up the view window, the output image and the Mandelbrot program.

    The view is 800 x 800 pixels with a step of ``3. / 800.`` per pixel in
    both directions and its origin at ``(-2, 1.5)``.  A write-only RGBA /
    unsigned 8-bit image of that size is created, ``mandelbrot.cl`` is
    built, and everything is bundled in ``self.params``.
    """
    side = 800
    self.h = side
    self.w = side

    pixel_step = 3. / 800.
    self.dx = pixel_step
    self.dy = pixel_step

    self.x0 = -2
    self.y0 = 1.5

    context = create_some_context()

    # Output image the kernel writes into; its shape is (width, height).
    image_format = ImageFormat(CHO.RGBA, CHANNEL.UNSIGNED_INT8)
    self.buf = Image(context, MEM.WRITE_ONLY, image_format,
                     shape=(self.w, self.h))

    program = Program(context, load_cl_text("mandelbrot.cl")).build()

    self.params = (self.w, self.h, context, self.buf, program)
def init_solver_methods(self):
    """Compile the solver OpenCL source and bind its kernels to ``self``.

    Builds one program from ``self.block_def_str`` followed by the content
    of ``solver_ms_pic.cl`` (located under ``src_path``) and stores the
    kernels in ``_*_knl`` attributes.  When ``'DampCells'`` is present in
    ``self.Args`` the field damping is initialised as well.
    """
    # A context manager closes the source file again; the bare ``open``
    # call used previously left the handle to the garbage collector.
    with open(src_path+"solver_ms_pic.cl") as src_file:
        solver_sources = src_file.read()

    # The block definitions must precede the kernel code that uses them.
    solver_sources = self.block_def_str + solver_sources

    prg = Program(self.ctx, solver_sources).build(options=compiler_options)

    self._advance_e_g_m_knl = prg.advance_e_g_m
    self._profile_edges_c_knl = prg.profile_edges_c
    self._profile_edges_d_knl = prg.profile_edges_d

    if 'DampCells' in self.Args:
        self._init_field_damping()
def init_generic_methods(self):
    """Compile the generic OpenCL source and bind its kernels to ``self``.

    Calls ``set_global_working_group_size`` first, then builds one program
    from ``self.block_def_str`` followed by the content of ``generic.cl``
    (located under ``src_path``) and stores the kernels in ``_*_knl``
    attributes.
    """
    self.set_global_working_group_size()

    # A context manager closes the source file again; the bare ``open``
    # call used previously left the handle to the garbage collector.
    with open(src_path+"generic.cl") as src_file:
        generic_sources = src_file.read()

    # The block definitions must precede the kernel code that uses them.
    generic_sources = self.block_def_str + generic_sources

    prg = Program(self.ctx, generic_sources).build(options=compiler_options)

    self._cast_array_d2c_knl = prg.cast_array_d2c
    self._axpbyz_c2c_knl = prg.axpbyz_c2c
    self._zpaxz_c2c_knl = prg.zpaxz_c2c
    self._ab_dot_x_knl = prg.ab_dot_x
    self._append_c2c_knl = prg.append_c2c
    self._set_cdouble_to_knl = prg.set_cdouble_to
    # The attribute name drops the ``_d2c`` suffix of the kernel it holds.
    self._mult_elementwise_knl = prg.mult_elementwise_d2c
def load_program(self):
    """Build ``raytraced/Raytracer.cl`` and keep its ``raytrace`` kernel.

    The program is compiled with ``-cl-mad-enable -cl-fast-relaxed-math
    -Werror`` and an include path pointing at the directory of this module.
    The kernel ends up in ``self.kernel`` with its fourth argument declared
    as ``numpy.int32``.  ``cltypes.Vertex`` is then replaced by the dtype
    returned from ``match_dtype_to_c_struct`` for the context's first
    device.
    """
    # Slurp the kernel file; the context manager closes it afterwards.
    with open("raytraced/Raytracer.cl", "r") as cl_file:
        kernel_text = cl_file.read()

    cl_program = Program(self.context, kernel_text)

    # Compiler flags, including this module's directory as include path.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    build_flags = "-cl-mad-enable -cl-fast-relaxed-math -Werror -I %s" % module_dir

    cl_program.build(options=build_flags)

    raytrace_kernel = cl_program.raytrace
    self.kernel = raytrace_kernel
    # Only the fourth argument gets an explicit scalar dtype.
    raytrace_kernel.set_scalar_arg_dtypes([None, None, None, numpy.int32])

    # Match the host dtype to the OpenCL struct layout.
    # May not work everywhere.
    first_device = self.context.devices[0]
    cltypes.Vertex, c_decl = OpenCL.tools.match_dtype_to_c_struct(
        first_device, 'Vertex', cltypes.Vertex)
def make_add_kernel(self):
    """Build and return the ``add_rand`` kernel.

    The kernel writes ``out[gid] = a[gid] + scaled``, where ``scaled`` is a
    per-work-item pseudo-random value (derived from ``gid`` and ``seed``)
    mapped linearly from the range 0..1 onto -0.005..0.005.  The element
    type of ``a`` is taken from ``dtype_str`` by substituting the
    ``${dtype}`` placeholder.
    """
    kernel_template = """
    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    __kernel void add_rand(__global const ${dtype}* a, __global float* out, const float seed)
    {
        // modify out such that out = a+ random
        const int gid = get_global_id(0);
        float ptr = 0.0f;
        float randval = fract(sin(gid*112.9898f + seed*237.212f) * 43758.5453f, &ptr);
        const float min = 0.;
        const float max = 1.;
        const float scaledMax = 0.005;
        const float scaledMin = -0.005;
        const float scaled = (scaledMax-scaledMin)*(randval-min)/(max-min)+scaledMin;
        out[gid] = a[gid] + scaled;
    }
    """
    # Fill in the element type of the input array.
    kernel_source = kernel_template.replace('${dtype}', dtype_str)

    built_program = Program(self.ctx, kernel_source).build()
    return built_program.add_rand
def __init__(self):
    """Prepare the image buffers and the program for interactive rotation.

    Starts at angle 0 with a table of per-key angle increments (up/down
    change the angle by pi/18, left/right by pi/180).  The image returned
    by ``lena()`` is placed in channel 0 of a four-channel uint8 array and
    turned into an image buffer via ``image_from_array``, a write-only
    RGBA output image is allocated, ``rotation.cl`` is built, and all of
    it is bundled in ``self.params``.
    """
    self.angle = 0.

    coarse_step = pi / 18.
    fine_step = pi / 180.
    self.ch_angles = {
        "Key_UP": coarse_step,
        "Key_Down": -coarse_step,
        "Key_Right": -fine_step,
        "Key_Left": fine_step,
    }

    context = create_some_context()
    source_img = lena()
    h, w = (int32(extent) for extent in source_img.shape[:2])

    # in pyopencl 2018.2.2 channel orders other than RGBA
    # cause segmentation fault
    rgba_pixels = zeros((h, w, 4), dtype=uint8)
    rgba_pixels[:, :, 0] = source_img
    self.in_img_buf = image_from_array(context, rgba_pixels, 4)

    # Output image; its shape is (width, height).
    out_format = ImageFormat(CHO.RGBA, CHANNEL.UNSIGNED_INT8)
    self.out_img_buf = Image(context, MEM.WRITE_ONLY, out_format, shape=(w, h))

    program = Program(context, load_cl_text("rotation.cl")).build()

    self.params = (context, self.in_img_buf, self.out_img_buf, h, w, program)
def init_particle_methods(self):
    """Compile the particle OpenCL source and bind its kernels to ``self``.

    Runs ``init_generic_methods`` and ``set_global_working_group_size``,
    creates a ``ThreefryGenerator`` for ``self.ctx``, then builds one
    program from ``self.block_def_str`` followed by the content of
    ``particles_generic.cl`` (located under ``src_path``).  The kernels are
    stored in ``_*_knl`` attributes.
    """
    self.init_generic_methods()
    self.set_global_working_group_size()

    self._generator_knl = ThreefryGenerator(context=self.ctx)

    # A context manager closes the source file again; the bare ``open``
    # call used previously left the handle to the garbage collector.
    with open(src_path + "particles_generic.cl") as src_file:
        particles_sources = src_file.read()

    # The block definitions must precede the kernel code that uses them.
    particles_sources = self.block_def_str + particles_sources

    prg = Program(self.ctx, particles_sources).build(options=compiler_options)

    self._data_align_dbl_knl = prg.data_align_dbl
    self._data_align_int_knl = prg.data_align_int
    # The attribute name is shorter than the kernel name it refers to.
    self._index_and_sum_knl = prg.index_and_sum_in_cell
    self._sort_knl = prg.sort
    self._push_xyz_knl = prg.push_xyz
    self._fill_grid_knl = prg.fill_grid
    self._profile_by_interpolant_knl = prg.profile_by_interpolant
def __init__(self, ctx, queue: CommandQueue, units,
             weight_initializer: Initializer = GlorotUniformInitializer,
             bias_initializer: Initializer = ZeroInitializer,
             activation='linear', batch_size=64):
    """Store the layer configuration and build its OpenCL program.

    The program source is the concatenation (each part followed by a
    newline) of the activation file for *activation*, the file named by
    ``self.layer_fname`` and ``gradient.cl``.  It is rendered through
    ``Template`` with the activation/derivative function names and the
    dtype string before being built.  If the build raises
    ``cl.cffi_cl.RuntimeError`` the error is printed to stderr and the
    process exits with status 1.

    ``self.layer_fname`` and ``dtype_str`` are read here but not set by
    this method; they have to be provided elsewhere.

    :param ctx: OpenCL context used to build the program.
    :param queue: command queue stored on the instance.
    :param units: stored as ``self.units``.
    :param weight_initializer: stored as ``self.weight_initializer``.
    :param bias_initializer: stored as ``self.bias_initializer``.
    :param activation: activation name; selects the ``.cl`` file and the
        ``activation_*`` / ``derivative_*`` names given to the template.
    :param batch_size: stored as ``self.batch_size``.
    """
    self.units = units
    self.weight_initializer = weight_initializer
    self.bias_initializer = bias_initializer
    self.activation = activation
    self.ctx = ctx
    self.queue = queue
    self.is_training = True
    self.batch_size = batch_size
    self.dtype_str = dtype_str

    source_files = [
        f'../nncl/cl/activations/{self.activation}.cl',
        self.layer_fname,
        '../nncl/cl/layers/gradient.cl',
    ]

    # Collect the pieces first and join once instead of growing the string
    # with ``+=``; every file is still followed by a newline.
    parts = []
    for fname in source_files:
        with open(fname, 'r') as infile:
            parts.append(infile.read() + "\n")
    self.src = "".join(parts)

    self.src = Template(self.src).render(
        activation='activation_' + self.activation,
        derivative='derivative_' + self.activation,
        dtype=self.dtype_str)

    try:
        self.prog = Program(self.ctx, self.src).build()
    except cl.cffi_cl.RuntimeError as e:
        print(e, file=sys.stderr, flush=True)
        # sys.exit rather than the ``exit`` helper, which only exists when
        # the ``site`` module has been loaded.
        sys.exit(1)

    self.forward_krnl = self.prog.layer_forward
    self._grad_krnl = self.prog.get_gradients
except (IndexError, ValueError): angle = pi / 4 ctx = create_some_context() filter_buf = Buffer(ctx, MEM.READ_ONLY | MEM.COPY_HOST_PTR, hostbuf=GAUSSIAN_BLUR) in_img = lena() h, w = map(int32, in_img.shape[:2]) # in pyopencl 2018.2.2 channel orders other than RGBA cause segmentation fault i4 = zeros((h, w, 4), dtype=uint8) i4[:, :, 0] = in_img in_img_buf = image_from_array(ctx, i4, 4) fmt = ImageFormat(CHO.RGBA, CHANNEL.UNSIGNED_INT8) out_img_buf = Image(ctx, MEM.WRITE_ONLY, fmt, shape=(w, h)) pt = perf_counter() prg = Program(ctx, load_cl_text("convolution.cl")).build() TIMES["Compilation"] = perf_counter() - pt pt = perf_counter() with CommandQueue(ctx) as queue: prg.convolution(queue, (w, h), None, in_img_buf, out_img_buf, filter_buf, int32(5), Sampler(ctx, False, ADDRESS.CLAMP_TO_EDGE, FILTER.NEAREST)) TIMES["Execution"] = perf_counter() - pt pt = perf_counter() dest = zeros(i4.shape, dtype=uint8) enqueue_copy(queue, dest, out_img_buf, origin=(0, 0), region=(w, h)) TIMES["Copying"] = perf_counter() - pt in_img_buf.release() out_img_buf.release() filter_buf.release() print("\n".join("%s:\t%g" % i for i in TIMES.items()))
find_set(self.x0, self.y0, self.dx, self.dy, *self.params)[:, :, :3], "extent": extent } def __del__(self): self.buf.release() if __name__ == "__main__": from sys import argv try: xm, ym, xw, w, h = map(float, argv[1:]) h = int32(h) w = int32(w) dx = dy = xw / w x0 = xm - xw / 2. y0 = ym + xw * (h / w) / 2. except (IndexError, ValueError): h = w = int32(800) x0, y0 = -2., 1.5 dx = dy = 3. / 800 ctx = create_some_context() fmt = ImageFormat(CHO.RGBA, CHANNEL.UNSIGNED_INT8) buf = Image(ctx, MEM.WRITE_ONLY, fmt, shape=(w, h)) prg = Program(ctx, load_cl_text("mandelbrot.cl")).build() res = find_set(x0, y0, dx, dy, w, h, ctx, buf, prg) buf.release() show_img(res[:, :, :3], True)
try: n, m, p = map(int, argv[1:]) except ValueError: n, m, p = 3, 4, 5 a = random.randint(2, size=(n * m)).astype(float32) b = random.randint(2, size=(m * p)).astype(float32) c = zeros((n * p), dtype=float32) TIMES = {} ctx = create_some_context() a_buf = Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a) b_buf = Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b) c_buf = Buffer(ctx, mf.WRITE_ONLY, c.nbytes) pt = perf_counter() prg = Program(ctx, load_cl_text("multiply_matr.cl")).build() TIMES["Compilation"] = perf_counter() - pt pt = perf_counter() with CommandQueue(ctx) as queue: prg.multiply(queue, c.shape, None, uint16(n), uint16(m), uint16(p), a_buf, b_buf, c_buf) TIMES["Execution"] = perf_counter() - pt pt = perf_counter() enqueue_copy(queue, c, c_buf) TIMES["Copying"] = perf_counter() - pt a_buf.release() b_buf.release() c_buf.release() print("matrix A:") print(a.reshape(n, m)) print("matrix B:")
self.angle += self.ch_angles.get(key, 0.) return {"img": rotate(self.angle, *self.params)[:, :, 0]} def __del__(self): print("delete rotate interactor") self.in_img_buf.release() self.out_img_buf.release() if __name__ == "__main__": from sys import argv try: angle = -float(argv[1]) / 180. * pi except (IndexError, ValueError): angle = pi / 4 ctx = create_some_context() in_img = lena() h, w = map(int32, in_img.shape[:2]) # in pyopencl 2018.2.2 channel orders other than RGBA # cause segmentation fault i4 = zeros((h, w, 4), dtype=uint8) i4[:, :, 0] = in_img in_img_buf = image_from_array(ctx, i4, 4) fmt = ImageFormat(CHO.RGBA, CHANNEL.UNSIGNED_INT8) out_img_buf = Image(ctx, MEM.WRITE_ONLY, fmt, shape=(w, h)) prg = Program(ctx, load_cl_text("rotation.cl")).build() res = rotate(angle, ctx, in_img_buf, out_img_buf, h, w, prg) in_img_buf.release() out_img_buf.release() show_img(res[:, :, 0])
#! /usr/bin/env python3

# Calculation of π using quadrature. Using PyOpenCL.
#
# Copyright © 2012, 2014 Russel Winder

from time import time

from pyopencl import create_some_context, CommandQueue, Program, Buffer, mem_flags, enqueue_copy

import numpy

from output import out

n = 1000000000
delta = 1.0 / n
startTime = time()

context = create_some_context()
queue = CommandQueue(context)

with open('processSlice_opencl.cl', 'r') as f:
    kernel = Program(context, f.read()).build()

# Quadro FX 570 card on Anglides only supports 32-bit operations, hence float not double.
#
# One result slot per work item: numpy.array(n, ...) would create a 0-d array
# holding the value n, i.e. a 4-byte buffer and an empty global size.
# NOTE(review): n float32 values need about 4 GB on both host and device --
# confirm the target device can allocate that.
results = numpy.empty(n, dtype=numpy.float32)
buffer = Buffer(context, mem_flags.WRITE_ONLY, results.nbytes)

kernel.processSlice(queue, results.shape, None, numpy.int32(n), numpy.float32(delta), buffer)

# enqueue_copy supersedes the deprecated enqueue_read_buffer.
enqueue_copy(queue, results, buffer).wait()

pi = 4.0 * delta * numpy.sum(results)
elapseTime = time() - startTime
out(__file__, pi, n, elapseTime)
def get_elwise_program(context, arguments, operation,
        name="elwise_kernel", options=None,
        preamble="", loop_prep="", after_loop="", use_range=False):
    """Assemble and build an OpenCL program holding one element-wise kernel.

    With *use_range* false every work item runs *operation* for
    ``i = work_group_start + lid`` in steps of ``gsize`` while ``i < n``.
    With *use_range* true the index runs from ``start`` towards ``stop`` in
    multiples of ``step``, with a separate loop for negative steps.  The
    template declares none of ``n``, ``start``, ``stop`` or ``step``, so
    they have to be supplied through *arguments* (or *preamble*).

    A warning is issued when *operation* contains the word ``return``; the
    generated source defines ``PYOPENCL_ELWISE_CONTINUE`` as ``continue``
    as the suggested replacement.

    :param context: passed as first argument to ``pyopencl.Program``.
    :param arguments: iterable of objects exposing ``declarator()``.
    :param operation: OpenCL C statement(s) placed inside the loop body.
    :param name: name of the generated ``__kernel`` function.
    :param options: passed unchanged to ``Program.build``.
    :param preamble: OpenCL C text emitted before the kernel definition.
    :param loop_prep: OpenCL C text emitted right before the loop.
    :param after_loop: OpenCL C text emitted right after the loop.
    :param use_range: selects the start/stop/step loop variant.
    :returns: whatever ``Program(context, source).build(options)`` returns.
    """
    import re
    from warnings import warn

    if not use_range:
        loop_template = """//CL//
          for (i = work_group_start + lid; i < n; i += gsize)
          {
            %(operation)s;
          }
          """
    else:
        loop_template = r"""//CL//
          if (step < 0)
          {
            for (i = start + (work_group_start + lid)*step;
              i > stop; i += gsize*step)
            {
              %(operation)s;
            }
          }
          else
          {
            for (i = start + (work_group_start + lid)*step;
              i < stop; i += gsize*step)
            {
              %(operation)s;
            }
          }
          """

    # Must be raised from this frame so that stacklevel=3 keeps pointing at
    # the same caller.
    if re.search(r"\breturn\b", operation) is not None:
        warn(
            "Using a 'return' statement in an element-wise operation will "
            "likely lead to incorrect results. Use "
            "PYOPENCL_ELWISE_CONTINUE instead.",
            stacklevel=3)

    kernel_template = """//CL//
        {preamble}

        #define PYOPENCL_ELWISE_CONTINUE continue

        __kernel void {name}({arguments})
        {{
          int lid = get_local_id(0);
          int gsize = get_global_size(0);
          int work_group_start = get_local_size(0)*get_group_id(0);
          long i;

          {loop_prep};
          {body}
          {after_loop};
        }}
        """

    declarators = ", ".join(arg.declarator() for arg in arguments)
    loop_code = loop_template % {"operation": operation}

    source = kernel_template.format(
        arguments=declarators,
        name=name,
        preamble=preamble,
        loop_prep=loop_prep,
        after_loop=after_loop,
        body=loop_code,
    )

    from pyopencl import Program
    return Program(context, source).build(options)
enqueue_copy) from numpy import zeros, int32 from misc import create_some_context, load_cl_text, lena import matplotlib.pyplot as plt try: from time import process_time as perf_counter except ImportError: from time import perf_counter TIMES = {} ctx = create_some_context() lenar = lena().astype(int32).flatten() len_buf = Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=lenar) histogram = zeros(256, dtype=int32) h_buf = Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=histogram) pt = perf_counter() prg = Program(ctx, load_cl_text("histogram.cl")).build() TIMES["Compilation"] = perf_counter() - pt pt = perf_counter() with CommandQueue(ctx) as queue: prg.histogram(queue, lenar.shape, None, len_buf, int32(len(lenar)), h_buf) TIMES["Execution"] = perf_counter() - pt pt = perf_counter() enqueue_copy(queue, histogram, h_buf) TIMES["Copying"] = perf_counter() - pt h_buf.release() len_buf.release() print("\n".join("%s:\t%g" % i for i in TIMES.items())) plt.plot(histogram, ",") plt.show()
def get_elwise_program(context, arguments, operation,
        name="elwise_kernel", options=None,
        preamble="", loop_prep="", after_loop="", use_range=False):
    """Generate and build an OpenCL program containing one element-wise kernel.

    With *use_range* false every work item runs *operation* for
    ``i = work_group_start + lid`` in steps of ``gsize`` while ``i < n``.
    With *use_range* true the index runs from ``start`` towards ``stop`` in
    multiples of ``step``, with a separate loop for negative steps.  The
    template declares none of ``n``, ``start``, ``stop`` or ``step``, so
    they have to be supplied through *arguments* (or *preamble*).

    A warning is issued when *operation* contains the word ``return``; the
    generated source defines ``PYOPENCL_ELWISE_CONTINUE`` as ``continue``
    as the suggested replacement.

    :param context: passed as first argument to ``pyopencl.Program``.
    :param arguments: iterable of objects exposing ``declarator()``.
    :param operation: OpenCL C statement(s) placed inside the loop body.
    :param name: name of the generated ``__kernel`` function.
    :param options: build options handed to ``Program.build``; ``None``
        (the default) means an empty list.
    :param preamble: OpenCL C text emitted before the kernel definition.
    :param loop_prep: OpenCL C text emitted right before the loop.
    :param after_loop: OpenCL C text emitted right after the loop.
    :param use_range: selects the start/stop/step loop variant.
    :returns: whatever ``Program(context, source).build(options)`` returns.
    """
    # A ``None`` sentinel avoids sharing one mutable default list between
    # calls; ``build`` still receives a list exactly as before.
    if options is None:
        options = []

    if use_range:
        body = r"""//CL//
          if (step < 0)
          {
            for (i = start + (work_group_start + lid)*step;
              i > stop; i += gsize*step)
            {
              %(operation)s;
            }
          }
          else
          {
            for (i = start + (work_group_start + lid)*step;
              i < stop; i += gsize*step)
            {
              %(operation)s;
            }
          }
          """
    else:
        body = """//CL//
          for (i = work_group_start + lid; i < n; i += gsize)
          {
            %(operation)s;
          }
          """

    import re
    return_match = re.search(r"\breturn\b", operation)
    if return_match is not None:
        from warnings import warn
        # stacklevel=3 attributes the warning two frames above this one.
        warn("Using a 'return' statement in an element-wise operation will "
                "likely lead to incorrect results. Use "
                "PYOPENCL_ELWISE_CONTINUE instead.",
                stacklevel=3)

    source = ("""//CL//
        %(preamble)s

        #define PYOPENCL_ELWISE_CONTINUE continue

        __kernel void %(name)s(%(arguments)s)
        {
          int lid = get_local_id(0);
          int gsize = get_global_size(0);
          int work_group_start = get_local_size(0)*get_group_id(0);
          long i;

          %(loop_prep)s;
          %(body)s
          %(after_loop)s;
        }
        """ % {
            "arguments": ", ".join(arg.declarator() for arg in arguments),
            "name": name,
            "preamble": preamble,
            "loop_prep": loop_prep,
            "after_loop": after_loop,
            "body": body % dict(operation=operation),
            })

    from pyopencl import Program
    return Program(context, source).build(options)
def get_elwise_program(context, arguments, operation,
        name="elwise_kernel", options=None,
        preamble="", loop_prep="", after_loop="", use_range=False):
    """Generate and build an OpenCL program containing one element-wise kernel.

    With *use_range* false every work item runs *operation* for
    ``i = work_item_start + lid`` in steps of ``gsize`` while ``i < n``.
    With *use_range* true the index runs from ``start`` towards ``stop`` in
    multiples of ``step``, with a separate loop for negative steps.  The
    template declares none of ``n``, ``start``, ``stop`` or ``step``, so
    they have to be supplied through *arguments* (or *preamble*).

    :param context: passed as first argument to ``pyopencl.Program``.
    :param arguments: iterable of objects exposing ``declarator()``.
    :param operation: OpenCL C statement(s) placed inside the loop body.
    :param name: name of the generated ``__kernel`` function.
    :param options: build options handed to ``Program.build``; ``None``
        (the default) means an empty list.
    :param preamble: OpenCL C text emitted before the kernel definition.
    :param loop_prep: OpenCL C text emitted right before the loop.
    :param after_loop: OpenCL C text emitted right after the loop.
    :param use_range: selects the start/stop/step loop variant.
    :returns: whatever ``Program(context, source).build(options)`` returns.
    """
    # A ``None`` sentinel avoids sharing one mutable default list between
    # calls; ``build`` still receives a list exactly as before.
    if options is None:
        options = []

    if use_range:
        body = r"""//CL//
          if (step < 0)
          {
            for (i = start + (work_item_start + lid)*step;
              i > stop; i += gsize*step)
            {
              %(operation)s;
            }
          }
          else
          {
            for (i = start + (work_item_start + lid)*step;
              i < stop; i += gsize*step)
            {
              %(operation)s;
            }
          }
          """
    else:
        body = """//CL//
          for (i = work_item_start + lid; i < n; i += gsize)
          {
            %(operation)s;
          }
          """

    source = ("""//CL//
        %(preamble)s

        __kernel void %(name)s(%(arguments)s)
        {
          int lid = get_local_id(0);
          int gsize = get_global_size(0);
          int work_item_start = get_local_size(0)*get_group_id(0);
          long i;

          %(loop_prep)s;
          %(body)s
          %(after_loop)s;
        }
        """ % {
            "arguments": ", ".join(arg.declarator() for arg in arguments),
            "name": name,
            "preamble": preamble,
            "loop_prep": loop_prep,
            "after_loop": after_loop,
            "body": body % dict(operation=operation),
            })

    from pyopencl import Program
    return Program(context, source).build(options)