import numpy as np
import numpy.linalg as la
import pyopencl as cl


def test_enqueue_task(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    prg = cl.Program(ctx, """
        __kernel void reverse(__global const float *in, __global float *out, int n)
        {
            for (int i = 0; i < n; i++) {
                out[i] = in[n - 1 - i];
            }
        }
        """).build()
    knl = prg.reverse

    n = 100
    a = np.random.rand(n).astype(np.float32)
    b = np.empty_like(a)

    buf1 = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    buf2 = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

    knl.set_args(buf1, buf2, np.int32(n))
    cl.enqueue_task(queue, knl)
    cl.enqueue_copy(queue, b, buf2).wait()

    assert la.norm(a[::-1] - b) == 0
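# A minimal sketch of a portable alternative (enqueue_task_compat is a
# hypothetical helper, not part of PyOpenCL): the OpenCL spec defines
# clEnqueueTask as a one-work-item NDRange launch, and OpenCL 2.0 deprecates
# clEnqueueTask, so the enqueue_task(queue, knl) call above can be replaced
# with the launch below.
def enqueue_task_compat(queue, knl, wait_for=None):
    # Launch exactly one work-item in a single work-group.
    return cl.enqueue_nd_range_kernel(queue, knl, (1,), (1,),
                                      wait_for=wait_for)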
def fpga_function(self, benchmark=False):
    # Forward pass: run each layer's kernel as a single task, applying ReLU
    # on every layer except the last, which gets a separate softmax kernel.
    for layer in range(0, self.layers):
        rows_in = self.layer_height[layer]
        rows_out = self.layer_height[layer + 1]
        is_last_layer = layer + 1 == self.layers
        print("Running layer {} ({}->{}) kernel, with relu: {}".format(
            layer, rows_in, rows_out, 1 if not is_last_layer else 0))
        self.kForward.set_args(self.act_buffers[layer],
                               self.weights_buffers[layer],
                               self.bias_buffers[layer],
                               self.act_buffers[layer + 1],
                               self.minibatch_size, rows_in, rows_out,
                               1 if not is_last_layer else 0)
        cl.enqueue_task(self.queue, self.kForward).wait()
        if is_last_layer:
            print("Running layer {} softmax kernel".format(layer))
            self.kForwardSoftMax.set_args(self.act_buffers[layer + 1],
                                          self.minibatch_size, rows_out)
            cl.enqueue_task(self.queue, self.kForwardSoftMax).wait()
def bw_function(self, benchmark=False):
    # Backward pass: walk the layers from last to first, computing the output
    # delta on the last layer and back-propagating through the rest.
    for layer in range(self.layers, -1, -1):
        rows_in = self.layer_height[layer]
        is_last_layer = layer == self.layers
        if is_last_layer:
            #print("Running layer {} first_delta kernel".format(layer))
            self.kBackwardFirstDelta.set_args(self.act_buffers[layer],
                                              self.ground_truth_buffer,
                                              self.delta_buffers[layer],
                                              self.minibatch_size,
                                              self.layer_height[layer])
            cl.enqueue_task(self.queue, self.kBackwardFirstDelta).wait()
            #print("Running layer {} first_delta kernel. COMPLETED".format(layer))
        else:
            rows_out = self.layer_height[layer + 1]
            #print("Running layer {} backwards kernel".format(layer))
            self.kBackward.set_args(
                self.act_buffers[layer], self.weights_buffers[layer],
                self.dW_buffers[layer], self.bias_buffers[layer],
                self.delta_buffers[layer], self.delta_buffers[layer + 1],
                self.learn_rate, self.minibatch_size, rows_in, rows_out,
                layer)
            cl.enqueue_task(self.queue, self.kBackward).wait()
import numpy as np
import pyopencl as cl


def main():
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    buf = cl.Buffer(ctx, mf.READ_ONLY, 1000)
    buf2 = cl.Buffer(ctx, mf.READ_WRITE, 8)

    # Kernel that writes the device-side address of `in` into `out`.
    prg = cl.Program(ctx, """
        __kernel void get_addr(__global const int *in, __global long *out)
        {
            *out = (long)in;
        }
        """).build()
    knl = prg.get_addr
    knl.set_args(buf, buf2)
    cl.enqueue_task(queue, knl)

    b = np.empty([1], dtype=np.int64)
    cl.enqueue_copy(queue, b, buf2).wait()
    print(b[0])
# `dev` and `kernel_src` are assumed to be defined earlier in the script.
context = cl.Context(devices=[dev])
queue = cl.CommandQueue(context, dev)

# Build program in the specified context using the kernel source code
prog = cl.Program(context, kernel_src)
try:
    prog.build(options=['-Werror'], devices=[dev])
except:
    print('Build log:')
    print(prog.get_build_info(dev, cl.program_build_info.LOG))
    raise

# Data and device buffers
data = np.arange(start=0, stop=NUM_SHORTS, dtype=np.uint16)
np.random.shuffle(data)
print('Input: ' + str(data))

mf = cl.mem_flags
data_buffer = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR,
                        hostbuf=data)

# Execute kernel: radix_sort8(__global ushort8 *global_data)
kernel = prog.radix_sort8
kernel.set_arg(0, data_buffer)
cl.enqueue_task(queue, kernel)
cl.enqueue_copy(queue, dest=data, src=data_buffer, is_blocking=True)
print('Output: ' + str(data))
def f(t, y_in, y_out, wait_for=None):
    knl.set_args(y_in.base_data, y_out.base_data)
    # Guard against the default: wait_for=None would fail on list addition.
    return cl.enqueue_task(queue, knl,
                           wait_for=(wait_for or []) + [start_evt])
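# A hedged usage sketch for f above: queue, knl, and start_evt come from the
# enclosing scope in the original code, and y0/y1 are hypothetical
# pyopencl.array.Array operands allocated by the caller. Chaining each call's
# returned event into the next wait_for serializes the steps correctly even
# on an out-of-order queue.
evt = f(0.0, y0, y1, wait_for=[])
evt = f(0.5, y1, y0, wait_for=[evt])
evt.wait()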
def _post_kernel_reduction_task(self, nelems, reduction_operator):
    assert reduction_operator in [INC, MIN, MAX]

    def generate_code():
        def headers():
            if self.dtype == np.dtype('float64'):
                return """
#if defined(cl_khr_fp64)
#if defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif
"""
            else:
                return ""

        op = {INC: 'INC', MIN: 'min', MAX: 'max'}[reduction_operator]
        return """
%(headers)s
#define INC(a,b) ((a)+(b))
__kernel
void global_%(type)s_%(dim)s_post_reduction (
  __global %(type)s* dat,
  __global %(type)s* tmp,
  __private int count
)
{
  __private %(type)s accumulator[%(dim)d];
  for (int j = 0; j < %(dim)d; ++j)
  {
    accumulator[j] = dat[j];
  }
  for (int i = 0; i < count; ++i)
  {
    for (int j = 0; j < %(dim)d; ++j)
    {
      accumulator[j] = %(op)s(accumulator[j], *(tmp + i * %(dim)d + j));
    }
  }
  for (int j = 0; j < %(dim)d; ++j)
  {
    dat[j] = accumulator[j];
  }
}
""" % {'headers': headers(), 'dim': self.cdim,
       'type': self._cl_type, 'op': op}

    # Compile-once cache keyed by (dtype, dimension, operator).
    src, kernel = _reduction_task_cache.get(
        (self.dtype, self.cdim, reduction_operator), (None, None))
    if src is None:
        src = generate_code()
        prg = cl.Program(_ctx, src).build(options="-Werror")
        name = "global_%s_%s_post_reduction" % (self._cl_type, self.cdim)
        kernel = getattr(prg, name)
        _reduction_task_cache[
            (self.dtype, self.cdim, reduction_operator)] = (src, kernel)

    kernel.set_arg(0, self._array.data)
    kernel.set_arg(1, self._d_reduc_array.data)
    kernel.set_arg(2, np.int32(nelems))

    cl.enqueue_task(_queue, kernel).wait()

    self._array.get(queue=_queue, ary=self._data)
    self.state = DeviceDataMixin.BOTH
    del self._d_reduc_array
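# For illustration only (a sketch instantiated by hand, not emitted by the
# original code): assuming dtype float32 maps to _cl_type "float", with
# cdim 1 and INC, generate_code() above produces roughly this OpenCL source.
_example_float_inc_src = """
#define INC(a,b) ((a)+(b))
__kernel
void global_float_1_post_reduction (
  __global float* dat,
  __global float* tmp,
  __private int count
)
{
  __private float accumulator[1];
  for (int j = 0; j < 1; ++j)
    accumulator[j] = dat[j];
  for (int i = 0; i < count; ++i)
    for (int j = 0; j < 1; ++j)
      accumulator[j] = INC(accumulator[j], *(tmp + i * 1 + j));
  for (int j = 0; j < 1; ++j)
    dat[j] = accumulator[j];
}
"""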
import numpy as np
import pyopencl as cl
import utility

# `kernel_src` and `NUM_SHORTS` are assumed to be defined earlier in the file.

# Get device and context, create command queue and program
dev = utility.get_default_device()
context = cl.Context(devices=[dev], properties=None, dev_type=None)
queue = cl.CommandQueue(context, dev, properties=None)

# Build program in the specified context using the kernel source code
prog = cl.Program(context, kernel_src)
try:
    prog.build(options=["-Werror"], devices=[dev], cache_dir=None)
except:
    print("Build log:")
    print(prog.get_build_info(dev, cl.program_build_info.LOG))
    raise

# Data and device buffers
data = np.arange(start=0, stop=NUM_SHORTS, dtype=np.uint16)
np.random.shuffle(data)
print("Input: " + str(data))

mf = cl.mem_flags
data_buffer = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR,
                        hostbuf=data)

# Execute kernel: radix_sort8(__global ushort8 *global_data)
kernel = prog.radix_sort8
kernel.set_arg(0, data_buffer)
cl.enqueue_task(queue, kernel)
cl.enqueue_copy(queue, dest=data, src=data_buffer, is_blocking=True)
print("Output: " + str(data))