def __call__(self, input, u, v):
    """Run the kernel on three input arrays and return a new Array.

    Asynchronously copies ``input``, ``u`` and ``v`` (plus a zeroed
    output) to the device, binds them as kernel args 0-3, waits for all
    copies, launches the kernel, and copies the result back.

    :param input: object with a ``.data`` ndarray; output matches its shape.
    :param u: object with a ``.data`` ndarray.
    :param v: object with a ``.data`` ndarray.
    :return: ``Array`` wrapping the computed output ndarray.
    """
    output = zeros_like(input.data)
    events = []
    buffers = []
    # One stanza per kernel argument; the original repeated this copy/
    # setarg sequence four times verbatim.
    for index, host_array in enumerate((input.data, u.data, v.data, output)):
        dev_buf, copy_evt = buffer_from_ndarray(self.queue, host_array,
                                                blocking=False)
        events.append(copy_evt)
        buffers.append(dev_buf)
        self.kernel.setarg(index, dev_buf, sizeof(cl_mem))
    # All host->device transfers must land before the kernel may run.
    clWaitForEvents(*events)
    evt = clEnqueueNDRangeKernel(self.queue, self.kernel, self.global_size)
    evt.wait()
    # Device->host copy of the result; wait so `output` is populated.
    _, evt = buffer_to_ndarray(self.queue, buffers[-1], output)
    evt.wait()
    return Array(unique_name(), output)
def __call__(self, A):
    """Reduce ``A`` on the device, one int32 partial result per work group.

    :param A: 1-D ndarray to reduce.
    :return: int32 ndarray of ceil(len(A)/WORK_GROUP_SIZE) partial results.
    """
    # int() guards against a float from ceil (e.g. numpy.ceil) — np.empty
    # requires an integral size.
    output_array = np.empty(int(ceil(len(A) / WORK_GROUP_SIZE)), np.int32)
    buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
    output_buffer, output_evt = cl.buffer_from_ndarray(self.queue,
                                                       output_array,
                                                       blocking=False)
    # The copies above are asynchronous; wait before the kernel touches
    # the buffers.  (In-order queues would serialize this anyway, but the
    # sibling stencil kernels in this file wait explicitly.)
    evt.wait()
    output_evt.wait()
    self._c_function(self.queue, self.kernel, buf, output_buffer)
    B, evt = cl.buffer_to_ndarray(self.queue, output_buffer,
                                  like=output_array)
    # Ensure the device->host copy has finished before handing B out.
    evt.wait()
    return B
def process_output(self, output, out_like=None):
    """Return the host-side value of ``output``.

    A device buffer (``cl.cl_mem``) is copied back into an ndarray shaped
    like ``out_like``; anything else is treated as a ctypes-style scalar
    and its ``.value`` is returned.
    """
    if not isinstance(output, cl.cl_mem):
        return output.value
    host_array, copy_evt = cl.buffer_to_ndarray(self.queue, output,
                                                like=out_like)
    copy_evt.wait()
    return host_array
def __call__(self, *args):
    """Execute the stencil and return the result array.

    :param args: inputs; each may be an ``hmarray`` (device buffer reused)
        or a plain ndarray (copied to the device here).
    :return: the output ``hmarray`` when inputs carry device buffers,
        otherwise the result copied back into a host ndarray.
    :raises StencilException: if the OpenCL launch reports an error.
    """
    if isinstance(args[0], hmarray):
        output = empty_like(args[0])
    else:
        output = np.zeros_like(args[0])
    buffers = []
    events = []
    for arg in args + (output, ):
        if isinstance(arg, hmarray):
            # hmarray already owns a device buffer; reuse it.
            buffers.append(arg.ocl_buf)
        else:
            buf, evt = buffer_from_ndarray(self.queue, arg, blocking=True)
            events.append(evt)
            buffers.append(buf)
    if events:
        # Guard: the original called clWaitForEvents unconditionally,
        # which is invalid with zero events (all-hmarray inputs).
        cl.clWaitForEvents(*events)
    if isinstance(self.kernel, list):
        # Pass every kernel in the pipeline, however many there are.
        # The original handled only 2-4 kernels and silently did nothing
        # (cl_error stayed 0) for any other length.
        cl_error = self._c_function(self.queue,
                                    *(list(self.kernel) + buffers))
    else:
        cl_error = self._c_function(self.queue, self.kernel, *buffers)
    if cl.cl_errnum(cl_error) != cl.cl_errnum.CL_SUCCESS:
        raise StencilException(
            "Error executing stencil kernel: opencl {} {}".format(
                cl_error, cl.cl_errnum(cl_error)
            )
        )
    if isinstance(output, hmarray):
        return output
    buf, evt = buffer_to_ndarray(
        self.queue, buffers[-1], output
    )
    evt.wait()
    return buf
def gpu_to_device(self, device=None, wait=True, force=False):
    """Copy this array's buffer for ``device`` back into the host array.

    :param device: target device; defaults to ``get_gpu()``.  The original
        used ``device=get_gpu()`` as the default, which evaluates once at
        definition time and pins every call to the device found at import.
    :param wait: block until the copy finishes; otherwise queue the event
        on ``self.__waiting``.
    :param force: copy even when the host copy is not marked dirty.
    """
    if not self.__is_dirty("host") and not force:
        return
    if device is None:
        # Resolve lazily, per call, only when a copy is actually needed.
        device = get_gpu()
    print("GPU to DEVICE")
    _, evt = pycl.buffer_to_ndarray(self.get_queue(device),
                                    self.__buffers[device.value],
                                    out=self)
    if wait:
        evt.wait()
    else:
        self.__waiting.append(evt)
    self.set_dirty('host', False)
def __call__(self, im):
    """Run the kernel over ``im.data`` and wrap the result in an Array."""
    result = zeros_like(im.data)
    # Stage the input on the device and bind both kernel arguments.
    input_buffer, copy_event = buffer_from_ndarray(self.queue, im.data,
                                                   blocking=False)
    copy_event.wait()
    self.kernel.setarg(0, input_buffer, sizeof(cl_mem))
    output_buffer = clCreateBuffer(self.context, result.nbytes)
    self.kernel.setarg(1, output_buffer, sizeof(cl_mem))
    # Launch, then copy the finished output back to the host.
    launch_event = clEnqueueNDRangeKernel(self.queue, self.kernel,
                                          self.global_size)
    launch_event.wait()
    _, copy_event = buffer_to_ndarray(self.queue, output_buffer, result)
    copy_event.wait()
    del input_buffer
    del output_buffer
    return Array(unique_name(), result)
def get():
    """Pull the field's buffer to the host and undo tiling and padding.

    Closure over ``cl_buffer``, ``buffer``, ``field`` and ``self`` from
    the enclosing scope.  Returns a host ndarray view of the field with
    tiling unrolled and spatial padding stripped.
    """
    if cl_buffer is not None:
        # Field lives on the device; copy it into the host-side `buffer`.
        _, evt = cl.buffer_to_ndarray(latte.config.cl_queue, cl_buffer, out=buffer)
        evt.wait()
    if field in self.tiling_info:
        untiled = buffer
        if field in self.private_info:
            # NOTE(review): private fields appear to be replicated per
            # worker, so the first copy is taken — confirm against writer.
            untiled = untiled[0]
        shape = untiled.shape
        tiled_shape = list(shape)
        # ActivationEnsemble value/grad fields skip the reshape/untile
        # pass; everything else is unrolled dimension by dimension.
        if not isinstance(self, ActivationEnsemble) or field not in ["value", "grad"]:
            for dim, factor in self.tiling_info[field]:
                if field in self.batch_fields:
                    # Batch fields carry a leading batch axis; shift dim.
                    dim += 1
                tiled_shape[dim] //= factor
                tiled_shape.append(factor)
            #print(tiled_shape)
            untiled = untiled.reshape(tiled_shape)
            # Undo the tiling in reverse order of application.
            for dim, _ in reversed(self.tiling_info[field]):
                if field in self.batch_fields:
                    dim += 1
                untiled = util.untile(untiled, dim)
        to_return = untiled
    else:
        to_return = buffer
    if "grad_" in field and "grad_inputs" not in field:
        # NOTE(review): looks like per-worker gradient copies; worker 0's
        # slice is returned — confirm.
        to_return = to_return[0]
    # Strip ensemble padding from value/grad fields, axis by axis; a pad
    # of (0, 0) on an axis leaves that axis untouched.
    if field in ["value", "grad"] and any(p != (0, 0) for p in self.pad):
        _slice = [slice(None)]
        for p in self.pad:
            if p != (0, 0):
                _slice.append(slice(p[0], -p[1]))
            else:
                _slice.append(slice(None))
        to_return = to_return[tuple(_slice)]
    # Same stripping pass for filter padding.
    if field in ["value", "grad"] and any(p != (0, 0) for p in self.filter_pad):
        _slice = [slice(None)]
        for p in self.filter_pad:
            if p != (0, 0):
                _slice.append(slice(p[0], -p[1]))
            else:
                _slice.append(slice(None))
        to_return = to_return[tuple(_slice)]
    return to_return
def __call__(self, *args):
    """Run the stencil kernel over ``args`` with a local-memory workspace.

    :param *args: input ndarrays; an output shaped like ``args[0]`` is
        appended (a preset ``self.output`` is consumed once if present).
    :return: the output ndarray copied back from the device.
    """
    if self.output is not None:
        # A caller pre-supplied the output array; consume it exactly once.
        output = self.output
        self.output = None
    else:
        output = np.zeros_like(args[0])
    # One cl_mem argtype per array, plus the trailing local-memory slot.
    self.kernel.argtypes = tuple(cl_mem for _ in args + (output, )) + (localmem, )
    bufs = []
    events = []
    for index, arg in enumerate(args + (output, )):
        buf, evt = buffer_from_ndarray(self.queue, arg, blocking=False)
        # evt.wait()
        events.append(evt)
        bufs.append(buf)
        self.kernel.setarg(index, buf, sizeof(cl_mem))
    # Ensure every async host->device copy has landed before launching.
    cl.clWaitForEvents(*events)
    # Work-group edge length: 8 on GPUs, 1 otherwise (CPU-like devices).
    if self.device.type == cl.cl_device_type.CL_DEVICE_TYPE_GPU:
        local = 8
    else:
        local = 1
    # Local scratch covers the work group plus the stencil halo
    # (ghost_depth cells per side) in every dimension, in c_float units.
    localmem_size = reduce(operator.mul,
                           (local + (self.ghost_depth * 2)
                            for _ in range(args[0].ndim)),
                           sizeof(c_float))
    # Local-memory slot sits after the len(args)+1 buffer arguments; the
    # output buffer occupies index len(args).
    self.kernel.setarg(
        len(args) + 1, localmem(localmem_size), localmem_size)
    evt = clEnqueueNDRangeKernel(self.queue, self.kernel, self.global_size,
                                 tuple(local for _ in range(args[0].ndim)))
    evt.wait()
    buf, evt = buffer_to_ndarray(self.queue, bufs[-1], output)
    evt.wait()
    # NOTE(review): this only unbinds the loop variable each iteration;
    # it does not release the device buffers in `bufs` — confirm whether
    # buffer release is handled by the cl wrapper's GC.
    for mem in bufs:
        del mem
    return buf
def __call__(self, im, num_powers, border):
    """Compute ``num_powers`` powers of ``im`` on the device.

    Returns an ``Array`` of shape ``(num_powers,) + im.shape``, float32.
    ``border`` is accepted for interface compatibility; this method does
    not read it.
    """
    result = np.empty([num_powers] + list(im.shape), dtype=np.float32)
    # Stage the input image and bind it to the first kernel.
    in_buf, copy_evt = buffer_from_ndarray(self.queue, im.data,
                                           blocking=False)
    copy_evt.wait()
    self.kernel.setarg(0, in_buf, sizeof(cl_mem))
    out_buf = clCreateBuffer(self.queue.context, result.nbytes)
    self.kernel.setarg(1, out_buf, sizeof(cl_mem))
    run_evt = clEnqueueNDRangeKernel(self.queue, self.kernel,
                                     self.global_size)
    run_evt.wait()
    # The second kernel runs once per power index, writing into out_buf.
    self.kernel2.setarg(0, out_buf, sizeof(cl_mem))
    for power in range(num_powers):
        self.kernel2.setarg(1, power, sizeof(cl_int))
        run_evt = clEnqueueNDRangeKernel(self.queue, self.kernel2,
                                         self.global_size)
        run_evt.wait()
    _, copy_evt = buffer_to_ndarray(self.queue, out_buf, result)
    copy_evt.wait()
    return Array(unique_name(), result)
def __call__(self, A):
    """Run the device reduction over ``A``, print the kernel time, and
    return the scalar result (``A.dtype``)."""
    t_start = time.time()
    # Host->device staging of the input and the 1-element result slot.
    result_host = np.empty(1, A.dtype)
    in_buf, in_evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
    out_buf, out_evt = cl.buffer_from_ndarray(self.queue, result_host,
                                              blocking=False)
    t_copied = time.time()
    # Launch the reduction kernel.
    self._c_function(self.queue, self.kernel, in_buf, out_buf)
    t_reduced = time.time()
    # Device->host copy of the single-element result.
    result, _ = cl.buffer_to_ndarray(self.queue, out_buf, like=result_host)
    t_done = time.time()
    # Kernel-only time excludes both copies: t_reduced - t_copied.
    print("True SEJITS Time (excluding copy time): {0} seconds".format(t_reduced - t_copied))
    return result[0]
def __call__(self, A):
    """Run the in-place kernel on ``A``'s device copy and return the result.

    :param A: ndarray; copied to the device, processed, copied back.
    :return: new ndarray shaped like ``A`` with the kernel applied.
    """
    buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
    # The copy is asynchronous: wait before the kernel reads the buffer.
    # (An in-order queue serializes this anyway, but the other kernel
    # wrappers in this file wait explicitly before launching.)
    evt.wait()
    self._c_function(self.queue, self.kernel, buf)
    B, evt = cl.buffer_to_ndarray(self.queue, buf, like=A)
    # Ensure the device->host copy has completed before returning B.
    evt.wait()
    return B
def process_output(self, out_buf, output):
    """Copy ``out_buf`` back into ``output.data`` and return ``output``."""
    _, copy_event = buffer_to_ndarray(self.queue, out_buf, output.data)
    copy_event.wait()
    return output
def sync_host(self):
    """Refresh this host array from its device buffer (OpenCL backend only)."""
    if backend not in {"ocl", "opencl", "OCL"}:
        return
    if os.environ.get("HM_BACKEND") in {"omp", "openmp"}:
        # OpenMP override: data already lives on the host.
        return
    _, copy_evt = cl.buffer_to_ndarray(queue, self.ocl_buf, self)
    copy_evt.wait()
def sync_host(self):
    """Copy the device buffer back into this host array when on OpenCL."""
    if backend in {"ocl", "opencl", "OCL"}:
        # The OpenMP backend keeps data host-side; nothing to sync.
        if os.environ.get("HM_BACKEND") in {'omp', 'openmp'}:
            return
        _, transfer = cl.buffer_to_ndarray(queue, self.ocl_buf, self)
        transfer.wait()