def process_args(self, *args):
    """Marshal positional arguments into OpenCL kernel arguments.

    ndarrays are uploaded (non-blocking) to device buffers; ints and
    floats are passed through as scalars.  The inferred output handle
    (a buffer shaped like the last ndarray, or a ctypes scalar) is
    appended as the final kernel argument.  If ``self.output`` is set,
    it overrides the inferred output.

    Returns (kernel_args, output, out_like) where out_like is the host
    array the output is shaped after (or None for scalar outputs).
    """
    kernel_args = []
    copy_events = []
    output = ct.c_int()        # default: scalar int result
    out_like = None
    for value in args:
        if isinstance(value, np.ndarray):
            device_buf, copy_evt = cl.buffer_from_ndarray(
                self.queue, value, blocking=False)
            kernel_args.append(device_buf)
            copy_events.append(copy_evt)
            # output mirrors the most recent ndarray argument
            output = device_buf.empty_like_this()
            out_like = value
        elif isinstance(value, int):
            kernel_args.append(value)
        elif isinstance(value, float) and isinstance(output, ct.c_int):
            # first float argument promotes a scalar result to float
            kernel_args.append(value)
            output = ct.c_float()
        else:
            raise NotImplementedError(
                "UnsupportedType: %s" % type(value)
            )
    if self.output is not None:
        # caller supplied an explicit output array; upload and use it
        output, copy_evt = cl.buffer_from_ndarray(
            self.queue, self.output, blocking=False)
        out_like = self.output
        copy_evt.wait()
    if isinstance(output, cl.cl_mem):
        kernel_args.append(output)
    else:
        kernel_args.append(output.byref)
    cl.clWaitForEvents(*copy_events)
    return kernel_args, output, out_like
def __call__(self, input, u, v):
    """Upload input/u/v, run the kernel, and return the result Array."""
    output = zeros_like(input.data)
    pending = []
    device_bufs = []
    # Bind kernel args 0..3 to the three inputs plus the output buffer.
    for slot, host_array in enumerate((input.data, u.data, v.data, output)):
        dev_buf, copy_evt = buffer_from_ndarray(
            self.queue, host_array, blocking=False)
        device_bufs.append(dev_buf)   # keep buffers alive until we finish
        pending.append(copy_evt)
        self.kernel.setarg(slot, dev_buf, sizeof(cl_mem))
    clWaitForEvents(*pending)
    run_evt = clEnqueueNDRangeKernel(self.queue, self.kernel, self.global_size)
    run_evt.wait()
    # Non-blocking read-back into the host output array.
    _, read_evt = buffer_to_ndarray(self.queue, device_bufs[-1], output)
    read_evt.wait()
    return Array(unique_name(), output)
def __call__(self, A):
    """Run the per-work-group reduction kernel over A.

    Returns an int32 ndarray with one partial result per work group.
    """
    output_array = np.empty(ceil(len(A) / WORK_GROUP_SIZE), np.int32)
    buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
    output_buffer, output_evt = cl.buffer_from_ndarray(self.queue,
                                                       output_array,
                                                       blocking=False)
    self._c_function(self.queue, self.kernel, buf, output_buffer)
    B, evt = cl.buffer_to_ndarray(self.queue, output_buffer,
                                  like=output_array)
    # FIX: buffer_to_ndarray enqueues a non-blocking read; without this
    # wait the returned array may not yet contain the device results.
    # (Every other read-back in this file waits on its event.)
    evt.wait()
    return B
def __call__(self, *args):
    """Execute the stencil over *args.

    hmarray inputs reuse their existing device buffers; plain ndarrays
    are uploaded.  When the output is an hmarray it is returned without
    a host read-back (the data stays resident on the device); otherwise
    the result is copied back into a host ndarray.

    :param *args: stencil input arrays (hmarray or ndarray)
    :raises StencilException: if the OpenCL launch reports an error
    """
    if isinstance(args[0], hmarray):
        output = empty_like(args[0])
    else:
        output = np.zeros_like(args[0])
    buffers = []
    events = []
    for arg in args + (output, ):
        if isinstance(arg, hmarray):
            buffers.append(arg.ocl_buf)
        else:
            buf, evt = buffer_from_ndarray(self.queue, arg, blocking=True)
            events.append(evt)
            buffers.append(buf)
    cl.clWaitForEvents(*events)
    # FIX: the old dispatch only handled kernel lists of length 2, 3 or
    # 4 -- any other length fell through silently, never launching the
    # kernels while still reporting CL_SUCCESS.  Pass every kernel
    # through uniformly instead.
    if isinstance(self.kernel, list):
        call_args = [self.queue] + list(self.kernel) + buffers
    else:
        call_args = [self.queue, self.kernel] + buffers
    cl_error = self._c_function(*call_args)
    if cl.cl_errnum(cl_error) != cl.cl_errnum.CL_SUCCESS:
        raise StencilException(
            "Error executing stencil kernel: opencl {} {}".format(
                cl_error, cl.cl_errnum(cl_error)
            )
        )
    if isinstance(output, hmarray):
        return output
    buf, evt = buffer_to_ndarray(
        self.queue, buffers[-1], output
    )
    evt.wait()
    return buf
def __array_finalize__(self, obj):
    """numpy hook: give freshly created views/copies a device mirror.

    For OpenCL backends the new array's contents are pushed to a fresh
    device buffer and the dirty flags reset.
    """
    if obj is None:
        return
    if backend in {"ocl", "opencl", "OCL"}:
        device_buf, copy_evt = cl.buffer_from_ndarray(queue, obj)
        copy_evt.wait()
        self.ocl_buf = device_buf
        # host and device copies start out in sync
        self.host_dirty = False
        self.ocl_dirty = False
        self.register = None
def process_inputs(self, *args):
    """Upload every input array to the device and bind it as a kernel arg.

    Array-typed wrappers are unwrapped to their .data ndarray first.
    Returns the list of device buffers in argument order.
    """
    pending = []
    device_bufs = []
    self.kernel.argtypes = tuple(cl_mem for _ in args)
    for slot, arg in enumerate(args):
        if isinstance(arg, types.common.Array):
            arg = arg.data
        dev_buf, copy_evt = buffer_from_ndarray(self.queue, arg,
                                                blocking=False)
        device_bufs.append(dev_buf)
        pending.append(copy_evt)
        self.kernel.setarg(slot, dev_buf, sizeof(cl_mem))
    clWaitForEvents(*pending)
    return device_bufs
def set(value):
    # Write `value` into the enclosing `buffer` for the enclosing `field`,
    # applying this ensemble's padding and tiling layout, then mirror the
    # result to the OpenCL buffer if one exists.
    # NOTE(review): closure over `buffer`, `field`, `cl_buffer`, `self`
    # from the enclosing scope -- not visible in this chunk.
    dest = buffer
    if field in ["value", "grad"] and any(p != (0, 0) for p in self.pad):
        # Build a slice that skips the halo/padding region in each padded
        # dimension (leading slice(None) presumably covers the batch dim
        # -- TODO confirm against callers).
        _slice = [slice(None)]
        for i, p in enumerate(self.pad):
            if p != (0, 0):
                _slice.append(slice(p[0], -p[1]))
            else:
                _slice.append(slice(None))
        # dest = dest[tuple(_slice)]
    else:
        _slice = [slice(None) for _ in dest.shape]
    if field in self.tiling_info:
        tiled = value
        # Handle padding of weights and bias if set using set_value
        if field in ["weights", "bias"]:
            padding = list()
            # generate padding tuple
            for dim in tiled.shape:
                padding += ((0,0),)
            # Round each tiled dimension up to a multiple of its factor.
            for dim, factor in self.tiling_info[field]:
                if tiled.shape[dim] % factor != 0:
                    padding[dim] = (0, factor - tiled.shape[dim] % factor)
            tiled = np.lib.pad(tiled, padding, 'constant')
        for dim, _ in self.tiling_info[field]:
            if field in self.batch_fields:
                # batch fields carry a leading batch dimension
                dim += 1
            if not isinstance(self, ActivationEnsemble):
                # tiling appends a new trailing axis; mirror that in the
                # destination slice so indices still line up
                _slice.append(_slice[dim])
                _slice[dim] = slice(None)
            tiled = util.tile(tiled, dim)
        tiled_shape = list(dest.shape)
        if not isinstance(self, ActivationEnsemble) or field not in ["value", "grad"]:
            # Split each tiled dimension into (dim // factor, factor),
            # appending the factor as a trailing axis of the view.
            for dim, factor in self.tiling_info[field]:
                if field in self.batch_fields:
                    dim += 1
                if tiled_shape[dim] < factor:
                    factor = tiled_shape[dim]
                elif tiled_shape[dim] % factor != 0:
                    raise NotImplementedError()
                tiled_shape[dim] //= factor
                tiled_shape.append(factor)
        dest = dest.reshape(tiled_shape)
        dest[_slice] = tiled
    else:
        dest[_slice] = value
    if cl_buffer is not None:
        # Push the updated host buffer into the existing device buffer.
        _, evt = cl.buffer_from_ndarray(latte.config.cl_queue, dest,
                                        buf=cl_buffer)
        evt.wait()
def __call__(self, A):
    """Run the timed single-value reduction kernel over A.

    Returns the scalar result (B[0]); prints the kernel execution time
    excluding the host<->device copies.
    """
    a = time.time()
    # Initialization and copy from CPU to GPU
    output_array = np.empty(1, A.dtype)
    buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
    output_buffer, output_evt = cl.buffer_from_ndarray(self.queue,
                                                       output_array,
                                                       blocking=False)
    b = time.time()
    # Actual execution of the reduction.
    self._c_function(self.queue, self.kernel, buf, output_buffer)
    c = time.time()
    # Copying the result back from the GPU to the CPU
    B, evt = cl.buffer_to_ndarray(self.queue, output_buffer,
                                  like=output_array)
    # FIX: the read-back is non-blocking; wait before using B so the
    # returned scalar is actually the device result.
    evt.wait()
    d = time.time()
    # The true time of execution, exluding copy time is between b and c.
    print ("True SEJITS Time (excluding copy time): {0} seconds".format(c - b))
    # print("overall execution:", d-a, "Initial Copy:", b-a, "Kernel execution:", c-b, "Final Copy:", d-c)
    return B[0]
def __call__(self, im):
    """Apply the kernel to im.data and return the result as a new Array."""
    result = zeros_like(im.data)
    src_buf, copy_evt = buffer_from_ndarray(self.queue, im.data,
                                            blocking=False)
    copy_evt.wait()
    self.kernel.setarg(0, src_buf, sizeof(cl_mem))
    # Output buffer is allocated raw; the kernel fills it entirely.
    dst_buf = clCreateBuffer(self.context, result.nbytes)
    self.kernel.setarg(1, dst_buf, sizeof(cl_mem))
    run_evt = clEnqueueNDRangeKernel(self.queue, self.kernel,
                                     self.global_size)
    run_evt.wait()
    _, read_evt = buffer_to_ndarray(self.queue, dst_buf, result)
    read_evt.wait()
    # Drop buffer references promptly so the device memory can be freed.
    del src_buf
    del dst_buf
    return Array(unique_name(), result)
def __call__(self, *args):
    """Execute the stencil kernel with a local-memory scratch argument.

    Uploads *args plus an output array as kernel args 0..len(args),
    binds local memory as the final argument, launches with a per-device
    work-group size, and reads the result back to the host.

    :param *args: input ndarrays; the output is shaped like args[0]
    """
    if self.output is not None:
        host_out = self.output
        self.output = None        # consume the pre-supplied output once
    else:
        host_out = np.zeros_like(args[0])
    self.kernel.argtypes = tuple(
        cl_mem for _ in args + (host_out, )) + (localmem, )
    device_bufs = []
    pending = []
    for slot, host_arr in enumerate(args + (host_out, )):
        dev_buf, copy_evt = buffer_from_ndarray(self.queue, host_arr,
                                                blocking=False)
        pending.append(copy_evt)
        device_bufs.append(dev_buf)
        self.kernel.setarg(slot, dev_buf, sizeof(cl_mem))
    cl.clWaitForEvents(*pending)
    # GPUs get 8x8(x8...) work groups; CPUs run one work item per group.
    local = 8 if self.device.type == cl.cl_device_type.CL_DEVICE_TYPE_GPU \
        else 1
    ndim = args[0].ndim
    # Scratch tile per group: (local + 2*ghost)^ndim floats.
    edge = local + (self.ghost_depth * 2)
    localmem_size = reduce(operator.mul,
                           (edge for _ in range(ndim)),
                           sizeof(c_float))
    self.kernel.setarg(
        len(args) + 1, localmem(localmem_size), localmem_size)
    run_evt = clEnqueueNDRangeKernel(
        self.queue, self.kernel, self.global_size,
        tuple(local for _ in range(ndim)))
    run_evt.wait()
    result, read_evt = buffer_to_ndarray(self.queue, device_bufs[-1],
                                         host_out)
    read_evt.wait()
    for dev_buf in device_bufs:
        del dev_buf
    return result
def __call__(self, im, num_powers, border):
    """Run the two-kernel power pipeline over im.

    kernel fills the output once; kernel2 is then launched once per
    power index.  Returns a float32 Array of shape
    (num_powers,) + im.shape.  (`border` is accepted for interface
    compatibility but not used here.)
    """
    result = np.empty([num_powers] + list(im.shape), dtype=np.float32)
    src_buf, copy_evt = buffer_from_ndarray(self.queue, im.data,
                                            blocking=False)
    copy_evt.wait()
    self.kernel.setarg(0, src_buf, sizeof(cl_mem))
    dst_buf = clCreateBuffer(self.queue.context, result.nbytes)
    self.kernel.setarg(1, dst_buf, sizeof(cl_mem))
    run_evt = clEnqueueNDRangeKernel(self.queue, self.kernel,
                                     self.global_size)
    run_evt.wait()
    self.kernel2.setarg(0, dst_buf, sizeof(cl_mem))
    for power in range(num_powers):
        self.kernel2.setarg(1, power, sizeof(cl_int))
        step_evt = clEnqueueNDRangeKernel(self.queue, self.kernel2,
                                          self.global_size)
        step_evt.wait()
    _, read_evt = buffer_to_ndarray(self.queue, dst_buf, result)
    read_evt.wait()
    return Array(unique_name(), result)
def __call__(self, A):
    """Run the in-place kernel over a device copy of A and return it.

    A itself is not modified; the result comes back in a new ndarray
    shaped like A.
    """
    buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
    self._c_function(self.queue, self.kernel, buf)
    B, evt = cl.buffer_to_ndarray(self.queue, buf, like=A)
    # FIX: the read-back above is non-blocking; wait on its event before
    # handing B to the caller (as every other read-back in this file does).
    evt.wait()
    return B
def sync_ocl(self):
    """Push the host array's contents into its existing OpenCL buffer.

    No-op on non-OpenCL backends.
    """
    if backend not in {"ocl", "opencl", "OCL"}:
        return
    _, copy_evt = cl.buffer_from_ndarray(queue, self, self.ocl_buf)
    copy_evt.wait()
def __allocate_buffer(self, device=None):
    """Allocate/refresh this array's device buffer on *device*.

    Returns the copy event for the (possibly asynchronous) upload.

    FIX: the default used to be ``device=get_gpu()``, which Python
    evaluates ONCE at definition time -- so every default call reused
    whatever device was current at import, and get_gpu() even ran on
    import when no GPU might be available.  Resolve the default per call
    instead; callers that pass a device explicitly are unaffected.
    """
    if device is None:
        device = get_gpu()
    queue = self.get_queue(device)
    self.__buffers[device.value], evt = pycl.buffer_from_ndarray(
        queue, self, self.get_buffer(device))
    return evt
def __call__(self, *args):
    """Drive the OpenCL SVM training C function.

    args layout: (?, cacheSizeBytes, points, labels, *extra) -- args[0]
    is unused here, args[1] is the kernel-cache size in bytes, args[2]
    is the (nPoints, dFeatures) float32 training matrix and args[3] the
    int32 label vector (TODO confirm the leading slot against callers).

    Returns (rho, nSV, iterations, supportVectors, signedAlpha).
    """
    cacheSize = args[1]
    # // keeps this an int under Python 3 as well (ctypes sizeof is int)
    cacheSizeInFloats = int(cacheSize // sizeof(c_float))
    args = args[2:]
    nPoints, dFeatures = args[0].shape

    def _num_groups(local_size):
        # ceil-divide nPoints by a kernel's work-group size
        if nPoints % local_size == 0:
            return nPoints // local_size
        return nPoints // local_size + 1

    localInitSize = self.init.work_group_size(self.device)
    numGroups_init = _num_groups(localInitSize)
    globalInitSize = numGroups_init * localInitSize
    localFoph1Size = self.foph1.work_group_size(self.device)
    numGroups_foph1 = _num_groups(localFoph1Size)
    globalFoph1Size = numGroups_foph1 * localFoph1Size
    localFoph2Size = self.foph2.work_group_size(self.device)
    globalFoph2Size = localFoph2Size
    localSoph1Size = self.soph1.work_group_size(self.device)
    numGroups_soph1 = _num_groups(localSoph1Size)
    globalSoph1Size = numGroups_soph1 * localSoph1Size
    localSoph2Size = self.soph2.work_group_size(self.device)
    globalSoph2Size = localSoph2Size
    localSoph3Size = self.soph3.work_group_size(self.device)
    # FIX: this computation used "+ 3" where every sibling uses "+ 1"
    # (an apparent copy-paste typo) -- that over-allocated the soph3
    # reduction arrays and launched two extra work-groups.
    numGroups_soph3 = _num_groups(localSoph3Size)
    globalSoph3Size = numGroups_soph3 * localSoph3Size
    localSoph4Size = self.soph4.work_group_size(self.device)
    globalSoph4Size = localSoph4Size

    # create buffers from input
    d_input_data, evt = cl.buffer_from_ndarray(self.queue, args[0],
                                               blocking=False)
    d_input_data_colmajor, evt = cl.buffer_from_ndarray(self.queue,
                                                        args[0].T,
                                                        blocking=False)
    d_labels, evt = cl.buffer_from_ndarray(self.queue, args[1],
                                           blocking=False)
    args = (args[0].ctypes.data_as(POINTER(c_float)),
            args[1].ctypes.data_as(POINTER(c_int))) + args[2:]

    # temporary numpy arrays backing the scratch/reduction buffers
    iArray = np.zeros(nPoints, dtype=np.float32)
    reduceIntsFO = np.zeros(numGroups_foph1, dtype=np.int32)
    reduceFloatsFO = np.zeros(numGroups_foph1, dtype=np.float32)
    reduceIntsSO1 = np.zeros(numGroups_soph1, dtype=np.int32)
    reduceFloatsSO1 = np.zeros(numGroups_soph1, dtype=np.float32)
    reduceIntsSO3 = np.zeros(numGroups_soph3, dtype=np.int32)
    reduceFloatsSO3 = np.zeros(numGroups_soph3, dtype=np.float32)
    results = np.zeros(8, dtype=np.float32)
    cache = np.zeros(cacheSizeInFloats, dtype=np.float32)

    # new buffers from scratch
    d_trainingAlpha, evt = cl.buffer_from_ndarray(self.queue, iArray,
                                                  blocking=False)
    d_kernelDiag, evt = cl.buffer_from_ndarray(self.queue, iArray,
                                               blocking=False)
    d_F, evt = cl.buffer_from_ndarray(self.queue, iArray, blocking=False)
    d_cache, evt = cl.buffer_from_ndarray(self.queue, cache,
                                          blocking=False)
    d_highFsFO, evt = cl.buffer_from_ndarray(self.queue, reduceFloatsFO,
                                             blocking=False)
    d_highIndicesFO, evt = cl.buffer_from_ndarray(self.queue,
                                                  reduceIntsFO,
                                                  blocking=False)
    d_lowFsFO, evt = cl.buffer_from_ndarray(self.queue, reduceFloatsFO,
                                            blocking=False)
    d_lowIndicesFO, evt = cl.buffer_from_ndarray(self.queue,
                                                 reduceIntsFO,
                                                 blocking=False)
    d_highFsSO1, evt = cl.buffer_from_ndarray(self.queue,
                                              reduceFloatsSO1,
                                              blocking=False)
    d_highIndicesSO1, evt = cl.buffer_from_ndarray(self.queue,
                                                   reduceIntsSO1,
                                                   blocking=False)
    d_lowFsSO3, evt = cl.buffer_from_ndarray(self.queue,
                                             reduceFloatsSO3,
                                             blocking=False)
    d_lowIndicesSO3, evt = cl.buffer_from_ndarray(self.queue,
                                                  reduceIntsSO3,
                                                  blocking=False)
    d_deltaFsSO3, evt = cl.buffer_from_ndarray(self.queue,
                                               reduceFloatsSO3,
                                               blocking=False)
    d_results, evt = cl.buffer_from_ndarray(self.queue, results,
                                            blocking=False)

    # return values (filled in by the C function through byref)
    rho = c_float()
    nSV = c_int()
    iterations = c_int()
    signedAlpha = pointer(c_float())
    supportVectors = pointer(c_float())
    args = (cacheSizeInFloats, ) + args
    # add args
    args += (
        # sizes
        localInitSize, globalInitSize,
        numGroups_foph1, localFoph1Size, globalFoph1Size,
        localFoph2Size, globalFoph2Size,
        numGroups_soph1, localSoph1Size, globalSoph1Size,
        localSoph2Size, globalSoph2Size,
        numGroups_soph3, localSoph3Size, globalSoph3Size,
        localSoph4Size, globalSoph4Size,
        # buffers
        d_input_data, d_input_data_colmajor, d_labels, d_trainingAlpha,
        d_kernelDiag, d_F, d_highFsFO, d_highIndicesFO, d_lowFsFO,
        d_lowIndicesFO, d_highFsSO1, d_highIndicesSO1, d_lowFsSO3,
        d_lowIndicesSO3, d_deltaFsSO3, d_results, d_cache,
        # Ocl queue and kernels
        self.queue, self.init, self.step1, self.foph1, self.foph2,
        self.soph1, self.soph2, self.soph3, self.soph4,
        # return pointers
        byref(rho), byref(nSV), byref(iterations), byref(signedAlpha),
        byref(supportVectors))
    with Timer() as cfunc:
        err = self._c_function(*args)
    # print() call form works identically on Python 2 and 3
    print("Actual function took %.6f" % cfunc.interval)
    return rho.value, nSV.value, iterations.value, \
        as_array(supportVectors, shape=(nSV.value, dFeatures)), \
        as_array(signedAlpha, shape=(nSV.value,))