def run(self, queue, atol, rtol):
    """Call ``rkern`` on the compute queue and store what it returns.

    ``x``, ``y`` and ``z`` (taken, like ``cnt``, ``dtype`` and ``rkern``,
    from the surrounding scope) are each wrapped in an ``Array`` built on
    their existing ``.data`` buffer.  The value returned by ``rkern`` is
    kept in ``self._retarr``.

    :param queue: object exposing the compute queue as ``cl_queue_comp``
    :param atol: forwarded unchanged to ``rkern``
    :param rtol: forwarded unchanged to ``rkern``
    """
    compute_queue = queue.cl_queue_comp

    # One Array wrapper per operand, built in x, y, z order.
    dev_x, dev_y, dev_z = [
        Array(compute_queue, cnt, dtype, data=operand.data)
        for operand in (x, y, z)
    ]

    self._retarr = rkern(dev_x, dev_y, dev_z, atol, rtol, queue=compute_queue)
def rand(queue, shape, dtype, luxury=None, a=0, b=1):
    """Return an array of `shape` filled with random values of `dtype`
    in the range [a,b).

    The generator is obtained from ``_get_generator(queue, luxury)``; the
    event returned by its ``fill_uniform`` call is attached to the new array.
    """
    from pyopencl.array import Array

    generator = _get_generator(queue, luxury)
    out = Array(queue, shape, dtype)

    fill_event = generator.fill_uniform(out, a=a, b=b)
    out.add_event(fill_event)
    return out
def rand(queue, shape, dtype, luxury=None, a=0, b=1):
    """Return an array of `shape` filled with random values of `dtype`
    in the range [a,b).

    The fill is issued through the generator returned by
    ``_get_generator(queue, luxury)`` and its event is registered on the
    returned array.
    """
    from pyopencl.array import Array

    generator = _get_generator(queue, luxury)
    out = Array(queue, shape, dtype)

    fill_event = generator.fill_uniform(out, a=a, b=b)
    out.add_event(fill_event)
    return out
def setup(sizes, dtype):
    """Create a queue plus matching host and device arrays of random data.

    :param sizes: iterable of shapes; each one is unpacked into
        ``np.random.rand``
    :param dtype: dtype the random host data is cast to
    :return: ``(queue, host_arrays, device_arrays)`` where the two lists
        are index-aligned
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context)

    # Host data first, in the order the shapes were given.
    host_arrays = [np.random.rand(*dims).astype(dtype=dtype) for dims in sizes]

    # Then one device array per host array, filled from it.
    device_arrays = []
    for host in host_arrays:
        device = Array(queue, host.shape, host.dtype)
        device.set(host)
        device_arrays.append(device)

    queue.finish()
    return queue, host_arrays, device_arrays
def setup_op(queue, A):
    """Upload ``A`` to the device and wrap it in a ``LinearOperator``.

    Both callbacks handed to ``LinearOperator`` call ``blas.gemv`` on the
    device copy of ``A``; the ``rmatvec`` callback passes ``transA=True``.

    :param queue: queue used for the upload and for every ``gemv`` call
    :param A: host matrix; its ``shape`` and ``dtype`` define the operator
    :return: the constructed ``LinearOperator``
    """
    device_matrix = Array(queue, A.shape, A.dtype)
    device_matrix.set(A)

    def matvec(x, y):
        blas.gemv(queue, device_matrix, x, y)

    def matvect(x, y):
        blas.gemv(queue, device_matrix, x, y, transA=True)

    return LinearOperator(A.shape, matvec, rmatvec=matvect, dtype=A.dtype)
def __init__(self, sf, omega):
    """Create the OpenCL context, build the kernel and upload the inputs.

    Param:
        sf: the freeze out hypersf; columns are read as
            ds0, ds1, ds2, ds3, vx, vy, veta, etas
        omega: omega^tau, x, y, etas (only column 2 is used here)
    """
    self.cwd, _ = os.path.split(__file__)
    self.mass = 1.115
    self.Tfrz = 0.137
    self.ctx = cl.create_some_context()
    self.queue = cl.CommandQueue(self.ctx)

    # Read the kernel source inside a context manager so the file handle is
    # closed immediately instead of lingering until garbage collection.
    with open('kernel_polarization.cl', 'r') as kernel_file:
        src = kernel_file.read()
    self.prg = cl.Program(self.ctx, src).build()

    # calc umu since they are used for each (Y,pt,phi)
    vx = sf[:, 4]
    vy = sf[:, 5]
    vz = sf[:, 6]
    v_sqr = vx * vx + vy * vy + vz * vz
    # Clamp v^2 above 1 so the square root below stays real.
    # NOTE(review): a value of exactly 1.0 is not clamped and would give an
    # infinite u0 -- confirm whether that can occur in the input.
    v_sqr[v_sqr > 1.0] = 0.99999
    u0 = 1.0 / np.sqrt(1.0 - v_sqr)

    self.size_sf = len(sf[:, 0])

    # One (u0, u0*vx, u0*vy, u0*vz) row per hypersurface element.
    h_umu = np.zeros((self.size_sf, 4), dtype=np.float32)
    h_umu[:, 0] = u0
    h_umu[:, 1] = u0 * vx
    h_umu[:, 2] = u0 * vy
    h_umu[:, 3] = u0 * vz

    h_smu = sf[:, 0:4].astype(np.float32)
    h_etas = sf[:, 7].astype(np.float32)
    h_omegaY = 0.5 * omega[:, 2].astype(np.float32)

    # Read-only device buffers initialised from the host arrays above.
    mf = cl.mem_flags
    self.d_smu = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=h_smu)
    self.d_umu = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=h_umu)
    self.d_omegaY = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=h_omegaY)
    self.d_etas = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=h_etas)

    # Device arrays with one float32 entry per hypersurface element.
    self.d_pol = Array(self.queue, self.size_sf, np.float32)
    self.d_vor = Array(self.queue, self.size_sf, np.float32)
    self.d_rho = Array(self.queue, self.size_sf, np.float32)
def rand(queue, shape, dtype, luxury=None, a=0, b=1):
    """Return an array of `shape` filled with random values of `dtype`
    in the range [a,b).

    ``luxury`` is only checked to emit a deprecation warning; the generator
    is looked up from ``queue.context`` alone.
    """
    luxury_given = luxury is not None
    if luxury_given:
        from warnings import warn
        warn("Specifying the 'luxury' argument is deprecated and will stop being "
             "supported in PyOpenCL 2018.x", stacklevel=2)

    from pyopencl.array import Array

    generator = _get_generator(queue.context)
    out = Array(queue, shape, dtype)

    fill_event = generator.fill_uniform(out, a=a, b=b)
    out.add_event(fill_event)
    return out
def getitem_device(self, item):
    """Return the device-side entry, or entries, selected by ``item``.

    A slice is first expanded into explicit indices.  An iterable ``item``
    yields a new ``CLRaggedArray`` built on the same ``cl_buf``; any other
    ``item`` yields a single 2-D ``Array`` built on ``cl_buf.data``.
    """
    if isinstance(item, slice):
        item = np.arange(len(self))[item]

    if not is_iterable(item):
        # Stored strides and starts are multiplied by the item size before
        # being handed to Array.
        itemsize = self.dtype.itemsize
        entry_shape = (self.shape0s[item], self.shape1s[item])
        entry_strides = (
            self.stride0s[item] * itemsize,
            self.stride1s[item] * itemsize,
        )
        return Array(
            self.queue,
            entry_shape,
            self.dtype,
            strides=entry_strides,
            data=self.cl_buf.data,
            offset=self.starts[item] * itemsize,
        )

    return CLRaggedArray.from_buffer(
        self.queue,
        self.cl_buf,
        self.starts[item],
        self.shape0s[item],
        self.shape1s[item],
        self.stride0s[item],
        self.stride1s[item],
        names=[self.names[i] for i in item],
    )
def inner_rand(queue, shape, dtype, luxury=None, a=0, b=1):
    """Return a new array of `shape` and `dtype` filled by ``fill_uniform``.

    :param queue: queue used for the generator lookup and the new array
    :param shape: shape of the array to create
    :param dtype: dtype of the array to create
    :param luxury: forwarded to ``_get_generator``
    :param a: forwarded to ``fill_uniform`` as ``a``
    :param b: forwarded to ``fill_uniform`` as ``b``
    :return: the filled array
    """
    from pyopencl.array import Array

    # ``luxury`` comes from the signature.  Looking it up in a separate
    # ``kwargs`` mapping would discard a positionally supplied value, and
    # raises NameError wherever no such mapping is in scope.
    gen = _get_generator(queue, luxury)
    result = Array(queue, shape, dtype)
    gen.fill_uniform(result, a=a, b=b)
    return result
def forward(self, buf: array.Array, verbose: bool = False):
    """Run ``buf`` through every layer in ``self.layers`` and return the result.

    Each layer receives a host copy of its input in ``layer.inputs`` before
    it is called.  With ``verbose`` set, the input, weights and output of
    every layer are printed together with a NumPy reference result.

    :param buf: device buffer holding the input batch
    :param verbose: print per-layer diagnostics when true
    :return: the buffer returned by the last layer
    """
    # put x in the buffer
    size = self.layers[0].input_width
    # can probably do better here
    # this only works on pocl because they didn't implement CL_MISALIGNED_SUB_BUFFER_OFFSET
    # buf = x.get_sub_region(size * idx, size)
    host_in = buf.get()

    for idx, layer in enumerate(self.layers):
        layer.inputs = host_in.copy()
        if verbose:
            print(f"Layer {idx}")
            print(
                f"Input Batch: rows={layer.batch_size} samples cols={layer.input_width} features \n",
                host_in)

        buf = layer(buf)
        host_out = buf.get()

        if not verbose:
            host_in = host_out
            continue

        weights = layer.get_weights()
        bias = layer.get_bias()
        print(
            f"\nWeights: (rows={layer.units} units, cols={layer.input_width} inputs)\n",
            weights)
        # print("Biases:\n", bias)
        print(
            f"\nOutput: (rows={layer.batch_size} batch samples cols={layer.units} units)\n",
            host_out)

        # Host-side reference for the layer: affine map, then the activation.
        expected = (np.dot(weights, host_in.T) + bias).T
        if layer.activation == 'relu':
            expected = np.clip(expected, 0, a_max=None)
        elif layer.activation == 'sigmoid':
            expected = 1 / (np.exp(-expected) + 1)
        elif layer.activation == 'softmax':
            exps = np.exp(expected)
            expected = exps / exps.sum(axis=1)[:, None]
        print("Expected:\n", expected)

        host_in = host_out

    # output is the output of the last layer
    return buf
def rand(context, queue, shape, dtype):
    """Return a new array of `shape` and `dtype` filled by an md5-based kernel.

    A kernel variant is selected by `dtype` (float32, float64, or
    int32/uint32); any other dtype raises ``NotImplementedError``.  The seed
    passed to the kernel is drawn from ``numpy.random.randint``.
    """
    from pyopencl.array import Array
    from pyopencl.elementwise import get_elwise_kernel

    result = Array(context, shape, dtype, queue=queue)

    # Every kernel body is appended to ``md5_code`` (defined elsewhere), which
    # presumably provides a, b, c, d, i, gsize and n -- TODO confirm.
    if dtype == numpy.float32:
        # Up to four outputs per work item, one from each of a, b, c, d.
        func = get_elwise_kernel(
            context,
            "float *dest, unsigned int seed",
            md5_code + """
            #define POW_2_M32 (1/4294967296.0f)
            dest[i] = a*POW_2_M32;
            if ((i += gsize) < n)
                dest[i] = b*POW_2_M32;
            if ((i += gsize) < n)
                dest[i] = c*POW_2_M32;
            if ((i += gsize) < n)
                dest[i] = d*POW_2_M32;
            """,
            "md5_rng_float")
    elif dtype == numpy.float64:
        # Up to two outputs per work item, each combining two of a, b, c, d.
        # NOTE(review): this branch reuses the kernel name "md5_rng_float".
        func = get_elwise_kernel(
            context,
            "double *dest, unsigned int seed",
            md5_code + """
            #define POW_2_M32 (1/4294967296.0)
            #define POW_2_M64 (1/18446744073709551616.)

            dest[i] = a*POW_2_M32 + b*POW_2_M64;

            if ((i += gsize) < n)
            {
              dest[i] = c*POW_2_M32 + d*POW_2_M64;
            }
            """,
            "md5_rng_float")
    elif dtype in [numpy.int32, numpy.uint32]:
        # The values a, b, c, d are stored unscaled.
        func = get_elwise_kernel(
            context,
            "unsigned int *dest, unsigned int seed",
            md5_code + """
            dest[i] = a;
            if ((i += gsize) < n)
                dest[i] = b;
            if ((i += gsize) < n)
                dest[i] = c;
            if ((i += gsize) < n)
                dest[i] = d;
            """,
            "md5_rng_int")
    else:
        raise NotImplementedError

    # Launch with the array's own global/local sizes; the arguments are the
    # destination buffer, the random seed and the element count.
    func(queue, result._global_size, result._local_size,
         result.data, numpy.random.randint(2**31 - 1), result.size)

    return result
def getitem_device(self, item):
    """Return the device-side entry, or entries, selected by ``item``.

    A slice is first expanded into explicit indices.  An iterable ``item``
    yields a new instance of this class sharing ``queue`` and ``cl_buf``
    with the selected metadata; any other ``item`` yields a single 2-D
    ``Array`` built on ``cl_buf.data``.
    """
    if isinstance(item, slice):
        item = np.arange(len(self))[item]

    if not is_iterable(item):
        # Stored strides and starts are multiplied by the item size before
        # being handed to Array.
        itemsize = self.dtype.itemsize
        entry_shape = (self.shape0s[item], self.shape1s[item])
        entry_strides = (
            self.stride0s[item] * itemsize,
            self.stride1s[item] * itemsize,
        )
        return Array(
            self.queue,
            entry_shape,
            self.dtype,
            strides=entry_strides,
            data=self.cl_buf.data,
            offset=self.starts[item] * itemsize)

    # Build the subset without going through __init__.
    subset = self.__class__.__new__(self.__class__)
    subset.queue = self.queue
    subset.starts = self.starts[item]
    subset.shape0s = self.shape0s[item]
    subset.shape1s = self.shape1s[item]
    subset.stride0s = self.stride0s[item]
    subset.stride1s = self.stride1s[item]
    subset.cl_buf = self.cl_buf
    subset.names = [self.names[i] for i in item]
    return subset
def rand(context, queue, shape, dtype):
    """Return a new array of `shape` and `dtype` filled by ``_rand``.

    The seed handed to ``_rand`` is drawn from ``numpy.random.randint``.
    """
    from pyopencl.array import Array

    out = Array(context, shape, dtype, queue=queue)
    seed = numpy.random.randint(2**31 - 1)
    _rand(out, seed)
    return out
from pyopencl.array import arange, Array
from pyopencl.reduction import ReductionKernel
import numpy

ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)

# Host input: a (100, 1) int32 column with every fifth entry set to 1.
# A strided slice assignment replaces the element-wise loop and does not
# depend on the Python-2-only xrange.
acpu = numpy.zeros((100, 1), dtype=numpy.int32)
acpu[::5] = 1

# Copy the host data into a device array of the same shape.
a = Array(queue, (100, 1), numpy.int32)
a.set(acpu)
queue.finish()

# Reduction that adds up all elements of its single int buffer argument.
krnl = ReductionKernel(
    ctx,
    numpy.int32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]",
    arguments="__global int *x")

my_sum = krnl(a).get()
queue.finish()

# The call form of print works on both Python 2 and Python 3.
print(my_sum)
# start up the BLAS
blas.setup()

# generate some random data on the CPU
n = 5
dtype = 'float64'  # also supports 'float32'
rng = np.random.RandomState(1)  # change the seed to see different data
x = rng.uniform(-1, 1, size=n).astype(dtype)
y = rng.uniform(-1, 1, size=n).astype(dtype)

# allocate OpenCL memory on the device and copy the inputs over
clx = Array(queue, x.shape, x.dtype)
clx.set(x)
cly = Array(queue, y.shape, y.dtype)
cly.set(y)

# one-element device array that receives the result
cld = Array(queue, 1, x.dtype)

# compute a dot product (dot)
blas.dot(queue, clx, cly, cld)

# check the result
expected = np.dot(x, y)
actual = cld.get()[0]
print("Expected: ", expected)
print("Actual: ", actual)

# tidy up the BLAS
# generate some random data on the CPU
m, n = 5, 4
dtype = 'float32'  # also supports 'float64'
rng = np.random.RandomState(1)  # change the seed to see different data
A = rng.uniform(-1, 1, size=(m, n)).astype(dtype)
x = rng.uniform(-1, 1, size=n).astype(dtype)
y = rng.uniform(-1, 1, size=m).astype(dtype)

# allocate OpenCL memory on the device and copy the inputs over
# (cly is only allocated; y itself is never uploaded)
clA = Array(queue, A.shape, A.dtype)
clA.set(A)
clx = Array(queue, x.shape, x.dtype)
clx.set(x)
cly = Array(queue, y.shape, y.dtype)

# compute a matrix-vector product (gemv)
blas.gemv(queue, clA, clx, cly)

# check the result
expected = np.dot(A, x)
actual = cly.get()
print("Expected: ", expected)
print("Actual: ", actual)

# try a matrix-vector product with the transpose
import numpy as np
import pyopencl as cl
from pyopencl.array import Array
import pyclblast

# Settings for this sample
dtype = 'float32'

print("# Setting up OpenCL")
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

print("# Setting up Numpy arrays")
m, n, k = 2, 3, 4
a = np.random.rand(m, k).astype(dtype=dtype)
b = np.random.rand(k, n).astype(dtype=dtype)
c = np.random.rand(m, n).astype(dtype=dtype)

print("# Setting up OpenCL arrays")
# Allocate each device array and fill it from its host counterpart.
cla = Array(queue, a.shape, a.dtype)
cla.set(a)
clb = Array(queue, b.shape, b.dtype)
clb.set(b)
clc = Array(queue, c.shape, c.dtype)
clc.set(c)

print("# Example level-3 operation: GEMM")
pyclblast.gemm(queue, m, n, k, cla, clb, clc, a_ld=k, b_ld=n, c_ld=n)

device_result = clc.get()
host_result = np.dot(a, b)
print("# Matrix C result: %s" % device_result)
print("# Expected result: %s" % (host_result))
def to_ocl(a):
    """Return a device ``Array`` with the shape, dtype and contents of ``a``.

    The array is created on the ``queue`` found in the enclosing scope.
    """
    device_copy = Array(queue, a.shape, a.dtype)
    device_copy.set(a)
    return device_copy
# start up the BLAS
blas.setup()

# generate some random data on the CPU
n = 5
dtype = 'float64'  # also supports 'float32'
rng = np.random.RandomState(1)  # change the seed to see different data
x = rng.uniform(-1, 1, size=n).astype(dtype)
y = rng.uniform(-1, 1, size=n).astype(dtype)

# allocate OpenCL memory on the device and copy the inputs over
clx = Array(queue, x.shape, x.dtype)
clx.set(x)
cly = Array(queue, y.shape, y.dtype)
cly.set(y)

# one-element device array that receives the result
cld = Array(queue, 1, x.dtype)

# compute a dot product (dot)
blas.dot(queue, clx, cly, cld)

# check the result
expected = np.dot(x, y)
actual = cld.get()[0]
print("Expected: ", expected)
print("Actual: ", actual)

# tidy up the BLAS
n = 4

print("# Setting up OpenCL")
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

print("# Setting up Numpy arrays")
# One contiguous vector holding batch_count chunks of n elements each.
x = np.random.rand(n * batch_count).astype(dtype=dtype)
y = np.random.rand(n * batch_count).astype(dtype=dtype)

print("# Batch offsets: next after each other")
x_offsets = [0, n]
y_offsets = [0, n]

print("# Setting up OpenCL arrays")
clx = Array(queue, x.shape, x.dtype)
clx.set(x)
cly = Array(queue, y.shape, y.dtype)
cly.set(y)

print("# Example level-1 batched operation: AXPY-batched")
assert len(alphas) == len(x_offsets) == len(y_offsets) == batch_count
pyclblast.axpyBatched(queue, n, clx, cly, alphas, x_offsets, y_offsets)
queue.finish()

print("# Full result for vector y: %s" % str(cly.get()))
# Host-side reference, one batch at a time, from the original x and y.
for i in range(batch_count):
    x_batch = x[x_offsets[i]:x_offsets[i] + n]
    y_batch = y[y_offsets[i]:y_offsets[i] + n]
    result = alphas[i] * x_batch + y_batch
    print("# Expected result batch #%d: %s" % (i, str(result)))
import pyopencl
from pyopencl.array import arange, Array
from pyopencl.reduction import ReductionKernel
import numpy

ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)

# Host input: a (100, 1) int32 column with every fifth entry set to 1.
# A strided slice assignment replaces the element-wise loop and does not
# depend on the Python-2-only xrange.
acpu = numpy.zeros((100, 1), dtype=numpy.int32)
acpu[::5] = 1

# Copy the host data into a device array of the same shape.
a = Array(queue, (100, 1), numpy.int32)
a.set(acpu)
queue.finish()

# Reduction that adds up all elements of its single int buffer argument.
krnl = ReductionKernel(
    ctx,
    numpy.int32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]",
    arguments="__global int *x")

my_sum = krnl(a).get()
queue.finish()

# The call form of print works on both Python 2 and Python 3.
print(my_sum)
def fft(self, src: cla.Array, dest: cla.Array = None):
    """
    Compute the forward FFT

    :param src: the source pyopencl Array
    :param dest: the destination pyopencl Array. Should be None for an inplace
        transform
    :return: the transformed array. For a R2C inplace transform, the complex
        view of the array is returned.
    :raises RuntimeError: if ``dest`` does not fit the inplace /
        out-of-place configuration of this app
    """
    # Scalar type used for the "ortho" rescaling, keyed by self.precision.
    scale_types = {2: np.float16, 4: np.float32, 8: np.float64}

    if self.inplace:
        if dest is not None and src.data.int_ptr != dest.data.int_ptr:
            raise RuntimeError(
                "VkFFTApp.fft: dest is not None but this is an inplace transform"
            )

        s = src if self.batch_shape is None else src.reshape(self.batch_shape)
        _vkfft_opencl.fft(self.app, int(s.data.int_ptr), int(s.data.int_ptr),
                          int(self.queue.int_ptr))

        if self.norm == "ortho":
            scalar_type = scale_types.get(self.precision)
            if scalar_type is not None:
                src *= scalar_type(self._get_fft_scale(norm=0))

        if self.r2c:
            # Hand back the complex view of the real input buffer.
            if src.dtype == np.float32:
                return src.view(dtype=np.complex64)
            elif src.dtype == np.float64:
                return src.view(dtype=np.complex128)
        return src

    # Out-of-place transform from here on.
    if dest is None:
        raise RuntimeError(
            "VkFFTApp.fft: dest is None but this is an out-of-place transform"
        )
    if src.data.int_ptr == dest.data.int_ptr:
        raise RuntimeError(
            "VkFFTApp.fft: dest and src are identical but this is an out-of-place transform"
        )

    if self.r2c:
        assert (src.size == dest.size // dest.shape[-1] * 2 * (dest.shape[-1] - 1))

    if self.batch_shape is None:
        s, d = src, dest
    else:
        s = src.reshape(self.batch_shape)
        if self.r2c:
            # Complex side: last axis becomes batch_shape[-1] // 2 + 1.
            c_shape = tuple(self.batch_shape[:-1]) + (self.batch_shape[-1] // 2 + 1,)
            d = dest.reshape(c_shape)
        else:
            d = dest.reshape(self.batch_shape)

    _vkfft_opencl.fft(self.app, int(s.data.int_ptr), int(d.data.int_ptr),
                      int(self.queue.int_ptr))

    if self.norm == "ortho":
        scalar_type = scale_types.get(self.precision)
        if scalar_type is not None:
            dest *= scalar_type(self._get_fft_scale(norm=0))
    return dest
from pyopencl.array import Array
import pyclblast
from datetime import datetime

if __name__ == "__main__":

    # Set up pyopencl:
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # Set up a basic sgemm example:
    m, n, k = 2, 3, 4
    a = np.random.rand(m, k).astype(dtype=np.float32)
    b = np.random.rand(k, n).astype(dtype=np.float32)
    c = np.empty((m, n), np.float32)

    # Allocate each device array and fill it from its host counterpart.
    cla = Array(queue, a.shape, a.dtype)
    cla.set(a)
    clb = Array(queue, b.shape, b.dtype)
    clb.set(b)
    clc = Array(queue, c.shape, c.dtype)
    clc.set(c)

    # Perform sgemm on these matrices, overriding the CLBlast parameters. In this example, we'll
    # just change the 'MWG' parameter a couple of times:
    params = {
        "KWG": 32,
        "KWI": 2,
        "MDIMA": 8,
        "MDIMC": 8,
        "MWG": 64,
        "NDIMB": 8,
        "NDIMC": 8,
        "NWG": 64,
        "SA": 0,
        "SB": 0,
        "STRM": 0,
        "STRN": 0,
        "VWM": 4,
        "VWN": 1,
    }
    device = ctx.devices[0]
    for mwg in (32, 64, 256):
        print("Running sgemm tuned with MWG = %d" % mwg)
        params["MWG"] = mwg
        pyclblast.override_parameters(device, 'Xgemm', 32, params)
        pyclblast.gemm(queue, m, n, k, cla, clb, clc, a_ld=k, b_ld=n, c_ld=n)
def ifft(self, src: cla.Array, dest: cla.Array = None):
    """
    Compute the backward FFT

    :param src: the source pyopencl Array
    :param dest: the destination pyopencl Array. Should be None for an inplace
        transform
    :return: the transformed array. For a C2R inplace transform, the float
        view of the array is returned.
    :raises RuntimeError: if ``dest`` does not fit the inplace /
        out-of-place configuration of this app
    """
    # Scalar type used for the "ortho" rescaling, keyed by self.precision.
    scale_types = {2: np.float16, 4: np.float32, 8: np.float64}

    if self.inplace:
        if dest is not None and src.data.int_ptr != dest.data.int_ptr:
            # The message names ifft so a traceback points at the right method.
            raise RuntimeError(
                "VkFFTApp.ifft: dest!=src but this is an inplace transform"
            )

        if self.batch_shape is None:
            s = src
        elif self.r2c:
            # Complex input view: last axis is batch_shape[-1] // 2.
            src_shape = tuple(self.batch_shape[:-1]) + (self.batch_shape[-1] // 2,)
            s = src.reshape(src_shape)
        else:
            s = src.reshape(self.batch_shape)

        _vkfft_opencl.ifft(self.app, int(s.data.int_ptr), int(s.data.int_ptr),
                           int(self.queue.int_ptr))

        if self.norm == "ortho":
            scalar_type = scale_types.get(self.precision)
            if scalar_type is not None:
                src *= scalar_type(self._get_ifft_scale(norm=0))

        if self.r2c:
            # Hand back the real view of the complex input buffer.
            if src.dtype == np.complex64:
                return src.view(dtype=np.float32)
            elif src.dtype == np.complex128:
                return src.view(dtype=np.float64)
        return src

    # Out-of-place transform from here on (the inplace case returned above).
    if dest is None:
        raise RuntimeError(
            "VkFFTApp.ifft: dest is None but this is an out-of-place transform"
        )
    if src.data.int_ptr == dest.data.int_ptr:
        raise RuntimeError(
            "VkFFTApp.ifft: dest and src are identical but this is an out-of-place transform"
        )

    if self.r2c:
        assert (dest.size == src.size // src.shape[-1] * 2 * (src.shape[-1] - 1))
        # Special case, src and dest buffer sizes are different,
        # VkFFT is configured to go back to the source buffer
        if self.batch_shape is None:
            s, d = src, dest
        else:
            src_shape = tuple(self.batch_shape[:-1]) + (self.batch_shape[-1] // 2 + 1,)
            s = src.reshape(src_shape)
            d = dest.reshape(self.batch_shape)
        # Note the argument order: the destination pointer is passed first.
        _vkfft_opencl.ifft(self.app, int(d.data.int_ptr), int(s.data.int_ptr),
                           int(self.queue.int_ptr))
    else:
        if self.batch_shape is None:
            s, d = src, dest
        else:
            s = src.reshape(self.batch_shape)
            d = dest.reshape(self.batch_shape)
        _vkfft_opencl.ifft(self.app, int(s.data.int_ptr), int(d.data.int_ptr),
                           int(self.queue.int_ptr))

    if self.norm == "ortho":
        scalar_type = scale_types.get(self.precision)
        if scalar_type is not None:
            dest *= scalar_type(self._get_ifft_scale(norm=0))
    return dest
from __future__ import print_function import numpy as np import pyopencl as cl from pyopencl.array import Array import pyopencl_blas as blas ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) # need to initialize the library blas.setup() dtype = 'float32' # also supports 'float64' x = np.array([1, 2, 3, 4], dtype=dtype) y = np.array([4, 3, 2, 1], dtype=dtype) clx = Array(queue, x.shape, x.dtype) cly = Array(queue, y.shape, y.dtype) clx.set(x) cly.set(y) # call a BLAS function on the arrays blas.axpy(queue, clx, cly, alpha=0.8) print("Expected: ", 0.8 * x + y) print("Actual: ", cly.get()) # clean up the library when finished blas.teardown()
import pyclblast

# Settings for this sample
dtype = 'float32'
m, n = 4, 3
alpha = 1.0
beta = 0.0

print("# Setting up OpenCL")
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

print("# Setting up Numpy arrays")
a = np.random.rand(m, n).astype(dtype=dtype)
x = np.random.rand(n).astype(dtype=dtype)
y = np.random.rand(m).astype(dtype=dtype)

print("# Setting up OpenCL arrays")
# Allocate each device array and fill it from its host counterpart.
cla = Array(queue, a.shape, a.dtype)
cla.set(a)
clx = Array(queue, x.shape, x.dtype)
clx.set(x)
cly = Array(queue, y.shape, y.dtype)
cly.set(y)

print("# Example level-2 operation: GEMV")
pyclblast.gemv(queue, m, n, cla, clx, cly, a_ld=n, alpha=alpha, beta=beta)
queue.finish()

device_result = cly.get()
host_result = alpha * np.dot(a, x) + beta * y
print("# Result for vector y: %s" % device_result)
print("# Expected result: %s" % (host_result))
# Random inputs, drawn in the order A, x, x1, x2 so that a fixed seed
# always produces the same data.
rng = np.random.RandomState(1)  # change the seed to see different data
A = rng.uniform(-1, 1, size=(n, n)).astype(dtype)
x = rng.uniform(-1, 1, size=n).astype(dtype)
x1 = rng.uniform(-1, 1, size=n).astype(dtype)
x2 = rng.uniform(-1, 1, size=n).astype(dtype)

# Split the random matrix into its upper and lower triangles; A keeps the
# lower one.
A_upper = np.triu(A)
A = np.tril(A)

# allocate OpenCL memory on the device
clA = Array(queue, A.shape, A.dtype)
clA_upper = Array(queue, A.shape, A.dtype)
clx = Array(queue, x.shape, x.dtype)
clx1 = Array(queue, x1.shape, x1.dtype)
clx2 = Array(queue, x2.shape, x2.dtype)

# copy data to device (clx1 and clx2 are only allocated here)
clA.set(A)
clA_upper.set(A_upper)
clx.set(x)

# compute a triangular solve (trsv)
blas.trsv(queue, clA, clx)

# check the result
expected = np.linalg.solve(A, x)
print("Expected: ", expected)