def linspace(start, stop, num, dtype=np.float64, backend='opencl',
             endpoint=True):
    """Return ``num`` evenly spaced samples between ``start`` and ``stop``.

    Parameters
    ----------
    start, stop : scalar
        First value and end value of the interval.
    num : int
        Number of samples.  Must be a positive integer (a Python ``int`` or
        a NumPy integer; ``bool`` is rejected).
    dtype : data-type, optional
        Element type of the generated array.
    backend : str, optional
        ``'opencl'`` and ``'cuda'`` build the array on the device; any other
        value falls back to ``np.linspace`` on the host.
    endpoint : bool, optional
        If true, ``stop`` is the last sample; otherwise it is excluded.

    Returns
    -------
    The result of ``wrap_array(out, backend)``.

    Raises
    ------
    TypeError
        If ``num`` is not an integer.
    ValueError
        If ``num`` is not positive.
    """
    if isinstance(num, bool) or not isinstance(num, (int, np.integer)):
        raise TypeError("num should be an integer but got %s" % type(num))
    if num <= 0:
        raise ValueError("Number of samples, %s, must be positive." % num)

    if backend not in ('opencl', 'cuda'):
        out = np.linspace(start, stop, num, endpoint=endpoint, dtype=dtype)
        return wrap_array(out, backend)

    # Spacing between consecutive samples.  A single sample with
    # endpoint=True has zero intervals; a step of 0 avoids the division by
    # zero and yields [start], matching np.linspace.
    intervals = num - 1 if endpoint else num
    delta = (stop - start) / intervals if intervals else 0

    if backend == 'opencl':
        import pyopencl.array as gpuarray
        from .opencl import get_queue
        out = gpuarray.arange(get_queue(), 0, num, 1, dtype=dtype)
    else:
        import pycuda.gpuarray as gpuarray
        # The imported name is unused; presumably the import is wanted for
        # its initialisation side effect.
        import pycuda.autoinit  # noqa: F401
        out = gpuarray.arange(0, num, 1, dtype=dtype)

    out = out * delta + start
    return wrap_array(out, backend)
def test_fancy_indexing(ctx_factory):
    """Check that index-array assignment on device arrays matches NumPy."""
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # The index array (3, 2, 1) selects three slots, so the source has to
    # supply three values; a two-element source cannot be assigned to them.
    numpy_dest = np.zeros((4,), np.int32)
    numpy_idx = np.arange(3, 0, -1, dtype=np.int32)
    numpy_src = np.arange(8, 11, dtype=np.int32)
    numpy_dest[numpy_idx] = numpy_src

    cl_dest = cl_array.zeros(queue, (4,), np.int32)
    cl_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
    cl_src = cl_array.arange(queue, 8, 11, dtype=np.int32)
    cl_dest[cl_idx] = cl_src

    assert np.all(numpy_dest == cl_dest.get())

    # Change the indices identically on both sides and compare again.
    cl_idx[1] = 3
    cl_idx[2] = 2
    numpy_idx[1] = 3
    numpy_idx[2] = 2

    numpy_dest[numpy_idx] = numpy_src
    cl_dest[cl_idx] = cl_src

    assert np.all(numpy_dest == cl_dest.get())
def test_fancy_indexing(ctx_factory):
    """Index-array assignment on the device must give the NumPy result."""
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    queue = cl.CommandQueue(ctx_factory())

    # Host-side reference data.
    host_out = np.zeros((4,), np.int32)
    host_idx = np.arange(3, 0, -1, dtype=np.int32)
    host_src = np.arange(8, 11, dtype=np.int32)

    # Device-side counterparts built from the same ranges.
    dev_out = cl_array.zeros(queue, (4,), np.int32)
    dev_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
    dev_src = cl_array.arange(queue, 8, 11, dtype=np.int32)

    host_out[host_idx] = host_src
    dev_out[dev_idx] = dev_src
    assert np.all(host_out == dev_out.get())

    # Second round with modified indices.
    for position, target in ((1, 3), (2, 2)):
        dev_idx[position] = target
        host_idx[position] = target

    host_out[host_idx] = host_src
    dev_out[dev_idx] = dev_src
    assert np.all(host_out == dev_out.get())
def test_take(ctx_getter):
    """Gather from ``a[k] = 3*k`` through ``idx`` and expect ``3*idx``."""
    ctx = ctx_getter()
    cq = cl.CommandQueue(ctx)

    indices = cl_array.arange(ctx, cq, 0, 200000, 2, dtype=numpy.uint32)
    values = cl_array.arange(ctx, cq, 0, 600000, 3, dtype=numpy.float32)

    gathered = cl_array.take(values, indices)
    expected = (3*indices).get()
    assert (expected == gathered.get()).all()
def test_take(ctx_factory):
    """Gather from ``a[k] = 3*k`` through ``idx`` and expect ``3*idx``."""
    cq = cl.CommandQueue(ctx_factory())

    indices = cl_array.arange(cq, 0, 200000, 2, dtype=np.uint32)
    values = cl_array.arange(cq, 0, 600000, 3, dtype=np.float32)

    gathered = cl_array.take(values, indices)
    expected = (3 * indices).get()
    assert (expected == gathered.get()).all()
def arange(start, stop, step, dtype=np.int32, backend='cython'):
    """Build ``[start, stop)`` with stride ``step`` on the chosen backend.

    ``'opencl'`` and ``'cuda'`` create the array on the device; every other
    backend uses ``np.arange``.  The result is passed through
    ``wrap_array(out, backend)``.
    """
    if backend == 'cuda':
        import pycuda.gpuarray as gpuarray
        data = gpuarray.arange(start, stop, step, dtype=dtype)
    elif backend == 'opencl':
        import pyopencl.array as gpuarray
        from .opencl import get_queue
        data = gpuarray.arange(get_queue(), start, stop, step, dtype=dtype)
    else:
        data = np.arange(start, stop, step, dtype=dtype)
    return wrap_array(data, backend)
def test_ldexp(ctx_factory):
    """Compare ``clmath.ldexp`` element-wise with ``math.ldexp``."""
    queue = cl.CommandQueue(ctx_factory())

    for n in sizes:
        mantissa_dev = cl_array.arange(queue, n, dtype=np.float32)
        exponent_dev = cl_array.arange(queue, n, dtype=np.float32) * 1e-3
        got = clmath.ldexp(mantissa_dev, exponent_dev).get()

        mantissa = mantissa_dev.get()
        exponent = exponent_dev.get()
        for k in range(n):
            expected = math.ldexp(mantissa[k], int(exponent[k]))
            assert expected == got[k]
def test_atan2pi(ctx_factory):
    """Compare ``clmath.atan2pi`` with ``math.atan2(y, x) / pi``."""
    queue = cl.CommandQueue(ctx_factory())

    for n in sizes:
        y_dev = (cl_array.arange(queue, n, dtype=np.float32)
                 - np.float32(n / 2)) / 100
        x_dev = (n / 2 - 1
                 - cl_array.arange(queue, n, dtype=np.float32)) / 100
        got = clmath.atan2pi(y_dev, x_dev).get()

        y_host = y_dev.get()
        x_host = x_dev.get()
        for k in range(n):
            expected = math.atan2(y_host[k], x_host[k]) / math.pi
            assert abs(expected - got[k]) < 1e-6
def test_fmod(ctx_factory):
    """Compare ``clmath.fmod`` element-wise with ``math.fmod``."""
    queue = cl.CommandQueue(ctx_factory())

    for n in sizes:
        num_dev = cl_array.arange(queue, n, dtype=np.float32) / 10
        den_dev = cl_array.arange(queue, n, dtype=np.float32) / 45.2 + 0.1
        got = clmath.fmod(num_dev, den_dev).get()

        num_host = num_dev.get()
        den_host = den_dev.get()
        for k in range(n):
            assert math.fmod(num_host[k], den_host[k]) == got[k]
def test_fmod(ctx_getter):
    """Compare ``clmath.fmod`` element-wise with ``math.fmod``."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    for n in sizes:
        num_dev = cl_array.arange(context, queue, n, dtype=numpy.float32)/10
        den_dev = cl_array.arange(
            context, queue, n, dtype=numpy.float32)/45.2 + 0.1
        got = clmath.fmod(num_dev, den_dev).get()

        num_host = num_dev.get()
        den_host = den_dev.get()
        for k in range(n):
            assert math.fmod(num_host[k], den_host[k]) == got[k]
def test_ldexp(ctx_getter):
    """Compare ``clmath.ldexp`` element-wise with ``math.ldexp``."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    for n in sizes:
        mantissa_dev = cl_array.arange(context, queue, n, dtype=numpy.float32)
        exponent_dev = cl_array.arange(
            context, queue, n, dtype=numpy.float32)*1e-3
        got = clmath.ldexp(mantissa_dev, exponent_dev).get()

        mantissa = mantissa_dev.get()
        exponent = exponent_dev.get()
        for k in range(n):
            expected = math.ldexp(mantissa[k], int(exponent[k]))
            assert expected == got[k]
def test(ctx_factory):
    """Compare the ``clmath`` function ``name`` with its NumPy counterpart.

    ``name``, ``a``, ``b``, ``threshold``, ``use_complex``, ``sizes`` and
    ``numpy_func_names`` come from the enclosing scope.  A float
    ``use_complex`` doubles as the error threshold for complex dtypes.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    gpu_func = getattr(clmath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))

    # Real types first, then complex ones; the double-precision variants are
    # only included when the device supports them.
    with_double = has_double_support(context.devices[0])
    dtypes = [np.float32]
    if with_double:
        dtypes.append(np.float64)
    if use_complex:
        dtypes.append(np.complex64)
        if with_double:
            dtypes.append(np.complex128)

    for s in sizes:
        for requested in dtypes:
            dtype = np.dtype(requested)
            is_complex = dtype.kind == "c"

            args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
            if is_complex:
                args = args + dtype.type(1j) * args

            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())

            if is_complex and isinstance(use_complex, float):
                my_threshold = use_complex
            else:
                my_threshold = threshold

            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= my_threshold).all(), (max_err, name, dtype)
def setup_arrays(self, nrays, nsamples, cutoff):
    """Return the device arrays for ``(nrays, nsamples, cutoff)``.

    Arrays are created once per parameter triple and then served from
    ``self.array_cache``.
    """
    key = (nrays, nsamples, cutoff)
    if key in self.array_cache:
        return self.array_cache[key]

    bundle = ArraySet()
    bundle.scratch = cla.empty(
        self.queue, (nsamples, nrays), dtype=np.float32,
        allocator=self.memory_pool)
    bundle.result = cla.empty(
        self.queue, (nrays, ), dtype=np.int32,
        allocator=self.memory_pool)
    bundle.pre_cutoff = cla.empty(
        self.queue, (nrays, cutoff), dtype=np.float32,
        allocator=self.memory_pool)
    bundle.pre_cutoff_squared = cla.empty_like(bundle.pre_cutoff)
    # One running index per element of the pre-cutoff array.
    bundle.idx = cla.arange(
        self.queue, 0, cutoff * nrays, 1, dtype=np.int32,
        allocator=self.memory_pool)

    self.array_cache[key] = bundle
    return bundle
def sim_health_index(n_runs):
    """Simulate ``n_runs`` health-index walks on the GPU and print the timing.

    Each walk has 4160 steps.  Normal random numbers are generated on the
    device and combined by a segmented scan; the result is copied back to the
    host.  The function prints the elapsed time and returns ``None``.
    """
    # Set up OpenCL context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mem_pool = cltools.MemoryPool(cltools.ImmediateAllocator(queue))

    t0 = time.time()

    rho = 0.5
    mu = 3.0
    sigma = 1.0
    n_steps = 4160

    # Generate normal random numbers on the GPU, n_runs * n_steps in total
    rand_gen = clrand.PhiloxGenerator(ctx)
    ran = rand_gen.normal(queue, (n_runs * n_steps), np.float32,
                          mu=0, sigma=sigma)

    # Flag the first step of every walk so that the scan restarts there and
    # never combines values across two walks.
    seg_boundaries = [1] + [0] * (n_steps - 1)
    seg_boundaries = np.array(seg_boundaries, dtype=np.uint8)
    seg_boundary_flags = np.tile(seg_boundaries, int(n_runs))
    seg_boundary_flags = cl_array.to_device(queue, seg_boundary_flags)

    # Segmented scan: the input is ary[i]+mu at a segment start and
    # ary[i]+(1-rho)*mu otherwise; within a segment the running value is
    # rho*a+b.
    prefix_sum = GenericScanKernel(
        ctx, np.float32,
        arguments="__global float *ary, __global char *segflags, "
        "__global float *out, float rho, float mu",
        input_expr="segflags[i] ? (ary[i]+mu):(ary[i]+(1-rho)*mu)",
        scan_expr="across_seg_boundary ? (b):(rho*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    # The scan writes every out[i], so the buffer needs no initial contents.
    dev_result = cl_array.empty(queue, len(ran), dtype=np.float32,
                                allocator=mem_pool)

    # Enqueue and run the scan kernel
    prefix_sum(ran, seg_boundary_flags, dev_result, rho, mu)

    # Copy the results to the host, one column per walk.  The array is not
    # used further here, but the copy keeps the device work inside the
    # timed region.
    health_index_all = (dev_result.get().reshape(n_runs, n_steps).transpose())

    final_time = time.time()
    time_elapsed = final_time - t0

    print("Simulated %d Health Index in: %f seconds" % (n_runs, time_elapsed))
def align_and_damp(self, comps_align):
    """Reorder the components in ``comps_align`` and reset the sort index.

    With no particles left (``Np_stay == 0``) the listed components and
    ``'sort_indx'`` are replaced by empty device arrays.  Otherwise each
    component is copied into a new buffer of length ``Np_stay`` by the
    alignment kernel and ``'sort_indx'`` becomes ``0 .. Np_stay-1``.
    """
    n_stay = self.Args['Np_stay']

    if n_stay == 0:
        for key in comps_align + ['sort_indx', ]:
            self.DataDev[key] = self.dev_arr(
                shape=0, dtype=self.DataDev[key].dtype)
        self.reset_num_parts()
        return

    WGS, WGS_tot = self.get_wgs(n_stay)

    for key in comps_align:
        source = self.DataDev[key]
        aligned = self.dev_arr(dtype=source.dtype, shape=(n_stay, ))
        event = self._data_align_dbl_knl(
            self.queue, (WGS_tot, ), (WGS, ),
            source.data, aligned.data,
            self.DataDev['sort_indx'].data,
            np.uint32(n_stay))
        event.wait()
        self.DataDev[key] = aligned

    self.DataDev['sort_indx'] = arange(
        self.queue, 0, n_stay, 1, dtype=np.uint32)
    self.reset_num_parts()
def test(ctx_getter):
    """Compare the ``clmath`` function ``name`` with its NumPy counterpart.

    ``name``, ``a``, ``b``, ``threshold``, ``sizes`` and ``numpy_func_names``
    come from the enclosing scope.  Every dtype in ``dtypes`` is exercised.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    gpu_func = getattr(clmath, name)
    cpu_func = getattr(numpy, numpy_func_names.get(name, name))

    if has_double_support(context.devices[0]):
        dtypes = [numpy.float32, numpy.float64]
    else:
        dtypes = [numpy.float32]

    for s in sizes:
        for dtype in dtypes:
            # Use the loop's dtype; a fixed float32 here would leave the
            # double-precision path untested.
            args = cl_array.arange(context, queue, a, b, (b - a) / s,
                                   dtype=dtype)
            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())

            max_err = numpy.max(numpy.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), \
                (max_err, name, dtype)
def test_arange(ctx_factory):
    """A device ``arange`` of 5000 floats must equal ``np.arange``."""
    queue = cl.CommandQueue(ctx_factory())

    count = 5000
    on_device = cl_array.arange(queue, count, dtype=np.float32)
    reference = np.arange(count, dtype=np.float32)
    assert (reference == on_device.get()).all()
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Bitonic arg-sort must order random data the way ``np.sort`` does.

    The context comes from ``cl.create_some_context()``; the ``ctx_factory``
    argument is not used.  Known-bad platform combinations are xfailed.
    """
    import sys

    running_pypy = "__pypy__" in sys.builtin_module_names
    if running_pypy and not size:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        pytest.xfail("bitonic argsort fails on POCL + Nvidia,"
                     "at least the K40, as of pocl 1.6, 2021-01-20")

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    on_pocl = platform_name == "Portable Computing Language"

    if on_pocl and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if (on_pocl and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")
    if platform_name == "Intel(R) OpenCL" and size == 0:
        pytest.xfail("size-0 arange fails on Intel CL")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if on_pocl and cl.get_cl_header_version() < (1, 2):
        ms.finish()
        index.finish()
    ms, evt = sorterm(ms, idx=index, axis=0)

    unsorted = m.get()
    assert np.array_equal(np.sort(unsorted), ms.get())

    # A direct comparison of index.get() with np.argsort may fail when the
    # data contains identical values, so compare the values selected by the
    # two index sets instead.
    assert np.array_equal(unsorted[np.argsort(unsorted)],
                          unsorted[index.get()])
def setup_arrays(self, nrays, nsamples, cutoff):
    """Fetch or create the device arrays for one parameter triple.

    The triple ``(nrays, nsamples, cutoff)`` is the key into
    ``self.array_cache``; on a miss a new ``ArraySet`` is filled, stored and
    returned.
    """
    cache_key = (nrays, nsamples, cutoff)
    if cache_key in self.array_cache:
        return self.array_cache[cache_key]

    queue = self.queue
    pool = self.memory_pool

    created = ArraySet()
    created.scratch = cla.empty(queue, (nsamples, nrays),
                                dtype=np.float32, allocator=pool)
    created.result = cla.empty(queue, (nrays,),
                               dtype=np.int32, allocator=pool)
    created.pre_cutoff = cla.empty(queue, (nrays, cutoff),
                                   dtype=np.float32, allocator=pool)
    created.pre_cutoff_squared = cla.empty_like(created.pre_cutoff)
    created.idx = cla.arange(queue, 0, cutoff * nrays, 1,
                             dtype=np.int32, allocator=pool)

    self.array_cache[cache_key] = created
    return created
def test_arange(ctx_getter):
    """A device ``arange`` of 5000 floats must equal ``numpy.arange``."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    count = 5000
    on_device = cl_array.arange(context, queue, count, dtype=numpy.float32)
    reference = numpy.arange(count, dtype=numpy.float32)
    assert (reference == on_device.get()).all()
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Bitonic arg-sort must order random data the way ``np.sort`` does.

    The context comes from ``cl.create_some_context()``; the ``ctx_factory``
    argument is not used.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if platform_name == "Portable Computing Language" and dtype == np.float64:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)
    ms, evt = sorterm(m.copy(), idx=index, axis=0)

    unsorted = m.get()
    assert np.array_equal(np.sort(unsorted), ms.get())

    # Comparing index.get() with np.argsort directly may fail when the data
    # contains identical values; compare the selected values instead.
    assert np.array_equal(unsorted[np.argsort(unsorted)],
                          unsorted[index.get()])
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random device data with ``BitonicSort`` and check it via NumPy.

    The context comes from ``cl.create_some_context()``; the ``ctx_factory``
    argument is not used.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    is_apple_cpu = (dev.platform.name == "Apple"
                    and dev.type & cl.device_type.CPU)
    if is_apple_cpu:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    is_pocl_double = (dev.platform.name == "Portable Computing Language"
                      and dtype == np.float64)
    if is_pocl_double:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size, ), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)
    ms, evt = sorterm(m.copy(), idx=index, axis=0)

    host_values = m.get()
    assert np.array_equal(np.sort(host_values), ms.get())

    # index.get() need not equal np.argsort when values repeat, so the check
    # is done on the values picked by each index set.
    by_numpy = host_values[np.argsort(host_values)]
    by_device = host_values[index.get()]
    assert np.array_equal(by_numpy, by_device)
def arange(start, stop, step, dtype=None, backend='cython'):
    """Return an ``Array`` holding ``[start, stop)`` with stride ``step``.

    For ``'opencl'`` and ``'cuda'`` the data is created on the device and
    attached with ``set_dev_array``; any other backend wraps ``np.arange``.
    """
    if backend not in ('opencl', 'cuda'):
        return Array(np.arange(start, stop, step, dtype=dtype))

    if backend == 'opencl':
        import pyopencl.array as gpuarray
        device_data = gpuarray.arange(get_queue(), start, stop, step,
                                      dtype=dtype)
    else:
        import pycuda.gpuarray as gpuarray
        device_data = gpuarray.arange(start, stop, step, dtype=dtype)

    result = Array()
    result.set_dev_array(device_data)
    return result
def test_mem_pool_with_arrays(ctx_factory):
    """Arrays created with a memory pool must keep it as their allocator."""
    queue = cl.CommandQueue(ctx_factory())
    pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))

    from_arange = cl_array.arange(queue, 2000, dtype=np.float32,
                                  allocator=pool)
    host_values = np.arange(2000)
    # The sum is a new array; it is expected to inherit the allocator.
    shifted = cl_array.to_device(queue, host_values, allocator=pool) + 4000

    for dev_ary in (from_arange, shifted):
        assert dev_ary.allocator is pool
def test_reduction_not_first_argument(ctx_factory):
    """Reduction kernel whose first argument is a scalar, not a vector.

    See https://github.com/inducer/pyopencl/issues/535
    """
    import pytest as _pytest
    _pytest.importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    count = 400
    x_dev = cl_array.arange(queue, count, dtype=np.float32)
    y_dev = cl_array.arange(queue, count, dtype=np.float32)

    from pyopencl.reduction import ReductionKernel
    scaled_dot = ReductionKernel(
        context, np.float32, neutral="0",
        reduce_expr="a+b", map_expr="z*x[i]*y[i]",
        arguments="float z, __global float *x, __global float *y")

    got = scaled_dot(0.1, x_dev, y_dev).get()
    expected = 0.1*np.sum(np.arange(count)**2)
    assert abs(got - expected) < 1e-4
def init_indices_buffers(self, image_width, image_height, kernels):
    """Allocate the host and device index buffers.

    Parameters
    ----------
    image_width, image_height
        Accepted but not used by this method.
    kernels : mapping
        Two-level mapping ``cell -> centre -> ...``.  Only its keys are used:
        one zeroed int32 host buffer shaped like ``self.source_host_buffer``
        is created per ``(cell, centre)`` pair.
    """
    self.indices_host_buffer = numpy.arange(self.array_size,
                                            dtype=numpy.int32)
    self.indices_gpu_buffer = cl_array.arange(self.queue, 0, self.array_size,
                                              dtype=numpy.int32)
    self.sorted_indices_gpu_buffer = cl_array.zeros_like(
        self.indices_gpu_buffer)

    self.indices_host_back_buffers = {}
    for cell, centres in kernels.items():
        self.indices_host_back_buffers[cell] = {
            centre: numpy.zeros_like(self.source_host_buffer,
                                     dtype=numpy.int32)
            for centre in centres
        }
def test_multi_put(ctx_factory):
    """Scatter several source arrays through one index array.

    Every output array is compared with the NumPy result of the same indexed
    assignment.
    """
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n_arrays = 9
    n_values = 6

    # Sources, indices and expected values all have the same length, so each
    # index has exactly one value to scatter.
    cl_arrays = [
        cl_array.arange(queue, 0, n_values, dtype=np.float32)
        for _ in range(n_arrays)
    ]
    idx = cl_array.arange(queue, 0, n_values, dtype=np.int32)
    out_arrays = [
        cl_array.zeros(queue, (10, ), np.float32) for _ in range(n_arrays)
    ]

    host_idx = idx.get()
    out_compare = [np.zeros((10, ), np.float32) for _ in range(n_arrays)]
    for ary in out_compare:
        ary[host_idx] = np.arange(0, n_values, dtype=np.float32)

    cl_array.multi_put(cl_arrays, idx, out=out_arrays)

    # Built-in all(): np.all() applied to a generator object is always
    # truthy, so it would never report a mismatch.
    assert all(
        np.array_equal(expected, actual.get())
        for expected, actual in zip(out_compare, out_arrays))
def get_coefficients(signal, fs, cfs):
    """
    Executes all of the gammatone logic. Iterates over each input central
    frequency and executes the filter in OpenCL.

    :param np.array signal: input signal
    :param int fs: sampling frequency.
    :param iter cfs: Some iterator of central frequencies you'd like to
        filter the wave by.
    :return: 2-D array with one row per central frequency, each row being
        the host copy of that frequency's ``bm_g`` buffer.
    """
    samp_g = cla.to_device(q, signal)
    tpt = np.float64((M_PI + M_PI) / fs)
    # Sample indices 0 .. len(signal)-1 on the device.
    ts_g = cla.arange(q, 0, len(signal), dtype=np.float64)

    coefficients = []
    for cf in cfs:
        # Calculating the parameters for the given center frequencies
        tptbw = tpt * erb(cf) * BW_CORRECTION
        decay = np.exp(-tptbw)
        gain = np.float64(tptbw)

        # Setting up memory for everything.
        qcos = cla.empty_like(ts_g)
        qsin = cla.empty_like(ts_g)
        bm_g = cla.empty_like(ts_g)
        p0r_g = cla.empty_like(ts_g)
        p0i_g = cla.empty_like(ts_g)
        ur_g = cla.empty_like(ts_g)
        ui_g = cla.empty_like(ts_g)

        # Preparing the imaginary/real cyclical effect
        q_maker(qcos, qsin, ts_g, tpt, cf)
        cosx = samp_g * qcos
        sinx = samp_g * qsin

        # Performing the filtering operation
        p0(cosx, p0r_g, decay)
        p0(sinx, p0i_g, decay)

        # Preparing the memory to calculate basilar membrane displacement.
        # Each buffer is a copy of the previous one passed through shift().
        p1r_g = p0r_g.copy(q)
        shift(p1r_g)
        p2r_g = p1r_g.copy(q)
        shift(p2r_g)

        p1i_g = p0i_g.copy(q)
        shift(p1i_g)
        p2i_g = p1i_g.copy(q)
        shift(p2i_g)

        # Calculating Basilar Membrane displacement
        u0(p0r_g, p1r_g, p2r_g, ur_g, decay)
        u0(p0i_g, p1i_g, p2i_g, ui_g, decay)
        bm_maker(ur_g, ui_g, bm_g, gain)

        # Append to the list
        cl_x = bm_g.get()
        coefficients.append(cl_x)

    # np.vstack is the function np.row_stack aliases; the alias is
    # deprecated in recent NumPy releases.
    return np.vstack(coefficients)
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=float_):
    """Return ``num`` evenly spaced samples as a ``myclArray``.

    Parameters
    ----------
    start, stop : scalar
        Interval bounds.
    num : int, optional
        Number of samples.  For ``num < 2`` the result is ``array([start])``.
    endpoint : bool, optional
        If true, ``stop`` is the last sample.
    retstep : bool, optional
        Accepted for signature compatibility; currently ignored.
    dtype : data-type, optional
        Element type of the generated device array.
    """
    # TODO: create native function
    if num < 2:
        return array([start])

    mnum = num - 1 if endpoint else num
    diff = (stop - start) / mnum
    if endpoint:
        # arange excludes its upper bound, so push it one step further to
        # keep ``stop`` itself in the result.
        stop = stop + diff

    # Rounding may produce one sample too many; the slice trims it.
    res = clarray.arange(queue, start, stop, diff, dtype=dtype)[:num]
    res.__class__ = myclArray
    res.reinit()
    return res
def test_scalar_array_take_offset(ctx_factory):
    """Pass element 13 of a device array as a scalar kernel argument.

    The kernel multiplies it by 133, so the expected output is 1729.
    """
    import pyopencl.array as cla

    queue = cl.CommandQueue(ctx_factory())

    knl = lp.make_kernel(
        "{:}",
        """ y = 133*x """,
        [lp.GlobalArg("x", shape=(), offset=lp.auto), ...])

    base = cla.arange(queue, 42, dtype=np.int32)
    # Indexing gives a zero-dimensional view into ``base``.
    scalar_in = base[13]

    evt, (out, ) = knl(queue, x=scalar_in)

    np.testing.assert_allclose(out.get(), 1729)
def build_scratch(self, imshape):
    """Allocate the index array and the per-level scratch buffers.

    ``self.array_indices`` gets one int32 index per element of ``imshape``.
    ``self.scratch`` (float32) and ``self.index_scratch`` (int32) get one
    buffer per level, each level being ``self.runlen`` times shorter than the
    previous one.
    """
    self.scratch = []
    self.index_scratch = []

    # Plain Python int so the shape arithmetic below stays integral.
    length = int(np.prod(imshape))
    self.array_indices = cla.arange(self.queue, 0, length, 1, dtype=np.int32)

    if length % self.runlen != 0:
        # NOTE(review): this adds the remainder, which does not in general
        # round up to a multiple of runlen - confirm the intent.
        length += length % self.runlen

    while length > 1:
        # Floor division: true division would give a float, and buffer
        # shapes must be integers.
        length //= self.runlen
        self.scratch.append(cla.empty(self.queue, (length, ), np.float32))
        self.index_scratch.append(cla.empty(self.queue, (length, ), np.int32))

    self.imshape = imshape
def test_multi_put(ctx_factory):
    """Scatter several source arrays through one index array.

    Every output array is compared with the NumPy result of the same indexed
    assignment.
    """
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n_arrays = 9
    n_values = 6

    # Sources, indices and expected values all have the same length; NumPy
    # refuses to assign three values to six indexed slots.
    cl_arrays = [
        cl_array.arange(queue, 0, n_values, dtype=np.float32)
        for _ in range(n_arrays)
    ]
    idx = cl_array.arange(queue, 0, n_values, dtype=np.int32)
    out_arrays = [
        cl_array.zeros(queue, (10,), np.float32)
        for _ in range(n_arrays)
    ]

    host_idx = idx.get()
    out_compare = [np.zeros((10,), np.float32) for _ in range(n_arrays)]
    for ary in out_compare:
        ary[host_idx] = np.arange(0, n_values, dtype=np.float32)

    cl_array.multi_put(cl_arrays, idx, out=out_arrays)

    # Built-in all(): np.all() applied to a generator object is always
    # truthy, so it would never report a mismatch.
    assert all(
        np.array_equal(expected, actual.get())
        for expected, actual in zip(out_compare, out_arrays))
def build_scratch(self, imshape):
    """Create ``self.array_indices`` and the per-level scratch buffers.

    One int32 index is created per element of ``imshape``.  Then, level by
    level, a float32 buffer is appended to ``self.scratch`` and an int32
    buffer to ``self.index_scratch``; each level is ``self.runlen`` times
    shorter than the previous one.
    """
    self.scratch = []
    self.index_scratch = []

    # Convert to a Python int so the later divisions stay integral.
    n_items = int(np.prod(imshape))
    self.array_indices = cla.arange(self.queue, 0, n_items, 1, dtype=np.int32)

    remainder = n_items % self.runlen
    if remainder != 0:
        # NOTE(review): adding the remainder does not in general produce a
        # multiple of runlen - confirm whether a round-up was intended.
        n_items += remainder

    while n_items > 1:
        # Integer division; a float here would be an invalid buffer shape.
        n_items //= self.runlen
        self.scratch.append(cla.empty(self.queue, (n_items,), np.float32))
        self.index_scratch.append(cla.empty(self.queue, (n_items,), np.int32))

    self.imshape = imshape
def arange(start, stop=0):
    """Return ``[start, stop)`` with step 1 as a ``myclArray``.

    With a single argument (or a falsy ``stop``) the range is
    ``[0, start)``.  The dtype is ``float_`` when either bound is a float,
    ``np.uint32`` for a non-negative start and ``np.int32`` otherwise.
    """
    if not stop:
        stop = start
        start = 0

    float_types = (float, np.floating)
    if isinstance(start, float_types) or isinstance(stop, float_types):
        dtype = float_
    elif start >= 0:
        dtype = np.uint32
    else:
        dtype = np.int32

    res = clarray.arange(queue, start, stop, 1, dtype=dtype)
    res.__class__ = myclArray
    res.reinit()
    return res
def test_abs(ctx_getter):
    """``abs`` of the negated range 0..110 must restore the range."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    count = 111
    dev = -cl_array.arange(context, queue, count, dtype=numpy.float32)

    negated = dev.get()
    for k in range(count):
        assert negated[k] <= 0

    dev = abs(dev)

    restored = dev.get()
    for k in range(count):
        assert abs(restored[k]) >= 0
        assert restored[k] == k
def test_modf(ctx_getter):
    """Compare ``clmath.modf`` with ``math.modf`` element by element."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    for n in sizes:
        values_dev = cl_array.arange(
            context, queue, n, dtype=numpy.float32)/10
        frac_dev, int_dev = clmath.modf(values_dev)

        values = values_dev.get()
        int_got = int_dev.get()
        frac_got = frac_dev.get()

        for k in range(n):
            frac_want, int_want = math.modf(values[k])
            assert int_want == int_got[k]
            assert abs(frac_want - frac_got[k]) < 1e-4
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Bitonic arg-sort must order random data the way ``np.sort`` does.

    The context comes from ``cl.create_some_context()``; the ``ctx_factory``
    argument is not used.  Known-bad platform combinations are xfailed.
    """
    import sys

    running_pypy = '__pypy__' in sys.builtin_module_names
    if running_pypy and not size:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    on_pocl = platform_name == "Portable Computing Language"

    if on_pocl and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if (on_pocl and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size, ), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)
    ms, evt = sorterm(m.copy(), idx=index, axis=0)

    unsorted = m.get()
    assert np.array_equal(np.sort(unsorted), ms.get())

    # Comparing index.get() with np.argsort directly may fail when the data
    # contains identical values; compare the selected values instead.
    assert np.array_equal(unsorted[np.argsort(unsorted)],
                          unsorted[index.get()])
def test_frexp(ctx_getter):
    """Compare ``clmath.frexp`` with ``math.frexp`` element by element."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    for n in sizes:
        values_dev = cl_array.arange(
            context, queue, n, dtype=numpy.float32)/10
        sig_dev, exp_dev = clmath.frexp(values_dev)

        values = values_dev.get()
        sig_got = sig_dev.get()
        exp_got = exp_dev.get()

        for k in range(n):
            sig_want, exp_want = math.frexp(values[k])
            assert sig_want == sig_got[k]
            assert exp_want == exp_got[k]
def test_abs(ctx_getter):
    """Negate the range 0..110, take ``abs`` and expect the range back."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    n_items = 111
    positions = range(n_items)

    dev_ary = -cl_array.arange(context, queue, n_items, dtype=numpy.float32)
    host = dev_ary.get()
    for pos in positions:
        assert host[pos] <= 0

    dev_ary = abs(dev_ary)
    host = dev_ary.get()
    for pos in positions:
        assert abs(host[pos]) >= 0
        assert host[pos] == pos
def test_frexp(ctx_factory):
    """Compare ``clmath.frexp`` with ``math.frexp`` element by element."""
    queue = cl.CommandQueue(ctx_factory())

    for n in sizes:
        values_dev = cl_array.arange(queue, n, dtype=np.float32) / 10
        sig_dev, exp_dev = clmath.frexp(values_dev)

        values = values_dev.get()
        sig_got = sig_dev.get()
        exp_got = exp_dev.get()

        for k in range(n):
            sig_want, exp_want = math.frexp(values[k])
            assert sig_want == sig_got[k]
            assert exp_want == exp_got[k]
def test_modf(ctx_factory):
    """Compare ``clmath.modf`` with ``math.modf`` element by element."""
    queue = cl.CommandQueue(ctx_factory())

    for n in sizes:
        values_dev = cl_array.arange(queue, n, dtype=np.float32) / 10
        frac_dev, int_dev = clmath.modf(values_dev)

        values = values_dev.get()
        int_got = int_dev.get()
        frac_got = frac_dev.get()

        for k in range(n):
            frac_want, int_want = math.modf(values[k])
            assert int_want == int_got[k]
            assert abs(frac_want - frac_got[k]) < 1e-4
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random device data with ``BitonicSort`` and check it via NumPy.

    The context comes from ``cl.create_some_context()``; the ``ctx_factory``
    argument is not used.  Known-bad platform combinations are xfailed.
    """
    import sys

    if not size and '__pypy__' in sys.builtin_module_names:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    pocl_name = "Portable Computing Language"

    if dev.platform.name == pocl_name and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if (dev.platform.name == pocl_name
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)
    ms, evt = sorterm(m.copy(), idx=index, axis=0)

    host_values = m.get()
    assert np.array_equal(np.sort(host_values), ms.get())

    # index.get() need not equal np.argsort when values repeat, so the check
    # is done on the values picked by each index set.
    by_numpy = host_values[np.argsort(host_values)]
    by_device = host_values[index.get()]
    assert np.array_equal(by_numpy, by_device)
def test(ctx_factory):
    """Compare the ``clmath`` function ``name`` with its NumPy counterpart.

    ``name``, ``a``, ``b``, ``threshold``, ``use_complex``, ``sizes`` and
    ``numpy_func_names`` come from the enclosing scope.  A float
    ``use_complex`` doubles as the error threshold for complex dtypes.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    gpu_func = getattr(clmath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))

    dev = context.devices[0]
    with_double = has_double_support(dev)

    # Real types first, then complex ones.  complex128 needs double support
    # and is left out when the struct-arg-count bug is reported as "apple".
    dtypes = [np.float32]
    if with_double:
        dtypes.append(np.float64)
    if use_complex:
        dtypes.append(np.complex64)
        if with_double and has_struct_arg_count_bug(dev) != "apple":
            dtypes.append(np.complex128)

    for s in sizes:
        for requested in dtypes:
            dtype = np.dtype(requested)
            is_complex = dtype.kind == "c"

            args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
            if is_complex:
                # The array is the left operand, the scalar the right one.
                args = args + args * dtype.type(1j)

            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())

            if is_complex and isinstance(use_complex, float):
                my_threshold = use_complex
            else:
                my_threshold = threshold

            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= my_threshold).all(), \
                (max_err, name, dtype)
def test(ctx_getter):
    """Compare the ``clmath`` function ``name`` with its NumPy counterpart.

    ``name``, ``a``, ``b``, ``threshold``, ``sizes`` and ``numpy_func_names``
    come from the enclosing scope.  Every dtype in ``dtypes`` is exercised.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    gpu_func = getattr(clmath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))

    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]
    else:
        dtypes = [np.float32]

    for s in sizes:
        for dtype in dtypes:
            # Use the loop's dtype; a fixed float32 here would leave the
            # double-precision path untested.
            args = cl_array.arange(queue, a, b, (b-a)/s, dtype=dtype)
            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())

            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), \
                (max_err, name, dtype)
# NOTE(review): ``cl`` is used below but its import is not visible in this
# part of the file; importing it here is harmless if it already exists.
import pyopencl as cl
import pyopencl.array as cl_array
import numpy
import numpy.linalg as la
import scipy as sp
from pyopencl.reduction import ReductionKernel
from pyopencl.elementwise import ElementwiseKernel

# Make sure we use the GPU for computations.
platform = cl.get_platforms()
gpu_devices = platform[0].get_devices(device_type=cl.device_type.GPU)
ctx = cl.Context(devices=gpu_devices)
queue = cl.CommandQueue(ctx)

# Integration, one integral, using reduce.
# ``length`` is the step between sample points on [0, 100).  The kernel works
# on ``double``, which is numpy.float64 (NumPy has no ``double32`` type).
length = 0.001
vals = cl_array.arange(queue, 0, 100, length, dtype=numpy.float64)

# Kernel for the reduce code: sums get_val(x[i]) * length over all samples.
krnlRed = ReductionKernel(
    ctx, numpy.float64, neutral="0",
    reduce_expr="a+b", map_expr="get_val(x[i])*%10.3f" % length,
    arguments="__global double *x",
    preamble=""" double get_val(double x) { return x*x; } """)

# Generation of an array where each element is an evaluated integral.
tonum = 1000000  # Number of elements.

# Array to send to the GPU.  to_device takes the queue first, matching the
# arange call above.
p_gpu = cl_array.to_device(
    queue, numpy.linspace(0, tonum, tonum+1).astype(numpy.float64))
# GPU: Define Segmented Scan Kernel, scanning simulations prefix_sum = GenericScanKernel( ctx, np.float32, arguments="__global float *ary, __global char *segflags, " "__global float *out, float rho, float mu", input_expr="segflags[i] ? (ary[i]+mu):(ary[i]+(1-rho)*mu)", scan_expr="across_seg_boundary ? (b):(rho*a+b)", neutral="0", is_segment_start_expr="segflags[i]", output_statement="out[i] =(item>0) ? (0):(1)", options=[]) #set memory for results dev_result = cl_array.arange(queue, len(ran), dtype=np.float32, allocator=mem_pool) #set parameters and call minimize function x0 = np.zeros(1) x0[0] = 0.1 #initial set-up of rho xmin = -0.95 xmax = 0.95 rhomini = minimize(mini_parallel, x0, method='COBYLA', bounds=((xmin, xmax), ), options={'rhobeg': 0.01}) #report the results after minimization final_time = time.time()
def sim_health_index(n_runs):
    """Find the rho that maximises the average time to the first negative value.

    Simulates ``n_runs`` walks of 4160 steps on the GPU for 200 rho values in
    [-0.95, 0.95].  For each rho it averages, over all walks, the first step
    whose value is not positive (or the walk length if there is none).  The
    averages are plotted to ``GPU_rho_avgt_nruns<n_runs>.png`` and the best
    rho is printed.  Returns ``None``.
    """
    # Set up OpenCL context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mem_pool = cltools.MemoryPool(cltools.ImmediateAllocator(queue))

    t0 = time.time()

    mu = 3.0
    n_steps = 4160

    # Generate normal random numbers on the GPU, n_runs * n_steps in total
    rand_gen = clrand.PhiloxGenerator(ctx)
    ran = rand_gen.normal(queue, (n_runs * n_steps), np.float32,
                          mu=0, sigma=1.0)

    # Flag the first step of every walk so that the scan restarts there and
    # never combines values across two walks.
    seg_boundaries = [1] + [0] * (n_steps - 1)
    seg_boundaries = np.array(seg_boundaries, dtype=np.uint8)
    seg_boundary_flags = np.tile(seg_boundaries, int(n_runs))
    seg_boundary_flags = cl_array.to_device(queue, seg_boundary_flags)

    # Segmented scan computing rho*f(n-1)+(1-rho)*mu+ran.  The output is a
    # flag: 0 where the scanned value is positive, 1 otherwise.
    prefix_sum = GenericScanKernel(
        ctx, np.float32,
        arguments="__global float *ary, __global char *segflags, "
        "__global float *out, float rho, float mu",
        input_expr="segflags[i] ? (ary[i]+mu):(ary[i]+(1-rho)*mu)",
        scan_expr="across_seg_boundary ? (b):(rho*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] =(item>0) ? (0):(1)",
        options=[])

    # The scan writes every out[i], so the buffer needs no initial contents.
    dev_result = cl_array.empty(queue, len(ran), dtype=np.float32,
                                allocator=mem_pool)

    # Iterate over 200 rho values
    rho_set = np.linspace(-0.95, 0.95, 200)
    rho_avgt_t = []
    for rho in rho_set:
        # Enqueue and run the scan kernel
        prefix_sum(ran, seg_boundary_flags, dev_result, rho, mu)

        # One row of flags per walk
        flagged = dev_result.get().reshape(n_runs, n_steps) == 1

        # Index of the first flagged step in each walk; walks without any
        # flagged step count as n_steps.
        first_hit = np.where(flagged.any(axis=1),
                             flagged.argmax(axis=1),
                             n_steps)
        rho_avgt_t.append(float(first_hit.mean()))

    final_time = time.time()
    time_elapsed = final_time - t0

    print("Simulated %d Health Index for 200 rho values in: %f seconds" %
          (n_runs, time_elapsed))

    plt.plot(rho_set, rho_avgt_t)
    plt.title('Averaged periods of first negative index across Rho')
    plt.xlabel('Rho')
    plt.ylabel('Avged Period of first negative index')
    plt.savefig("GPU_rho_avgt_nruns%d.png" % (n_runs))

    # First maximum, as list.index(max(...)) would give.
    best = int(np.argmax(rho_avgt_t))
    max_period = rho_avgt_t[best]
    max_rho = rho_set[best]
    print("Max Period: %f; Max Rho: %f." % (max_period, max_rho))
def test_speed(file_name , ctx_str):
    """Time the 3-D ``rk_step`` kernel for every (h, order) pair and log the results.

    The results file is first overwritten with a header line.  Then, for each
    spacing in the module-level ``hs`` and each order in ``orders``, a kernel
    is generated with ``getstring.get3d``, compiled, and run ``nSpeed`` times
    (host -> device copy, kernel launch, device -> host copy).  One row per
    combination is appended to the file.

    Args:
        file_name: Path of the results file; existing content is replaced.
        ctx_str: Value stored in the ``PYOPENCL_CTX`` environment variable
            before the OpenCL context is created.

    Returns:
        None.
    """
    os.environ["PYOPENCL_CTX"] = ctx_str

    with open(file_name, "w") as myfile:
        myfile.write("RK order , space step , number of time steps , number of points, total time\n")

    # setup stuff
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    for h in hs:
        for order in orders:
            # mesh sizes: the same spacing in all three directions
            hx = h
            hy = h
            hz = h

            # number of cell centres along each axis
            nxCenters = len(np.arange(hx/2, a + hx/2, hx))
            nyCenters = len(np.arange(hy/2, b + hy/2, hy))
            # the z axis uses its own spacing hz
            nzCenters = len(np.arange(hz/2, c + hz/2, hz))

            # The CFL number is fixed here instead of being computed from a
            # velocity field, and the initial field is all ones.
            cfl = 3

            # time step
            ht = hx/(factor*cfl)

            # float32 so the host array matches the float32 device buffers
            T = np.ones((nyCenters, nxCenters, nzCenters), dtype=np.float32)

            # Generate and compile the kernel source for this configuration
            prg_str = getstring.get3d(order, hx, hy, hz, ht,
                                      nxCenters, nyCenters, nzCenters)
            prg = cl.Program(ctx, prg_str).build()

            # create memory pools and the input/output device buffers
            n_points = nxCenters*nyCenters*nzCenters
            in_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
            Tin_d = cl_array.arange(queue, n_points, dtype=np.float32,
                                    allocator=in_pool)
            out_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
            Tout_d = cl_array.arange(queue, n_points, dtype=np.float32,
                                     allocator=out_pool)

            # start the timing
            start = time.time()

            for i in range(nSpeed):
                # Copy T into Tin_d
                Tin_d.set(T.ravel(), queue=queue)
                # Apply the kernel here
                prg.rk_step(queue, T.shape, None, Tin_d.data, Tout_d.data)
                # Copy data into T
                Tout_d.get(queue=queue, ary=T)

            # End timing for this round
            end = time.time()

            # RK order , space step , number of time steps , number of points, total time
            fields = (order, h, nSpeed, T.size, end - start)
            data = " , ".join(str(field) for field in fields) + "\n"
            with open(file_name, "a") as myfile:
                myfile.write(data)
def make_pix(rk_ord , set_up, target, h, final):
    """Advance the 2-D tracer field with the ``rk_step`` kernel and save contour frames.

    A grid of cell centres is built from the module-level extents ``a`` and
    ``b``; the velocity field (U, V) and the initial tracer distribution T are
    evaluated on it.  The kernel produced by ``getstring.get2d`` is compiled
    and applied once per time step, and every 20th step the current field is
    written to ``<target>/frame<i>.png``.

    Args:
        rk_ord: Runge-Kutta order, converted with ``int`` (1 = Euler).
        set_up: Value stored in the ``PYOPENCL_CTX`` environment variable
            before the OpenCL context is created.
        target: Directory that receives the frame images.
        h: Grid spacing, converted with ``float`` and used for both axes.
        final: Final time, converted with ``int``.

    Returns:
        None.
    """
    os.environ["PYOPENCL_CTX"] = set_up

    # RK order 1 = Euler
    order = int(rk_ord)

    # mesh sizes
    hx = float(h)
    hy = float(h)

    # center locations
    xCenters = np.arange(hx/2, a + hx/2, hx)
    nxCenters = len(xCenters)
    yCenters = np.arange(hy/2, b + hy/2, hy)
    nyCenters = len(yCenters)

    # grids and velocity field
    X, Y = np.meshgrid(xCenters, yCenters)
    U = PIB * psi * np.cos(Y * PIB) * (P * np.exp(A*X) + (1-P) * np.exp(B*X) - 1)/D
    V = - psi * np.sin(Y * PIB) * (A*P * np.exp(A*X) + B * (1-P) * np.exp(B*X))/D
    cfl = np.max(np.abs(U)) + np.max(np.abs(V))

    # times
    ht = hx/(factor*cfl)      # time step
    nt = int(int(final)/ht)   # number of time steps

    # define the initial distribution of T
    T = np.exp(-(
        ((X-mu[0])/sig[0])**4 +
        ((Y-mu[1])/sig[1])**4
        )/2.0
    )
    T = T.astype(np.float32)  # cast to float32 so it works with kernel

    # setup stuff
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # Get the kernel from a string and compile it
    prg_str = getstring.get2d(order, hx, hy, ht, nxCenters, nyCenters)
    prg = cl.Program(ctx, prg_str).build()

    # create memory pools and the input/output device buffers
    in_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
    Tin_d = cl_array.arange(queue, nxCenters*nyCenters, dtype=np.float32,
                            allocator=in_pool)
    out_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
    Tout_d = cl_array.arange(queue, nxCenters*nyCenters, dtype=np.float32,
                             allocator=out_pool)

    # do time stepping and plotting.
    for i in range(nt):
        # Plot every 20th step.  The test is on the loop index i: a test on
        # the total step count nt has the same outcome on every iteration,
        # so it plots either every step or none.
        if i % 20 == 0:
            fig = plt.figure()
            CS = plt.contour(X, Y, T, levels=levels)
            plt.clabel(CS)
            plt.title("Tracer concentration. Spatial FEM and RK" + str(order) +
                      " Time Steps.\n Step " + str(i) + ",T = " + str(i*ht) +
                      " weeks.")
            # streamplot is only drawn when this matplotlib provides it
            if hasattr(plt, "streamplot"):
                plt.streamplot(X, Y, U, V, color=U, linewidth=2,
                               cmap=plt.cm.autumn)
            fig.savefig(target + '/frame' + str(i) + '.png')
            plt.close(fig)

        # Copy T into Tin_d
        Tin_d.set(T.ravel(), queue=queue)
        # Apply the kernel here
        prg.rk_step(queue, T.shape, None, Tin_d.data, Tout_d.data)
        # Copy data into T
        Tout_d.get(queue=queue, ary=T)
# `cl` is used below; import it here so this script does not depend on an
# import made elsewhere.
import pyopencl as cl
import pyopencl.array as cl_array
import numpy
import numpy.linalg as la
import scipy as sp
from pyopencl.reduction import ReductionKernel
from pyopencl.elementwise import ElementwiseKernel

# Make sure we use the GPU for computations: build the context from the GPU
# devices of the first platform only.
platform = cl.get_platforms()
gpu_devices = platform[0].get_devices(device_type=cl.device_type.GPU)
ctx = cl.Context(devices=gpu_devices)
queue = cl.CommandQueue(ctx)

# Integration, one integral, using reduce.
# `length` is the spacing of the sample points and also the factor that
# multiplies every sample in the reduction's map expression.
length = 0.001
vals = cl_array.arange(queue, 0, 100, length, dtype=numpy.float32)

# Kernel for reduce-code: sums get_val(x[i]) * length over the input array,
# with get_val(x) = x*x.
krnlRed = ReductionKernel(
    ctx, numpy.float32, neutral="0",
    reduce_expr="a+b",
    map_expr="get_val(x[i])*%10.3f" % length,
    arguments="__global float *x",
    preamble="""
    float get_val(float x)
    {
        return x*x;
    }
    """)

# Generation of an array where each element is an evaluated integral.
tonum = 1000000  # Number of elements.

# Array to send to the GPU.  to_device takes the queue as its first argument,
# and numpy.linspace is called directly rather than through the SciPy alias.
p_gpu = cl_array.to_device(
    queue, numpy.linspace(0, tonum, tonum + 1).astype(numpy.float32))
def get_arng(size, dtype=np.int32):
    """Build the range 0, 1, ..., size - 1 with ``clarray.arange``.

    Args:
        size: Exclusive upper bound of the range.
        dtype: Element type of the result (defaults to ``np.int32``).

    Returns:
        The array returned by ``clarray.arange`` on the module-level ``queue``.
    """
    first, stride = 0, 1
    ramp = clarray.arange(queue, first, size, stride, dtype=dtype)
    return ramp