def maximum(ary, backend=None):
    """Return the largest element held by ``ary``.

    Parameters
    ----------
    ary : object
        Wrapper exposing a ``dev`` attribute (the underlying array) and a
        ``backend`` attribute naming where that array lives.
    backend : str, optional
        One of ``'cython'``, ``'opencl'`` or ``'cuda'``.  When ``None`` the
        value of ``ary.backend`` is used.

    Returns
    -------
    object
        ``ary.dev.max()`` for the ``'cython'`` backend; for the GPU backends
        the result of ``gpuarray.max(ary.dev).get()``.

    Raises
    ------
    ValueError
        If ``backend`` is not one of the supported names.  Previously an
        unknown backend silently produced ``None``.
    """
    if backend is None:
        backend = ary.backend

    if backend == 'cython':
        return ary.dev.max()

    # The GPU packages are imported lazily so that only the backend that is
    # actually requested has to be installed.
    if backend == 'opencl':
        import pyopencl.array as gpuarray
    elif backend == 'cuda':
        import pycuda.gpuarray as gpuarray
    else:
        raise ValueError(
            "unsupported backend %r; expected 'cython', 'opencl' or 'cuda'"
            % (backend,)
        )
    return gpuarray.max(ary.dev).get()
def get_divergence_error(vector):
    """Return ``(max_err, avg_err)`` of the divergence of ``vector``.

    Each of the three components of ``vector`` is passed through
    ``fft.idft`` into the module-level ``vector_x`` buffers, the divergence
    is written to ``div`` and the three individual partial derivatives to
    ``pdx``.  Both errors are the absolute divergence normalised by the sum
    of the absolute partial derivatives (max/max and sum/sum respectively).

    Relies on the module-level names ``fft``, ``derivs``, ``queue``,
    ``vector_x``, ``div``, ``pdx``, ``cla`` and ``clm``.
    """
    for axis in range(3):
        fft.idft(vector[axis], vector_x[axis])

    derivs.divergence(queue, vector_x, div)

    # Component ``axis`` is differentiated along its own direction only.
    for axis, out_name in enumerate(("pdx", "pdy", "pdz")):
        derivs(queue, fx=vector_x[axis], **{out_name: pdx[axis]})

    norm = sum([clm.fabs(pdx[axis]) for axis in range(3)])

    max_err = cla.max(clm.fabs(div)) / cla.max(norm)
    avg_err = cla.sum(clm.fabs(div)) / cla.sum(norm)
    return max_err, avg_err
def maximum_cl(a, b=None):
    """
    Maximum of one or two GPU arrays.

    Parameters
    ----------
    a : gpuarray
        First GPU array.
    b : gpuarray, optional
        Second GPU array.  When omitted, ``a`` is reduced on its own.

    Returns
    -------
    gpuarray
        ``cl_array.maximum(a, b)`` when ``b`` is given, otherwise
        ``cl_array.max(a)``.

    Examples
    --------
    >>> a = maximum_cl(give_cl(queue, [1, 2, 3]), give_cl(queue, [3, 2, 1]))
    [3, 2, 3]

    >>> type(a)
    <class 'pyopencl.array.Array'>
    """
    if b is None:
        # Single operand: reduce the array itself.
        return cl_array.max(a)
    return cl_array.maximum(a, b)
def max(t: Tensor) -> np.float32:
    """Return the largest value stored in tensor ``t``.

    When ``t.gpu`` is false the reduction is done with ``np.max`` on
    ``t._data``; otherwise it is done with ``clarray.max``, the result is
    copied back with ``.get()`` and its first element is returned.
    """
    if not t.gpu:
        return np.max(t._data)

    device_max = clarray.max(t._data)
    return device_max.get().flat[0]
def get_divergence_errors(hij):
    """Return per-row divergence errors of the tensor ``hij``.

    For ``i`` in 1..3 the three components ``hij[tensor_id(i, mu + 1)]``
    are passed through ``fft.idft`` into ``vector_x``; the divergence is
    written to ``div`` and the individual partial derivatives to ``pdx``.
    The errors are the absolute divergence normalised by the sum of the
    absolute partial derivatives.

    Returns
    -------
    tuple of numpy.ndarray
        ``(max_errors, avg_errors)``, one entry per value of ``i``.

    Relies on the module-level names ``fft``, ``derivs``, ``queue``,
    ``vector_x``, ``div``, ``pdx``, ``tensor_id``, ``cla`` and ``clm``.
    """
    max_errors, avg_errors = [], []

    for row in range(1, 4):
        for col in range(3):
            fft.idft(hij[tensor_id(row, col + 1)], vector_x[col])

        derivs.divergence(queue, vector_x, div)

        # Component ``col`` is differentiated along its own direction only.
        for col, out_name in enumerate(("pdx", "pdy", "pdz")):
            derivs(queue, fx=vector_x[col], **{out_name: pdx[col]})

        norm = sum([clm.fabs(pdx[col]) for col in range(3)])

        max_errors.append(cla.max(clm.fabs(div)) / cla.max(norm))
        avg_errors.append(cla.sum(clm.fabs(div)) / cla.sum(norm))

    return np.array(max_errors), np.array(avg_errors)
def max(*args, **kwargs):
    """Maximum of the array given as first positional argument.

    If an ``axis`` keyword is supplied and the array is not 0-dimensional,
    the call is forwarded to ``_sum`` with ``prg2load`` set to
    ``programs.max``.  Otherwise the whole array is reduced with
    ``clarray.max`` on the module-level ``queue`` and the result is coerced
    to ``myclArray``.
    """
    a = args[0]

    if a.ndim != 0 and 'axis' in kwargs:
        kwargs['prg2load'] = programs.max
        return _sum(*args, **kwargs)

    res = clarray.max(a, queue=queue)
    if not isinstance(res, myclArray):
        # Re-brand the plain pyopencl result in place, then let the
        # subclass re-initialise itself.
        res.__class__ = myclArray
        res.reinit()
    return res
def _get_min_max(self):
    """Return ``(minimum, maximum)`` of the currently rendered data.

    ``np.amax`` is too slow for big arrays, so the reduction is attempted
    on the GPU first: the renderer's image is copied into a temporary
    buffer and reduced with ``cl_array.min`` / ``cl_array.max``.  If
    anything in that path raises, the exception is printed and the values
    are computed with NumPy from ``self.dataModel[0]`` instead.
    """
    # NOTE(review): when ``self.dataModel`` is falsy neither value is
    # assigned, so the final return raises UnboundLocalError -- confirm
    # that callers never reach this method without data.
    if self.dataModel:
        try:
            image = self.renderer.dataImg
            scratch = OCLArray.empty(image.shape, image.dtype)
            scratch.copy_image(image)
            mi = float(cl_array.min(scratch).get())
            ma = float(cl_array.max(scratch).get())
        except Exception as e:
            print(e)
            first_frame_min = np.amin(self.dataModel[0])
            first_frame_max = np.amax(self.dataModel[0])
            mi, ma = first_frame_min, first_frame_max
    return mi, ma
def max(q, a, axis=None, keepdims=False):
    """Maximum of a (at most 2-D) device array, optionally along an axis.

    Parameters
    ----------
    q : command queue used for allocations and kernel launches.
    a : device array with ``ndim < 3``.
    axis : int, optional
        ``None`` reduces the whole array.  Negative values are wrapped by
        adding 2; the result must be 0 or 1.
    keepdims : bool
        When true, the reduced axis is kept with length 1 in the outputs.

    Returns
    -------
    When ``axis`` is ``None`` or ``a.ndim <= 1``: the result of
    ``clarray.max(a)`` reshaped to ``(1,) * a.ndim``.  Otherwise a tuple
    ``(maxes, indices)`` of device arrays filled by the axis kernel.
    """
    assert a.ndim < 3

    # Whole-array reduction needs no custom kernel.
    if axis is None or a.ndim <= 1:
        return clarray.max(a).reshape((1, ) * a.ndim)

    if axis < 0:
        axis += 2
    assert axis in (0, 1)

    # TODO generate & cache kernel elsewhere
    prg = cl.Program(
        clplatf.ctx,
        _cmp_by_axis_kernel_template % {
            'cmp_op': '>',
            'dtype': dtype_to_ctype(a.dtype),
            'init_val': str(np.finfo(a.dtype).min),
        },
    ).build()
    col_max = prg._max_by_cols
    row_max = prg._max_by_rows

    # TODO calculate workgroup size given the array and axis
    element_size = 4 if a.dtype == np.float32 else 8

    if a.flags.c_contiguous:
        n, m = a.shape
    else:
        n, m = a.shape[1], a.shape[0]

    # Pick the kernel and the number of output elements it produces; the
    # remaining launch arguments are the same for both kernels.
    by_columns = ((axis == 0 and a.flags.c_contiguous)
                  or (axis == 1 and a.flags.f_contiguous))
    if by_columns:
        kernel, count = col_max, m
    else:
        kernel, count = row_max, n

    if not keepdims:
        out_shape = (count, )
    elif axis == 0:
        out_shape = (1, count)
    else:
        out_shape = (count, 1)

    maxes = clarray.empty(q, out_shape, dtype=a.dtype)
    indices = clarray.empty(q, out_shape, dtype=np.int32)

    # One work-group of 64 work-items per output element.
    ev = kernel(
        q, (count * 64, ), (64, ),
        a.data, maxes.data, indices.data,
        np.int32(m), np.int32(n),
        cl.LocalMemory(64 * element_size),
        cl.LocalMemory(64 * 4),
    )
    if ev is not None:
        ev.wait()

    return maxes, indices
def minZerrKernSHG_gpu(self):
    """Run the ``minZerrSHG`` and ``normEsig`` kernels and return the
    coefficients of the resulting polynomial in dZ.

    Returns
    -------
    numpy.ndarray
        ``[X0, X1, X2, X3, X4]`` as doubles, each being the sum of the
        corresponding device buffer divided by ``max(Esig norm) * N * N``.
    """
    # --- first kernel: fills the X0..X4 device buffers ------------------
    krn = self.progs.progs["minZerrSHG"].minZerrSHG
    krn.set_scalar_arg_dtypes((None, ) * 8 + (np.int32, ))
    krn.set_args(
        self.Esig_t_tau_p_cla.data,
        self.Et_cla.data,
        self.dZ_cla.data,
        self.X0_cla.data,
        self.X1_cla.data,
        self.X2_cla.data,
        self.X3_cla.data,
        self.X4_cla.data,
        self.N,
    )
    cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None).wait()

    # --- second kernel: writes the normalisation buffer -----------------
    krn = self.progs.progs["normEsig"].normEsig
    krn.set_scalar_arg_dtypes((None, None, np.int32))
    krn.set_args(self.Esig_t_tau_p_cla.data, self.Esig_t_tau_norm_cla.data, self.N)
    cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_t_tau_p.shape, None).wait()

    # Host-side equivalent of the next line:
    #   ((Esig_t_tau * Esig_t_tau.conj()).real).max() * N * N
    mx = cla.max(self.Esig_t_tau_norm_cla).get() * self.N * self.N

    device_terms = (
        self.X0_cla,
        self.X1_cla,
        self.X2_cla,
        self.X3_cla,
        self.X4_cla,
    )
    coeffs = []
    for term in device_terms:
        coeffs.append(cla.sum(term, queue=self.q).get() / mx)
    X0, X1, X2, X3, X4 = coeffs

    root.debug("X0=" + str(X0) + ", type " + str(type(X0)))
    poly_parts = (
        "Poly: ", str(X4), " x^4 + ", str(X3), " x^3 + ",
        str(X2), " x^2 + ", str(X1), " x + ", str(X0),
    )
    root.debug("".join(poly_parts))

    # Polynomial in dZ (expansion of differential)
    X = np.array([X0, X1, X2, X3, X4]).astype(np.double)

    root.debug("Esig_t_tau_p norm max: " + str(mx / (self.N * self.N)))

    return X
def fixup_floor(params, fflag, G, P):
    """Apply density / internal-energy floors to the primitive array ``P``.

    On the first call (module-level ``rhoflr_geom`` is ``None``) the
    geometric floor arrays and several scratch arrays are created and kept
    in module globals.  On the first call where ``knl_floors`` is ``None``
    the floor kernel is built and tuned.

    Every call then recomputes the state into ``D``, evaluates ``bsq``,
    runs the kernel to fill ``Padd`` with the missing density / energy, and
    -- if ``cl_array.max(Padd) > 0.`` -- recovers new primitives through
    ``U_to_P`` and takes them wherever ``Padd`` is positive.

    Parameters
    ----------
    params : dict-like; keys read here include 'coordinates', 'queue',
        'rho_min', 'rho_min_limit', 'u_min', 'u_min_limit', 'gam',
        'floor_char_r', 'sigma_max', 'u_over_rho_max', 'n_prim', 'debug'
        and 'electrons'.
    fflag : returned unchanged by the code visible here (the flag updates
        are commented out).
    G : grid object providing ``slices``, ``shapes``, ``coords``,
        ``coord_all``, ``dot`` and ``NG``.
    P : device array of primitives.

    Returns
    -------
    (P, fflag)
    """
    s = G.slices
    sh = G.shapes

    # Then apply floors:
    # 1. Precalculate geometric hard floors, not based on fluid relationships
    global rhoflr_geom, uflr_geom, U, D, Padd, Uadd, Dadd, dzero
    if rhoflr_geom is None:
        if "mks" in params['coordinates']:
            # New, steeper floor in rho
            # Previously raw r^-2 or r^-1.5
            r = G.coords.r(G.coord_all())
            rhoscal = 1/(r**2) * 1 / (1 + r/params['floor_char_r'])
            # Impose minimum rho with scaling above, minimum u as rho**gam
            rhoflr_geom = cl_array.to_device(params['queue'], np.maximum(params['rho_min'] * rhoscal, params['rho_min_limit']))
            uflr_geom = cl_array.to_device(params['queue'], np.maximum(params['u_min']*(rhoscal**params['gam']), params['u_min_limit']))
        elif "minkowski" in params['coordinates']:
            rhoflr_geom = cl_array.empty(params['queue'], sh.grid_scalar, dtype=np.float64).fill(params['rho_min']*1.e-2)
            uflr_geom = cl_array.empty(params['queue'], sh.grid_scalar, dtype=np.float64).fill(params['u_min']*1.e-2)
        # NOTE(review): for any other coordinate name the floor arrays stay
        # None while the scratch arrays below are still created -- confirm
        # that only "mks"/"minkowski" variants can reach this point.

        # Arrays we should keep on hand: derived values and values for additional fluid packet
        U = cl_array.zeros_like(P)
        D = get_state(params, G, P, Loci.CENT)
        Padd = cl_array.zeros_like(P)
        Uadd = cl_array.zeros_like(P)
        Dadd = get_state(params, G, P, Loci.CENT)
        dzero = cl_array.zeros_like(P)

    global knl_floors
    if knl_floors is None:
        code = add_ghosts(replace_prim_names("""
        # 2. Magnetic floors: impose maximum magnetization sigma = bsq/rho, inverse beta prop. to bsq/U
        rhoflr_b := bsq[i,j,k] / sig_max
        uflr_b := bsq[i,j,k] / (sig_max * temp_max)
        # Maximum U floor
        uflr_max := if(uflr_b > uflr_geom[i,j,k], uflr_b, uflr_geom[i,j,k])
        # 3. Temperature ceiling: impose maximum temperature
        temp_real := P[UU,i,j,k] / temp_max
        temp_floor := uflr_max / temp_max
        rhoflr_temp := if(temp_real > temp_floor, temp_real, temp_floor)
        # Maximum rho floor
        rhoflr_max1 := if(rhoflr_geom[i,j,k] > rhoflr_b, rhoflr_geom[i,j,k], rhoflr_b)
        rhoflr_max := if(rhoflr_max1 > rhoflr_temp, rhoflr_max1, rhoflr_temp)
        # Initialize a dummy fluid parcel with any missing mass and internal energy, but not velocity
        rho_add := rhoflr_max - P[RHO,i,j,k]
        u_add := uflr_max - P[UU,i,j,k]
        Padd[RHO,i,j,k] = if(rho_add > 0, rho_add, 0) {id=p1,nosync=p2}
        Padd[UU,i,j,k] = if(u_add > 0, u_add, 0) {id=p2,nosync=p1}
        """))
        knl_floors = lp.make_kernel(sh.isl_grid_scalar, code, [*primsArrayArgs("P", "Padd"), *scalarArrayArgs("bsq", "rhoflr_geom", "uflr_geom"), ...])
        knl_floors = lp.fix_parameters(knl_floors, sig_max=params['sigma_max'], temp_max=params['u_over_rho_max'], nprim=params['n_prim'])
        knl_floors = tune_grid_kernel(knl_floors, shape=sh.bulk_scalar, ng=G.NG, prefetch_args=["bsq", "rhoflr_geom", "uflr_geom"])
        print("Compiled fixup_floors")

    # Bulk call before bsq calculation below
    get_state(params, G, P, Loci.CENT, out=D)  # Reused below!
    bsq = G.dot(D['bcon'], D['bcov'])

    # Padd is a persistent global, so clear the previous call's result first.
    Padd.fill(0)
    evt, _ = knl_floors(params['queue'], P=P, bsq=bsq, rhoflr_geom=rhoflr_geom, uflr_geom=uflr_geom, Padd=Padd)
    evt.wait()

    if params['debug']:
        rhits = np.count_nonzero(Padd[s.RHO].get())
        uhits = np.count_nonzero(Padd[s.UU].get())
        print("Rho floors hit {} times".format(rhits))
        print("U floors hit {} times".format(uhits))

    # Only do the conserved-variable round trip if some floor was hit.
    if cl_array.max(Padd) > 0.:
        print("Fixing up")
        # Get conserved variables for the parcel
        get_state(params, G, Padd, Loci.CENT, out=Dadd)
        prim_to_flux(params, G, Padd, Dadd, 0, Loci.CENT, out=Uadd)

        # And for the current state
        #get_state(G, S, i, j, k, CENT, out=D) # Called just above
        prim_to_flux(params, G, P, D, 0, Loci.CENT, out=U)

        # Recover primitive variables. Don't touch what we don't have to
        Ptmp, pflag = U_to_P(params, G, U+Uadd, P+Padd)
        P = cl_array.if_positive(Padd, Ptmp, P)
        del Ptmp

        # TODO Record specific floor hits/U_to_P fails
        #fflag |= cl_array.if_positive(rhoflr_geom - P[s.RHO], temp.fill(HIT_FLOOR_GEOM_RHO), zero)
        #fflag |= cl_array.if_positive(uflr_geom - P[s.UU], temp.fill(HIT_FLOOR_GEOM_U), zero)
        #fflag |= cl_array.if_positive(rhoflr_b - P[s.RHO], temp.fill(HIT_FLOOR_B_RHO), zero)
        #fflag |= cl_array.if_positive(uflr_b - P[s.UU], temp.fill(HIT_FLOOR_B_U), zero)
        #fflag |= cl_array.if_positive(rhoflr_temp - P[s.RHO], temp.fill(HIT_FLOOR_TEMP), zero)
        #fflag |= cl_array.if_positive(pflag, temp.fill(FLOOR_UTOP_FAIL), zero)

    if params['electrons']:
        # Reset the entropy after floor application
        P[s.KTOT] = (params['gam'] - 1.) * P[s.UU] / (P[s.RHO]**params['gam'])

    return P, fflag
def test_spectral_poisson(ctx_factory, grid_shape, proc_shape, h, dtype, timing=False):
    """Check ``ps.SpectralPoissonSolver`` against the matching derivative
    operator.

    A zero-mean random source ``rho`` is generated, the solver is run for
    several values of ``m_squared`` and the residual
    ``lap(fx) - rho - m_squared * fx`` is compared (max and average,
    relative to ``|rho|``) with dtype-dependent tolerances.

    Parameters
    ----------
    ctx_factory : callable or falsy
        Creates the OpenCL context; when falsy
        ``ps.choose_device_and_make_context()`` is used.
    grid_shape, proc_shape : tuple
        Global grid shape and process-grid shape.
    h : int
        Halo width; ``0`` selects the spectral collocator, anything else
        the finite differencer with second-centred-difference eigenvalues.
    dtype : numpy dtype of the fields.
    timing : bool
        When true, time the solver after the accuracy checks and print the
        result on rank 0.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)
    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (3, 5, 7)
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))
    dk = tuple(2 * np.pi / Li for Li in L)

    if h == 0:
        def get_evals_2(k, dx):
            return - k**2

        derivs = ps.SpectralCollocator(fft, dk)
    else:
        from pystella.derivs import SecondCenteredDifference
        get_evals_2 = SecondCenteredDifference(h).get_eigenvalues
        derivs = ps.FiniteDifferencer(mpi, h, dx, stream=False)

    solver = ps.SpectralPoissonSolver(fft, dk, dx, get_evals_2)

    pencil_shape = tuple(ni + 2*h for ni in rank_shape)

    # np.product was removed in NumPy 2.0; np.prod is the supported
    # spelling and returns the same value.
    statistics = ps.FieldStatistics(mpi, 0, rank_shape=rank_shape,
                                    grid_size=np.prod(grid_shape))

    fx = cla.empty(queue, pencil_shape, dtype)
    rho = clr.rand(queue, rank_shape, dtype)
    # Remove the mean of the random source before solving.
    rho -= statistics(rho)["mean"]
    lap = cla.empty(queue, rank_shape, dtype)
    rho_h = rho.get()

    # Tolerances depend only on dtype, so compute them once.
    max_rtol = 1e-12 if dtype == np.float64 else 1e-4
    avg_rtol = 1e-13 if dtype == np.float64 else 1e-5

    for m_squared in (0, 1.2, 19.2):
        solver(queue, fx, rho, m_squared=m_squared)
        fx_h = fx.get()
        if h > 0:
            # Strip the halo so the host copy matches rank_shape.
            fx_h = fx_h[h:-h, h:-h, h:-h]

        derivs(queue, fx=fx, lap=lap)

        diff = np.fabs(lap.get() - rho_h - m_squared * fx_h)
        max_err = np.max(diff) / cla.max(clm.fabs(rho))
        avg_err = np.sum(diff) / cla.sum(clm.fabs(rho))

        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"solution inaccurate for {h=}, {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        # Uses the last m_squared from the loop above.
        time = timer(lambda: solver(queue, fx, rho, m_squared=m_squared),
                     ntime=10)

        if mpi.rank == 0:
            print(f"poisson took {time:.3f} ms for {grid_shape=}, {proc_shape=}")
def test_step(ctx_factory, proc_shape, dtype, Stepper):
    """Convergence test for a time stepper on ``dy/dt = y**n``.

    For each exponent ``n`` the equation is integrated to ``t >= 0.1`` with
    three step sizes, and the maximum relative error against the closed
    form solution is recorded per step size.  The test asserts that the
    smallest-step error is below a dtype-dependent tolerance and that
    halving the step reduces the error by at least ``0.9 * 2**order``.

    Skipped unless ``proc_shape == (1, 1, 1)``.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test step only on one rank")

    ctx = ctx_factory() if ctx_factory else ps.choose_device_and_make_context()
    queue = cl.CommandQueue(ctx)

    from pystella.step import LowStorageRKStepper
    is_low_storage = LowStorageRKStepper in Stepper.__bases__

    rank_shape = (1, 1, 8)
    init_vals = np.linspace(1, 3, 8)

    # Low-storage steppers work on a single array; the others carry
    # ``num_copies`` arrays stacked along a leading axis, of which copy 0
    # holds the solution.
    if is_low_storage:
        y = cla.zeros(queue, rank_shape, dtype)
        y[0, 0, :] = init_vals
        y0 = y.copy()
    else:
        num_copies = Stepper.num_copies
        y = cla.zeros(queue, (num_copies, ) + rank_shape, dtype)
        y[0, 0, 0, :] = init_vals
        y0 = y[0].copy()

    dtlist = [.1, .05, .025]
    dt_pairs = list(zip(dtlist[:-1], dtlist[1:]))

    for n in [-1., -2., -3., -4.]:
        def sol(y0, t):
            # Closed-form solution of dy/dt = y**n with y(0) = y0.
            return ((-1 + n) * (-t + y0**(1 - n) / (-1 + n)))**(1 / (1 - n))

        max_errs = {}
        for dt in dtlist:
            _y = ps.Field("y")
            rhs_dict = {_y: _y**n}

            stepper = Stepper(rhs_dict, dt=dt, halo_shape=0, rank_shape=rank_shape)

            # Reset the initial condition before every run.
            if is_low_storage:
                y[0, 0, :] = init_vals
            else:
                y[0, 0, 0, :] = init_vals

            t = 0
            errs = []
            while t < .1:
                for stage in range(stepper.num_stages):
                    stepper(stage, queue=queue, y=y, filter_args=True)
                t += dt

                current = y if is_low_storage else y[0]
                rel_err = clm.fabs(1. - sol(y0, t) / current)
                errs.append(cla.max(rel_err).get())

            max_errs[dt] = np.max(errs)

        order = stepper.expected_order
        print(f"{order=}, {n=}")
        print(max_errs)
        print([max_errs[a] / max_errs[b] for a, b in dt_pairs])

        rtol = dtlist[-1]**order if dtype == np.float64 else 1e-1
        assert list(max_errs.values())[-1] < rtol, \
            f"Stepper solution inaccurate for {n=}"

        for a, b in dt_pairs:
            assert max_errs[a] / max_errs[b] > .9 * (a/b)**order, \
                f"Stepper convergence failing for {n=}"
def _im_convert(norm='local'):
    """ Image conversion from np array to hsv PIL

    Parameters
    ----------
    norm : {'local', 'global'}
        'local' normalises every frame by the maximum magnitude of that
        frame; 'global' normalises every frame by the maximum magnitude of
        the whole array.

    Returns
    -------
    tuple
        ``(frames, images)``: the frame data (``gpu_store.get()`` on the
        GPU path, ``store`` otherwise) and a list of images produced by
        ``scipy.misc.toimage``.

    Relies on names from the enclosing scope: ``gpu_info``, ``gpu_store``,
    ``store``, ``build``, ``distances``, ``rows`` and ``cols``.
    """

    assert norm in ('global', 'local')

    # Identity comparison with None: ``!=`` / ``==`` would dispatch to the
    # operand's own (possibly overloaded) comparison operators.
    if gpu_info is not None:
        context, device, queue, platform = gpu_info

        import gpu
        import string
        import pyopencl.array as cla

        # build the additional kernels
        hsv_convert = build('complex_to_rgb.cl')
        cl_abs = build('common_abs_f2_f.cl')

        # allocate new memory (frames, rows, columns, 3); uchar -> uint8?
        new_shape = gpu_store.shape+(3,)
        image_buffer = cla.empty(queue, new_shape, np.uint8)
        debug_buffer = cla.empty(queue, gpu_store.shape, np.float32)

        # each frame must be normalized by SOMETHING. if norm=='local',
        # that something is the max value of the abs of each frame. if
        # norm == 'global', that something is the max value of the entire
        # array. in either case, we make a list of maxvals which will
        # correspond to each frame so that we can use the same hsv
        # conversion kernel.
        if norm == 'global':
            # calculate the max for the entire array
            abs_store = cla.empty(queue, gpu_store.shape, np.float32)
            mv_gpu = cla.empty(queue, (gpu_store.shape[0],), np.float32)
            cl_abs.execute(queue, (gpu_store.size,), None,
                           gpu_store.data, abs_store.data)
            mv_gpu.fill(np.float32(cla.max(abs_store).get()))

        if norm == 'local':
            # calculate the max for one frame at a time. pull this value
            # and save it in the max_vals list. when finished, move
            # max_vals to the gpu.
            nz = len(distances)
            max_vals = np.zeros(nz, np.float32)
            slice_frame = build('common_slice_frame_f2.cl')
            slice_store = cla.empty(queue, (rows, cols), np.complex64)
            abs_store = cla.empty(queue, (rows, cols), np.float32)

            for n in xrange(nz):
                slice_frame.execute(queue, (rows, cols), None,
                                    slice_store.data, gpu_store.data,
                                    np.int32(n))
                cl_abs.execute(queue, (slice_store.size,), None,
                               slice_store.data, abs_store.data)
                max_vals[n] = np.float32(cla.max(abs_store).get())

            mv_gpu = cla.to_device(queue, max_vals.astype(np.float32))

        # run the complex -> hsv -> rgb converter on each pixel
        hsv_convert.execute(queue, gpu_store.shape, None, gpu_store.data,
                            image_buffer.data, mv_gpu.data).wait()

        # now convert to pil objects
        import scipy.misc as smp
        image_buffer = image_buffer.get()
        # Single-argument call form prints the same on Python 2 and 3.
        print(image_buffer.shape)
        images = [smp.toimage(i) for i in image_buffer]

        return gpu_store.get(), images

    else:
        # now convert frames to pil objects
        import scipy.misc as smp
        # NOTE(review): ``io`` must provide ``complex_hsv_image``, so this
        # is presumably a project-local module rather than the stdlib
        # ``io`` -- confirm the import resolves as intended.
        import io
        images = [smp.toimage(io.complex_hsv_image(x)) for x in store]
        return store, images
def _max_OCL(a, queue=None):
    """Return ``cl_array.max`` of ``a``, optionally on the given ``queue``.

    The result is whatever ``cl_array.max`` returns; it is not copied to
    the host here.
    """
    reduced = cl_array.max(a=a, queue=queue)
    return reduced
def max(self, a):
    """Return the maximum of ``a`` as a host-side scalar.

    The reduction runs on the queue of ``self._array_context``; the result
    is fetched with ``.get()`` and unwrapped with ``[()]``.
    """
    import pyopencl.array as cl_array

    device_result = cl_array.max(a, queue=self._array_context.queue)
    host_result = device_result.get()
    return host_result[()]