def test_nan_arithmetic(ctx_getter):
    """Check that NaNs end up in the same places on host and device.

    Two float32 vectors with randomly placed NaNs are multiplied once with
    NumPy and once with ``cl_array``; the NaN masks of both products must
    be identical.

    :param ctx_getter: callable returning an OpenCL context.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    def make_nan_contaminated_vector(size):
        """Return a random float32 vector of *size* entries containing NaNs."""
        shape = (size,)
        a = numpy.random.randn(*shape).astype(numpy.float32)

        from random import randrange
        # size // 10 random writes; positions may repeat, so the number of
        # distinct NaN entries can be smaller than that.
        for i in range(size // 10):
            a[randrange(0, size)] = float('nan')
        return a

    size = 1 << 20

    a = make_nan_contaminated_vector(size)
    a_gpu = cl_array.to_device(context, queue, a)
    b = make_nan_contaminated_vector(size)
    b_gpu = cl_array.to_device(context, queue, b)

    ab = a * b
    ab_gpu = (a_gpu * b_gpu).get()

    # One vectorised mask comparison instead of 2**20 Python-level asserts.
    assert (numpy.isnan(ab) == numpy.isnan(ab_gpu)).all()
def _make_inputs(self, queue, pixel_size):
    """Upload three vertex arrays built by ``self._make_vertices``.

    ``_make_vertices`` is called with first argument 0, 1 and 2 and with
    ``pixel_size[1]``, ``pixel_size[0]`` and ``pixel_size[1]`` respectively.
    NOTE(review): the third call reuses ``pixel_size[1]``; confirm this is
    intended rather than a third component.

    :param queue: OpenCL command queue used for the uploads.
    :param pixel_size: indexable with at least two entries.
    :returns: tuple ``(v_1, v_2, v_3)`` of device arrays.
    """
    v_1 = cl_array.to_device(queue, self._make_vertices(0, pixel_size[1]))
    v_2 = cl_array.to_device(queue, self._make_vertices(1, pixel_size[0]))
    v_3 = cl_array.to_device(queue, self._make_vertices(2, pixel_size[1]))
    return v_1, v_2, v_3
def build(self, coords, values, base):
    """Use OpenCL to build the arrays.

    Uploads *coords*, *values* and *base* to the device, launches the
    ``nearest`` kernel with global size ``base.shape`` and waits for it.

    If waiting fails with the specific "out of resources" message, a hint
    is printed and the process exits with status 1; any other
    ``cl.RuntimeError`` is re-raised unchanged.

    NOTE(review): nothing is returned in the code visible here, although
    the kernel writes into a freshly allocated int32 array.
    """
    lenbase = base.shape[0]
    lencoords = coords.shape[0]

    coords_array = cla.to_device(self.queue, coords)
    values_array = cla.to_device(self.queue, values)
    base_array = cla.to_device(self.queue, base)
    template_array = cla.zeros(self.queue, (lenbase), dtype=np.int32)

    event = self.program.nearest(
        self.queue,
        base.shape,
        None,
        coords_array.data,
        values_array.data,
        base_array.data,
        template_array.data,
        np.int32(lencoords),
        self.nnear,
        self.usemajority,
    )

    try:
        event.wait()
    except cl.RuntimeError as inst:
        if str(inst) != "clWaitForEvents failed: out of resources":
            # Unknown failure: propagate with the original traceback.
            raise
        print("OpenCL timed out, probably due to the display manager.")
        print("Disable your display manager and try again!")
        print("If that does not work, rerun with OpenCL disabled.")
        sys.exit(1)
def compute_preconditioners(self):
    """
    Build the diagonal preconditioners of the projection and
    backprojection operators and store them on the device.

    Each diagonal term is the reciprocal of the projection (resp.
    backprojection) of an array of ones [1]. Non-finite reciprocals are
    replaced by 1. The results are stored as ``self.d_R`` and ``self.d_C``
    and registered through ``self.add_to_cl_mem``.

    [1] Jens Gregor and Thomas Benson,
        Computational Analysis and Improvement of SIRT,
        IEEE transactions on medical imaging, vol. 27, no. 7, 2008
    """
    def guarded_reciprocal(values):
        # Done on the host so the non-finite entries can be checked
        # explicitly; they occur when the rotation axis is off-centre.
        inverse = 1. / values
        inverse[~np.isfinite(inverse)] = 1.
        return inverse

    # r_{i,i} = 1/(sum_j a_{i,j})
    ones_slice = np.ones(self.backprojector.slice_shape, dtype=np.float32)
    row_weights = guarded_reciprocal(self.projector.projection(ones_slice))
    self.d_R = parray.to_device(self.queue, row_weights)

    # c_{j,j} = 1/(sum_i a_{i,j})
    ones_sino = np.ones(self.sino_shape, dtype=np.float32)
    col_weights = guarded_reciprocal(
        self.backprojector.backprojection(ones_sino))
    self.d_C = parray.to_device(self.queue, col_weights)

    self.add_to_cl_mem({
        "d_R": self.d_R,
        "d_C": self.d_C
    })
def computeEnergy(self, x, y, z, q):
    """Run the ``calc_potential_energy`` kernel and return the summed result.

    The single- or double-precision program is chosen from ``x.dtype``.
    For any other dtype a message is printed, no kernel is launched and
    the sum of the zero-initialised output is returned.

    :param x, y, z, q: host arrays uploaded to the device.
    :returns: ``numpy.sum`` over the per-element output array.
    """
    queue = self.queue
    xd, yd, zd, qd = [cl_array.to_device(queue, host)
                      for host in (x, y, z, q)]

    coulombEnergy = cl_array.zeros_like(xd)

    prec = x.dtype
    program = None
    if prec == numpy.float32:
        program, real = self.compEnergyF, numpy.float32
    elif prec == numpy.float64:
        program, real = self.compEnergyD, numpy.float64
    else:
        print("Unknown float type.")

    if program is not None:
        program.calc_potential_energy(
            queue, (x.size, ), None,
            xd.data, yd.data, zd.data, qd.data, coulombEnergy.data,
            numpy.int32(len(x)),
            real(self.k), real(self.impactFact),
            g_times_l=False)

    return numpy.sum(coulombEnergy.get(queue))
def computeEnergy(self, x, y, z, q):
    """Run the ``calc_potential_energy`` kernel and return the summed result.

    The single- or double-precision program is chosen from ``x.dtype``.
    For any other dtype a message is printed, no kernel is launched and
    the sum of the zero-initialised output is returned.

    :param x, y, z, q: host arrays uploaded to the device.
    :returns: ``numpy.sum`` over the per-element output array.
    """
    xd = cl_array.to_device(self.queue, x)
    yd = cl_array.to_device(self.queue, y)
    zd = cl_array.to_device(self.queue, z)
    qd = cl_array.to_device(self.queue, q)

    # The output must be a device array: ``zeros_like`` needs a
    # pyopencl array, not the host array ``q``.
    coulombEnergy = cl_array.zeros_like(qd)

    prec = x.dtype
    if prec == numpy.float32:
        self.compEnergyF.calc_potential_energy(
            self.queue, (x.size, ), None,
            xd.data, yd.data, zd.data, qd.data, coulombEnergy.data,
            g_times_l=False)
    elif prec == numpy.float64:
        self.compEnergyD.calc_potential_energy(
            self.queue, (x.size, ), None,
            xd.data, yd.data, zd.data, qd.data, coulombEnergy.data,
            g_times_l=False)
    else:
        print("Unknown float type.")

    return numpy.sum(coulombEnergy.get(self.queue))
def test_count_1(self):
    """Check the ``count`` kernel against a NumPy reference.

    Slice 0 of the expected counts gains *weight* where the interspace
    equals 1; slice ``i`` (``i >= 1``) gains *weight* where the access
    interspace equals ``i``.
    """
    nrepeats = 3
    shape = [5, 5, 5]
    weight = 0.5

    np_interspace = randint(2, size=shape).astype(np.int32)
    np_access_interspace = randint(nrepeats, size=shape).astype(np.int32)
    np_count = np.ones([nrepeats] + shape, dtype=np.float32)

    # Reference result; ``expected[i]`` is a view, so the masked update
    # modifies ``expected`` in place.
    expected = np.ones_like(np_count)
    expected[0][np_interspace == 1] += weight
    for i in range(1, nrepeats):
        expected[i][np_access_interspace == i] += weight

    cl_interspace = cl_array.to_device(self.queue, np_interspace)
    cl_access_interspace = cl_array.to_device(self.queue,
                                              np_access_interspace)
    cl_count = cl_array.to_device(self.queue, np_count)

    self.kernels.count(self.queue, cl_interspace, cl_access_interspace,
                       weight, cl_count)

    self.assertTrue(np.allclose(expected, cl_count.get()))
def test_fancy_indexing(ctx_factory):
    """Compare index-array gather and scatter on device arrays with NumPy.

    A shuffled permutation is used as the index array, first for a
    scatter (``dest[idx] = src``) and then for a gather (``src[idx]``).
    """
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    queue = cl.CommandQueue(ctx_factory())

    n = 2 ** 20 + 2**18 + 22
    host_dest = np.zeros(n, dtype=np.int32)
    host_idx = np.arange(n, dtype=np.int32)
    np.random.shuffle(host_idx)
    host_src = 20000+np.arange(n, dtype=np.int32)

    dev_dest = cl_array.to_device(queue, host_dest)
    dev_idx = cl_array.to_device(queue, host_idx)
    dev_src = cl_array.to_device(queue, host_src)

    # Scatter through the permutation.
    host_dest[host_idx] = host_src
    dev_dest[dev_idx] = dev_src
    assert np.array_equal(host_dest, dev_dest.get())

    # Gather through the permutation.
    host_gathered = host_src[host_idx]
    dev_gathered = dev_src[dev_idx]
    assert np.array_equal(host_gathered, dev_gathered.get())
def compute_preconditioners(self):
    """
    Build the diagonal preconditioners "Sigma" and "Tau" and store them
    on the device.

    Sigma is the reciprocal of the projection of an array of ones, with
    non-finite entries replaced by 1. Tau is ``1 / (C + 2)`` where ``C``
    is the backprojection of an array of ones [2].
    The device arrays are registered through ``self.add_to_cl_mem``.

    [2] T. Pock, A. Chambolle,
        Diagonal preconditioning for first order primal-dual algorithms
        in convex optimization,
        International Conference on Computer Vision, 2011
    """
    queue = self.queue

    # Diagonal preconditioner "Sigma"
    ones_slice = np.ones(self.backprojector.slice_shape, dtype=np.float32)
    Sigma_k = 1./self.projector.projection(ones_slice)
    Sigma_k[~np.isfinite(Sigma_k)] = 1.
    self.d_Sigma_k = parray.to_device(queue, Sigma_k)
    # TODO: memory vs computation
    self.d_Sigma_kp1 = self.d_Sigma_k + 1
    # For the discrete gradient, sum|D_i,j| = 2 along lines or columns.
    self.Sigma_grad = 1/2.0

    # Diagonal preconditioner "Tau"
    ones_sino = np.ones(self.sino_shape, dtype=np.float32)
    backprojected = self.backprojector.backprojection(ones_sino)
    self.d_Tau = parray.to_device(queue, 1./(backprojected + 2.))

    self.add_to_cl_mem({
        "d_Sigma_k": self.d_Sigma_k,
        "d_Sigma_kp1": self.d_Sigma_kp1,
        "d_Tau": self.d_Tau
    })
def get_binned_data_angular(self,limits=((-1,1),(-1,1)),points=500):
    """
    Azimuth/elevation map measured ray endpoints to a circle and bin them.

    The mapping is done on the OpenCL device by the ``angular_project``
    kernel; elevation is mapped linearly to the circle's radius and
    azimuth to phi, which is handy for cross-section plots of
    directivity. The weighted 2D histogram is then computed on the host.

    :param limits: ``((xmin, xmax), (ymin, ymax))`` histogram range.
    :param points: number of bins along each axis.
    :returns: ``(H, x_coord, y_coord)`` from ``np.histogram2d``; also
        stored in ``self.hist_data``.
    """
    (pos0, pwr0) = self.get_measured_rays()
    queue = self.queue

    pos0_dev = cl_array.to_device(queue, pos0.astype(np.float32))
    x_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    y_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    pwr0_dev = cl_array.to_device(queue, pwr0.astype(np.float32))
    pwr_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    pivot = cl_array.to_device(queue, np.array([0, 0, 0, 0], dtype=np.float32))

    # 4x4 matrix handed to the kernel: identity in the upper-left 3x3
    # block, zero last row.
    R_dev = cl_array.to_device(
        queue,
        np.array([[1, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 0]]).astype(np.float32))

    evt = self.prg.angular_project(
        queue, pwr0.shape, None,
        pos0_dev.data, pwr0_dev.data, R_dev.data, pivot.data,
        x_dev.data, y_dev.data, pwr_dev.data)
    evt.wait()

    x = x_dev.get()
    y = y_dev.get()
    pwr = np.float64(pwr_dev.get())

    # Divide by the bin area so the histogram holds a density.
    dx = np.float64(limits[0][1]-limits[0][0])/np.float64(points)
    dy = np.float64(limits[1][1]-limits[1][0])/np.float64(points)
    pwr = pwr / (dx * dy)

    (H, x_coord, y_coord) = np.histogram2d(
        x=x.flatten(), y=y.flatten(), bins=points, range=limits,
        weights=pwr.flatten())
    self.hist_data = (H, x_coord, y_coord)
    return self.hist_data
def __init__(self, target, queue, laplace=False):
    """Set up the GPU correlator for *target* on the given command queue.

    Allocates device arrays, builds the FFTs and kernels, uploads the
    (optionally Laplace-filtered) target and the LCC mask, and
    precomputes the Fourier transforms of the target and of its square.

    :param target: passed to the base-class constructor.
    :param queue: OpenCL command queue; context and device come from it.
    :param laplace: passed to the base class; when ``self._laplace`` is
        set, the target is Laplace-filtered before upload.
    """
    super(GPUCorrelator, self).__init__(target, laplace=laplace)
    self._queue = queue
    self._ctx = self._queue.context
    self._gpu = self._queue.device

    self._allocate_arrays()
    self._build_ffts()
    self._generate_kernels()

    target = self._target
    if self._laplace:
        target = self._laplace_filter(self._target)

    # move some arrays to the GPU
    self._gtarget = cl_array.to_device(self._queue,
                                       target.astype(np.float32))
    self._lcc_mask = cl_array.to_device(self._queue,
                                        self._lcc_mask.astype(np.int32))

    # Do some one-time precalculations
    self._rfftn(self._gtarget, self._ft_target)
    self._k.multiply(self._gtarget, self._gtarget, self._target2)
    self._rfftn(self._target2, self._ft_target2)

    # Centre padded with a trailing 0, shape padded with the element count.
    self._gcenter = np.asarray(list(self._center) + [0], dtype=np.float32)
    # np.prod replaces the alias np.product, which NumPy 2.0 removed.
    self._gshape = np.asarray(
        list(self._target.shape) + [np.prod(self._target.shape)],
        dtype=np.int32)
def get_binned_data_stereographic(self,limits=((-1,1),(-1,1)),points=500):
    """
    Stereographically project measured ray endpoints onto the xy plane
    and bin them.

    The projection is done on the OpenCL device by the
    ``stereograph_project`` kernel, which is a lot faster when there is
    a lot of data. The weighted 2D histogram is then computed on the host.

    :param limits: ``((xmin, xmax), (ymin, ymax))`` histogram range.
    :param points: number of bins along each axis.
    :returns: ``(H, x_coord, y_coord)`` from ``np.histogram2d``; also
        stored in ``self.hist_data``.
    """
    (pos0, pwr0) = self.get_measured_rays()
    queue = self.queue

    pos0_dev = cl_array.to_device(queue, pos0.astype(np.float32))
    x_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    y_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    pwr0_dev = cl_array.to_device(queue, pwr0.astype(np.float32))
    pwr_dev = cl_array.zeros(queue, pwr0.shape, dtype=np.float32)
    pivot = cl_array.to_device(queue, np.array([0, 0, 0, 0], dtype=np.float32))

    # 4x4 matrix handed to the kernel: identity in the upper-left 3x3
    # block, zero last row.
    R_dev = cl_array.to_device(
        queue,
        np.array([[1, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 0]]).astype(np.float32))

    evt = self.prg.stereograph_project(
        queue, pwr0.shape, None,
        pos0_dev.data, pwr0_dev.data, R_dev.data, pivot.data,
        x_dev.data, y_dev.data, pwr_dev.data)
    evt.wait()

    x = x_dev.get()
    y = y_dev.get()
    pwr = np.float64(pwr_dev.get())

    # Divide by the bin area so the histogram holds a density.
    dx = np.float64(limits[0][1]-limits[0][0])/np.float64(points)
    dy = np.float64(limits[1][1]-limits[1][0])/np.float64(points)
    pwr = pwr / (dx * dy)

    (H, x_coord, y_coord) = np.histogram2d(
        x=x.flatten(), y=y.flatten(), bins=points, range=limits,
        weights=pwr.flatten())
    self.hist_data = (H, x_coord, y_coord)
    return self.hist_data
def __init__(self, ctx, queue, dtype=np.float32):
    """Prepare Sobel taps, separable correlators and the magnitude kernel.

    :param ctx: OpenCL context.
    :param queue: OpenCL command queue.
    :param dtype: element type of the taps and of the gradient images
        read by the magnitude kernel. float32, uint8 and uint16 map to a
        C type name; for any other dtype the type name in the kernel
        argument list is left empty.
    """
    self.ctx = ctx
    self.queue = queue

    col_taps = np.array([1., 0., -1.]).astype(dtype)
    row_taps = np.array([1., 2., 1.]).astype(dtype)
    self.sobel_c = cl_array.to_device(self.queue, col_taps)
    self.sobel_r = cl_array.to_device(self.queue, row_taps)

    self.scratch = None

    # The correlators receive the host copies of the taps.
    self.sepconv_rc = LocalMemorySeparableCorrelation(
        self.ctx, self.queue, row_taps, col_taps)
    self.sepconv_cr = LocalMemorySeparableCorrelation(
        self.ctx, self.queue, col_taps, row_taps)

    # First matching entry wins; no match leaves the name empty.
    TYPE = ""
    for known_dtype, c_name in ((np.float32, "float"),
                                (np.uint8, "unsigned char"),
                                (np.uint16, "unsigned short")):
        if dtype == known_dtype:
            TYPE = c_name
            break

    arguments = "float *result, {0} *imgx, {0} *imgy".format(TYPE)
    self.mag = ElementwiseKernel(
        ctx,
        arguments,
        "result[i] = sqrt((float)imgx[i]*imgx[i] + (float)imgy[i]*imgy[i])",
        "mag")
def allocate_arrays(self):
    """
    Allocate the host and device arrays used by the tests.

    Host side: gradient buffers, reference gradient/divergence of
    ``self.image`` and a zeroed copy of the image. Device side: a
    complex64 gradient array, the image, and the reference gradient
    packed as real/imaginary parts of one complex64 array; the raw
    ``.data`` buffer of each device array is stored alongside it.
    """
    queue = self.la.queue
    img_shape = self.image.shape

    # numpy images
    self.grad = np.zeros(img_shape, dtype=np.complex64)
    self.grad2 = np.zeros((2,) + img_shape, dtype=np.float32)
    self.grad_ref = gradient(self.image)
    self.div_ref = divergence(self.grad_ref)
    self.image2 = np.zeros_like(self.image)

    # Device images.
    # A plain cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE,
    # size=self.image.nbytes*2) would be the natural choice, but
    # platforms without OpenCL 1.2 support have trouble with
    # enqueue_fill_buffer, so a zero-filled parray is used instead.
    self.gradient_parray = parray.zeros(queue, img_shape, np.complex64)
    self.gradient_buffer = self.gradient_parray.data

    # Same approach for the image
    self.image_parray = parray.to_device(queue, self.image)
    self.image_buffer = self.image_parray.data

    # References: component 0 -> real part, component 1 -> imaginary part
    packed_ref = np.zeros(img_shape, dtype=np.complex64)
    packed_ref.real = np.copy(self.grad_ref[0])
    packed_ref.imag = np.copy(self.grad_ref[1])
    self.grad_ref_parray = parray.to_device(queue, packed_ref)
    self.grad_ref_buffer = self.grad_ref_parray.data
def test_pthomas():
    """Check ``pthomas.PThomas`` against ``scipy_solve_banded``.

    Random coefficient vectors ``a``, ``b``, ``c`` of length ``nx`` and a
    random ``(nz, ny, nx)`` right-hand side are solved on the device;
    every line along the last axis is compared with the reference
    solution. Uses the module-level ``context`` and ``queue``.
    """
    nz = 3
    ny = 4
    nx = 5

    a = np.random.rand(nx)
    b = np.random.rand(nx)
    c = np.random.rand(nx)
    d = np.random.rand(nz, ny, nx)
    # The device array is read back into ``d`` below; keep the inputs.
    d_copy = d.copy()

    solver = pthomas.PThomas(context, queue, (nz, ny, nx))
    a_d = cl_array.to_device(queue, a)
    b_d = cl_array.to_device(queue, b)
    c_d = cl_array.to_device(queue, c)
    c2_d = cl_array.to_device(queue, c)
    d_d = cl_array.to_device(queue, d)

    solver.solve(a_d, b_d, c_d, c2_d, d_d)
    d = d_d.get()

    for i in range(nz):
        for j in range(ny):
            x_true = scipy_solve_banded(a, b, c, d_copy[i, j, :])
            assert_allclose(x_true, d[i, j, :])

    # Parenthesised single argument prints the same on Python 2 and 3.
    print('pass')
def CalcF(ctx, queue, m2, r2):
    """Run the ``CalcF`` kernel on *m2* and *r2* and return the result.

    The launch grid is square, with both sides equal to ``m2.shape[0]``,
    and uses a fixed 16x16 work-group size.

    :param ctx: OpenCL context used to obtain the compiled kernel.
    :param queue: OpenCL command queue.
    :param m2, r2: host arrays uploaded as kernel inputs.
    :returns: float32 host array of shape ``(ydim, xdim)``.
    """
    side = m2.shape[0]
    grid_shape = (side, side)    # (ydim, xdim), same as the matrix
    group_shape = (16, 16)

    kernel = get_kernel(ctx, side)

    # Inputs and output on the device
    gpu_m2 = cl_array.to_device(queue, m2)
    gpu_r2 = cl_array.to_device(queue, r2)
    gpu_result = cl_array.zeros(queue, grid_shape, np.float32)

    kernel.CalcF(queue, grid_shape, group_shape,
                 gpu_result.data, gpu_m2.data, gpu_r2.data).wait()

    result = gpu_result.get()
    queue.finish()
    return result
def get_array(data, queue=None):
    """Get pyopencl.array.Array from *data* which can be a numpy array, a
    pyopencl.array.Array or a pyopencl.Image. *queue* is an OpenCL command
    queue; ``cfg.OPENCL.queue`` is used when it is not given.

    A pyopencl array is returned as is. Numpy arrays are converted to the
    configured complex precision when their dtype kind is complex and to
    the configured float precision otherwise. Images are copied into a
    float32 array, which is converted when its item size differs from the
    configured float size.

    :raises TypeError: for any other type of *data*.
    """
    if not queue:
        queue = cfg.OPENCL.queue

    if isinstance(data, cl_array.Array):
        return data

    if isinstance(data, np.ndarray):
        if data.dtype.kind == 'c':
            target_dtype = cfg.PRECISION.np_cplx
        else:
            target_dtype = cfg.PRECISION.np_float
        # One astype is enough: it always yields a fresh array of the
        # target dtype, so converting conditionally beforehand only
        # added a second copy.
        return cl_array.to_device(queue, data.astype(target_dtype))

    if isinstance(data, cl.Image):
        # The array shape is the reversed image shape.
        result = cl_array.empty(queue, data.shape[::-1], np.float32)
        cl.enqueue_copy(queue, result.data, data, offset=0, origin=(0, 0),
                        region=result.shape[::-1])
        if result.dtype.itemsize != cfg.PRECISION.cl_float:
            result = result.astype(cfg.PRECISION.np_float)
        return result

    raise TypeError('Unsupported data type {}'.format(type(data)))
def _gpu_init(self):
    """Initialise all data for the GPU-accelerated search.

    Fills ``self.gpu_data`` with the uploaded input arrays, the working
    arrays, the counting buffers, the complex arrays for the Fourier
    transforms, scalar settings copied from ``self.data`` and the kernel
    object, then computes the transforms of ``rcore`` and ``rsurf``.
    """
    self.gpu_data = {}
    g = self.gpu_data
    d = self.data
    q = self.queue

    # Move data to the GPU. Everything is float32, as this is the native
    # length for GPUs.
    g['rcore'] = cl_array.to_device(q, float32array(d['rcore'].array))
    g['rsurf'] = cl_array.to_device(q, float32array(d['rsurf'].array))
    # Make the scanning chain object an Image, as this is faster to rotate
    g['im_lsurf'] = cl.image_from_array(q.context,
                                        float32array(d['lsurf'].array))
    g['sampler'] = cl.Sampler(q.context, False, cl.addressing_mode.CLAMP,
                              cl.filter_mode.LINEAR)

    if self.distance_restraints:
        g['restraints'] = cl_array.to_device(
            q, float32array(d['restraints']))

    # Allocate arrays on the GPU
    for name in ('lsurf', 'clashvol', 'intervol'):
        g[name] = cl_array.zeros_like(g['rcore'])
    g['interspace'] = cl_array.zeros(q, d['shape'], dtype=np.int32)
    for name in ('restspace', 'access_interspace',
                 'best_access_interspace'):
        g[name] = cl_array.zeros_like(g['interspace'])

    # Arrays for counting.
    # Reductions are typically tedious on GPU, and the work-group size is
    # needed to allocate the correct amount of data.
    WORKGROUPSIZE = 32
    # Integer ceiling division: unlike ceil(a / b) it does not depend on
    # whether "/" is true division (Python 3) or floor division
    # (Python 2 without ``from __future__ import division``).
    nsubhists = -(-g['rcore'].size // WORKGROUPSIZE)
    g['subhists'] = cl_array.zeros(
        q, (nsubhists, d['nrestraints'] + 1), dtype=np.float32)
    g['viol_counter'] = cl_array.zeros(
        q, (nsubhists, d['nrestraints'], d['nrestraints']),
        dtype=np.float32)

    # Complex arrays: same shape as the data except for the first axis,
    # which becomes shape[0] // 2 + 1.
    g['ft_shape'] = list(d['shape'])
    g['ft_shape'][0] = d['shape'][0]//2 + 1
    g['ft_rcore'] = cl_array.zeros(q, g['ft_shape'], dtype=np.complex64)
    for name in ('ft_rsurf', 'ft_lsurf', 'ft_clashvol', 'ft_intervol'):
        g[name] = cl_array.zeros_like(g['ft_rcore'])

    # Other miscellaneous data
    g['nrot'] = d['nrot']
    g['max_clash'] = d['max_clash']
    g['min_interaction'] = d['min_interaction']

    # Kernels
    g['k'] = Kernels(q.context)
    g['k'].rfftn = pyclfft.RFFTn(q.context, d['shape'])
    g['k'].irfftn = pyclfft.iRFFTn(q.context, d['shape'])

    # Initial calculations
    g['k'].rfftn(q, g['rcore'], g['ft_rcore'])
    g['k'].rfftn(q, g['rsurf'], g['ft_rsurf'])
def gs_mod_gpu(idata,itera=10,osize=256):
    """Run *itera* iterations of a GPU phase-retrieval loop on *idata*.

    Each iteration applies the ``norm`` kernel to the Fourier-side data,
    transforms back, applies the ``norm1`` kernel against the masked
    input and transforms forward again.

    :param idata: 2D input array. NOTE(review): the mask window is
        centred on index 512, which presumably assumes a 1024x1024
        input; confirm before using other sizes.
    :param itera: number of iterations.
    :param osize: side of the zeroed central window of the random mask.
    :returns: unit-modulus complex array ``exp(1j * angle(F))`` of the
        shifted final Fourier-side data.
    """
    cut = osize // 2

    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    # NOTE (translated from the original comment): does not work with
    # "complex128".
    plan = Plan(idata.shape, queue=queue, dtype=complex128)

    src = str(Template(KERNEL).render(
        double_support=all(
            has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices)
    ))
    prg = cl.Program(ctx, src).build()

    # NOTE(review): the results of this first transform are overwritten
    # below before being read; kept as in the original.
    idata_gpu = cl_array.to_device(
        queue, ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    # Random unit-modulus mask with a zeroed central window.
    mask = exp(2.j*pi*random(idata.shape))
    mask[512-cut:512+cut, 512-cut:512+cut] = 0

    idata_gpu = cl_array.to_device(
        queue, ifftshift(idata+mask).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    # Same (queue, array) call form as the other uploads in this function.
    error_gpu = cl_array.to_device(
        queue, zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data, fdata_gpu.data)

    # Kernel lookup and scalar typing do not change between iterations.
    norm1 = prg.norm1
    norm1.set_scalar_arg_dtypes([None, None, None, int32])

    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        norm1(queue, rdata_gpu.shape, None,
              rdata_gpu.data, idata_gpu.data, error_gpu.data, int32(cut))

        # Error estimate; currently only computed, not used to stop.
        e = sqrt(cl_array.sum(error_gpu).get())/(2*cut)

        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()
    fdata = ifftshift(fdata)
    fdata = exp(1.j*angle(fdata))
    return fdata
def test_touch(self):
    """Compare the GPU ``touch`` kernel with a NumPy reference.

    The clash and interaction volumes are computed on the CPU with
    ``numpy.fft`` and on the GPU with the test kernels; the thresholded
    result must agree. The CPU result, the GPU result and their
    difference are also written to ``.mrc`` files.
    """
    MAX_CLASH = 100 + 0.9
    MIN_INTER = 300 + 0.9
    # Not used below; the draw still advances NumPy's global RNG state.
    NROT = np.random.randint(self.rotations.shape[0] + 1)
    rotmat = self.rotations[0]

    queue = self.queue
    kernels = self.kernels

    def device_zeros(shape, dtype):
        return cl_array.zeros(queue, shape, dtype=dtype)

    def dump(volume, filename):
        disvis.volume.Volume(volume, self.im_lsurf.voxelspacing,
                             self.im_lsurf.origin).tofile(filename)

    # CPU: rotated ligand surface and clash volume
    cpu_lsurf = np.zeros_like(self.im_lsurf.array)
    disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength,
                                    np.linalg.inv(rotmat), self.im_center,
                                    cpu_lsurf)
    cpu_clashvol = numpy.fft.irfftn(
        numpy.fft.rfftn(cpu_lsurf).conj()
        * numpy.fft.rfftn(self.rcore.array))

    # GPU: rotated ligand surface and clash volume
    gpu_rcore = cl_array.to_device(
        queue, np.asarray(self.rcore.array, dtype=np.float32))
    gpu_im_lsurf = cl.image_from_array(
        queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32))
    gpu_lsurf = device_zeros(self.shape, np.float32)
    kernels.rotate_image3d(queue, self.sampler, gpu_im_lsurf, rotmat,
                           gpu_lsurf, self.im_center)

    gpu_ft_lsurf = device_zeros(self.ft_shape, np.complex64)
    gpu_ft_rcore = device_zeros(self.ft_shape, np.complex64)
    gpu_ft_clashvol = device_zeros(self.ft_shape, np.complex64)
    gpu_clashvol = device_zeros(self.shape, np.float32)

    kernels.rfftn(queue, gpu_rcore, gpu_ft_rcore)
    kernels.rfftn(queue, gpu_lsurf, gpu_ft_lsurf)
    kernels.c_conj_multiply(queue, gpu_ft_lsurf, gpu_ft_rcore,
                            gpu_ft_clashvol)
    kernels.irfftn(queue, gpu_ft_clashvol, gpu_clashvol)

    # CPU interaction volume
    cpu_intervol = numpy.fft.irfftn(
        numpy.fft.rfftn(cpu_lsurf).conj()
        * numpy.fft.rfftn(self.rsurf.array))

    # GPU interaction volume
    gpu_rsurf = cl_array.to_device(
        queue, np.asarray(self.rsurf.array, dtype=np.float32))
    gpu_ft_rsurf = device_zeros(self.ft_shape, np.complex64)
    gpu_ft_intervol = device_zeros(self.ft_shape, np.complex64)
    gpu_intervol = device_zeros(self.shape, np.float32)

    cpu_interspace = np.zeros(self.shape, dtype=np.int32)
    gpu_interspace = device_zeros(self.shape, np.int32)

    kernels.rfftn(queue, gpu_rsurf, gpu_ft_rsurf)
    kernels.rfftn(queue, gpu_lsurf, gpu_ft_lsurf)
    kernels.c_conj_multiply(queue, gpu_ft_lsurf, gpu_ft_rsurf,
                            gpu_ft_intervol)
    kernels.irfftn(queue, gpu_ft_intervol, gpu_intervol)

    # Thresholding: GPU kernel versus NumPy (written into cpu_interspace)
    kernels.touch(queue, gpu_clashvol, MAX_CLASH, gpu_intervol, MIN_INTER,
                  gpu_interspace)
    np.logical_and(cpu_clashvol < MAX_CLASH, cpu_intervol > MIN_INTER,
                   cpu_interspace)

    # Read the device result back once and reuse the host copy.
    gpu_result = gpu_interspace.get()

    dump(cpu_interspace, 'cpu_interspace.mrc')
    dump(gpu_result, 'gpu_interspace.mrc')
    dump(cpu_interspace - gpu_result, 'diff.mrc')

    print()
    print(cpu_interspace.sum(), gpu_result.sum())
    print(np.abs(cpu_interspace - gpu_result).sum())

    self.assertTrue(np.allclose(gpu_result, cpu_interspace))
def main():
    """Add two random 1024x1024 float32 matrices on the device and time it.

    The timed section covers the uploads, the kernel launch and the
    download of the result. The result and the elapsed time are printed.
    """
    # Allocate the first GPU
    ctx = cl.create_some_context(0)  # use device 0, the GPU
    queue = cl.CommandQueue(ctx)

    # Define dimensions
    ydim = 1024
    xdim = 1024

    # Create the two random input matrices
    matrix = np.float32(np.random.random((ydim, xdim)))
    matrix2 = np.float32(np.random.random((ydim, xdim)))

    # Get the compiled kernel
    kernel = get_kernel(ctx, xdim)

    # Start timing
    t1 = time.time()

    # Move data to the GPU
    gpu_matrix = cl_array.to_device(queue, matrix)
    gpu_matrix2 = cl_array.to_device(queue, matrix2)
    gpu_result = cl_array.zeros(queue, (ydim, xdim), np.float32)

    # Grid shape equals the matrix dimensions; fixed 16x16 work-groups
    grid_shape = (ydim, xdim)
    group_shape = (16, 16)  # (32,16)

    # Execute the kernel and wait for it to finish
    event = kernel.add(queue, grid_shape, group_shape,
                       gpu_result.data, gpu_matrix.data, gpu_matrix2.data)
    event.wait()

    # Move the result from GPU to CPU
    result = gpu_result.get()

    # Measure end time
    t2 = time.time()

    # Single-argument parenthesised prints behave the same on Python 2
    # and Python 3.
    print(result)
    print("Elapsed: %f seconds " % (t2-t1))

    # Free the GPU resource
    queue.finish()
def test(self):
    """Upload three random 4x4 float32 matrices and allocate an output.

    Uses ``self.ctx`` and a ``queue`` name taken from the enclosing
    scope. No assertion is made in the code visible here.
    """
    # Drawn in the order a, b, c.
    a, b, c = [numpy.random.randn(4, 4).astype(numpy.float32)
               for _ in range(3)]

    a_gpu, b_gpu, c_gpu = [cl_array.to_device(self.ctx, queue, host)
                           for host in (a, b, c)]

    dest_gpu = cl_array.empty_like(a_gpu)
def sum_solutions(self, line_da, x_R_d, x_UH, x_LH, alpha, beta):
    """Upload the host operands and launch ``sum_solutions_kernel``.

    :param line_da: object providing ``nx``, ``ny`` and ``nz``; these set
        the global size and are passed to the kernel as int32.
    :param x_R_d: device array; its buffer is the first kernel argument.
    :param x_UH, x_LH, alpha, beta: host arrays uploaded before launch.

    The event returned by the kernel is neither waited on nor returned.
    """
    x_UH_d, x_LH_d, alpha_d, beta_d = [
        cl_array.to_device(self.queue, host)
        for host in (x_UH, x_LH, alpha, beta)
    ]

    global_size = (line_da.nx, line_da.ny, line_da.nz)
    evt = self.sum_solutions_kernel(
        self.queue, global_size, None,
        x_R_d.data, x_UH_d.data, x_LH_d.data, alpha_d.data, beta_d.data,
        np.int32(line_da.nx), np.int32(line_da.ny), np.int32(line_da.nz))
def prepare_dev_data(self):
    """Pack operator and geometry data into device images and arrays.

    Sets ``self.diffmatrices_img`` (transposed ``Dr``/``Ds`` in a
    two-channel image), ``self.drdx_dev`` (first column of ``rx``,
    ``ry``, ``sx``, ``sy``), ``self.lift_img`` (lift matrix in a
    four-channel image) and ``self.surfinfo_dev`` (six rows of
    per-face-node data for each of the ``K`` entries).
    """
    ldis = self.ldis
    mf = cl.mem_flags
    image_flags = mf.READ_ONLY | mf.COPY_HOST_PTR

    # differentiation matrix: channel 0 = Dr^T, channel 1 = Ds^T
    drds_dev = np.empty((ldis.Np, ldis.Np, 2), dtype=np.float32)
    for channel, matrix in enumerate((ldis.Dr, ldis.Ds)):
        drds_dev[:, :, channel] = matrix.T
    self.diffmatrices_img = cl.Image(
        self.ctx,
        image_flags,
        cl.ImageFormat(cl.channel_order.RG, cl.channel_type.FLOAT),
        shape=drds_dev.shape[:2],
        hostbuf=drds_dev,
    )

    # geometric coefficients: columns 0..3 are filled
    drdx_dev = np.empty((self.K, self.dimensions ** 2), dtype=np.float32)
    for column, metric in enumerate((self.rx, self.ry, self.sx, self.sy)):
        drdx_dev[:, column] = metric[:, 0]
    self.drdx_dev = cl_array.to_device(self.queue, drdx_dev)

    # lift matrix: channels beyond Nfaces stay zero
    lift_dev = np.zeros((ldis.Np, ldis.Nfp, 4), dtype=np.float32)
    lift_dev[:, :, : ldis.Nfaces] = ldis.LIFT.reshape(
        ldis.Np, -1, ldis.Nfaces)
    self.lift_img = cl.Image(
        self.ctx,
        image_flags,
        cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT),
        shape=(ldis.Nfp, ldis.Np),
        hostbuf=lift_dev,
    )

    # surface info
    surfinfo_dev = np.empty((self.K, 6, ldis.Nafp), dtype=np.float32)
    el_p, face_i_p = divmod(self.vmapP.reshape(-1, ldis.Nafp), ldis.Np)
    el_m, face_i_m = divmod(self.vmapM.reshape(-1, ldis.Nafp), ldis.Np)
    ind_p = el_p * self.block_size + face_i_p
    ind_m = el_m * self.block_size + face_i_m

    surfinfo_rows = (
        ind_m,
        ind_p,
        self.Fscale,
        np.where(ind_m == ind_p, -1, 1),
        self.nx,
        self.ny,
    )
    for row, values in enumerate(surfinfo_rows):
        surfinfo_dev[:, row, :] = values
    self.surfinfo_dev = cl_array.to_device(self.queue, surfinfo_dev)
def test_multiply(self):
    """Check the ``multiply`` kernel against the NumPy elementwise product."""
    lhs = np.arange(10, dtype=np.float32)
    rhs = np.arange(10, dtype=np.float32)
    reference = lhs * rhs

    dev_lhs = cl_array.to_device(self.queue, lhs)
    dev_out = cl_array.to_device(self.queue,
                                 np.zeros(10, dtype=np.float32))
    dev_rhs = cl_array.to_device(self.queue, rhs)

    self.k.multiply(dev_lhs, dev_rhs, dev_out)

    self.assertTrue(np.allclose(reference, dev_out.get()))
def _build_block_index(discr, nblks=10, factor=1.0, method='elements',
                       use_tree=True):
    """Partition *discr* into blocks and optionally subsample each block.

    :param discr: discretization to partition.
    :param nblks: divisor used to derive the maximum number of nodes or
        elements per box.
    :param factor: fraction of each block's indices to keep; forced to
        1.0 (no subsampling) for ``method='elements'``.
    :param method: ``'nodes'`` or ``'elements'``.
    :param use_tree: forwarded to the partitioning routine.
    :returns: the partition's index ranges, or a ``BlockIndexRanges``
        built from the subsample when *factor* differs from 1.
    :raises ValueError: for an unknown *method*.
    """
    from pytential.linalg.proxy import (
            partition_by_nodes, partition_by_elements)

    if method == 'elements':
        factor = 1.0

    if method == 'nodes':
        nnodes = discr.nnodes
    else:
        nnodes = discr.mesh.nelements
    max_particles_in_box = nnodes // nblks

    # create index ranges
    if method == 'nodes':
        indices = partition_by_nodes(
            discr, use_tree=use_tree,
            max_nodes_in_box=max_particles_in_box)
    elif method == 'elements':
        indices = partition_by_elements(
            discr, use_tree=use_tree,
            max_elements_in_box=max_particles_in_box)
    else:
        raise ValueError('unknown method: {}'.format(method))

    # randomly pick a subset of points
    if abs(factor - 1.0) > 1.0e-14:
        with cl.CommandQueue(discr.cl_context) as queue:
            indices = indices.get(queue)

            # The builtin ``object`` replaces the alias ``np.object``,
            # which NumPy 1.24 removed.
            indices_ = np.empty(indices.nblocks, dtype=object)
            for i in range(indices.nblocks):
                iidx = indices.block_indices(i)
                # Keep at least one and at most all indices of the block.
                isize = int(factor * len(iidx))
                isize = max(1, min(isize, len(iidx)))
                indices_[i] = np.sort(
                    np.random.choice(iidx, size=isize, replace=False))

            ranges_ = to_device(
                queue, np.cumsum([0] + [r.shape[0] for r in indices_]))
            indices_ = to_device(queue, np.hstack(indices_))

        indices = BlockIndexRanges(discr.cl_context,
                                   indices_.with_queue(None),
                                   ranges_.with_queue(None))

    return indices
def test_multiply_array(ctx_getter):
    """Test the multiplication of two arrays."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    a = numpy.arange(1, 11).astype(numpy.float32)

    # The same host data is uploaded twice, giving two device operands.
    a_gpu = cl_array.to_device(context, queue, a)
    b_gpu = cl_array.to_device(context, queue, a)

    a_squared = (b_gpu*a_gpu).get()

    assert (a*a == a_squared).all()
def send_arrays_to_device(self, field, field_temp, field_interaction,
                          factor):
    """Move numpy arrays onto the compute device.

    Records ``field.shape`` in ``self.shape``, then converts each array
    to ``self.np_complex`` and uploads it, storing the device arrays as
    ``buf_field``, ``buf_temp``, ``buf_interaction`` and ``buf_factor``.
    """
    self.shape = field.shape

    uploads = (
        ("buf_field", field),
        ("buf_temp", field_temp),
        ("buf_interaction", field_interaction),
        ("buf_factor", factor),
    )
    for attribute, host_array in uploads:
        setattr(self, attribute,
                cl_array.to_device(self.queue,
                                   host_array.astype(self.np_complex)))
def work(x,y,n):
    """Run a sum kernel and then a product kernel on the scalars *x*, *y*.

    Both kernels write to element 0 of the same one-element output, so
    the returned device array holds the result of the product kernel,
    which runs last.

    :param x, y: scalar operands.
    :param n: length of the auxiliary vector passed as ``x`` to the
        kernels.
    :returns: one-element float32 device array.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # The kernels declare this argument as ``float *``, so the host
    # array must be float32 (it was float16).
    arr = cl_array.to_device(queue, numpy.zeros(n).astype(numpy.float32))
    ris = cl_array.to_device(queue, numpy.zeros(1).astype(numpy.float32))

    summ = ElementwiseKernel(ctx,
                             "float a,float b, float *x,float *c ",
                             "c[0]=a+b ")
    prod = ElementwiseKernel(ctx,
                             "float a,float b, float *x, float *c ",
                             "c[0]=a*b ")

    summ(x, y, arr, ris)
    prod(x, y, arr, ris)
    return ris
def alter_sum():
    """Print the device-side and host-side sums of one random vector.

    A float32 vector of 10**6 normal samples is uploaded; its sum is
    computed with ``cl_array.sum`` and with ``np.sum`` on the
    downloaded copy.
    """
    ctx = cl_init()
    queue = cl.CommandQueue(ctx)

    n = 10**6
    a_gpu = cl_array.to_device(
        queue, np.random.randn(n).astype(np.float32))

    cl_sum = cl_array.sum(a_gpu).get()
    numpy_sum = np.sum(a_gpu.get())

    # Formatted into one string so the output is the same on Python 2
    # and Python 3.
    print("%s %s" % (cl_sum, numpy_sum))
def test_divide_scalar(ctx_factory):
    """Test the division of an array and a scalar."""
    queue = cl.CommandQueue(ctx_factory())

    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
    a_gpu = cl_array.to_device(queue, a)

    # array / scalar must match exactly
    halved = (a_gpu / 2).get()
    assert (a / 2 == halved).all()

    # scalar / array is compared with an absolute tolerance
    reciprocal = (2 / a_gpu).get()
    assert (np.abs(2 / a - reciprocal) < 1e-5).all()
def _allocate_memory(self, mode):
    """Allocate the work arrays, upload the kernel and resolve the mode.

    :param mode: boundary mode name; ``"reflect"`` is used when falsy.
        Matching against the supported modes is case-insensitive.
    :raises ValueError: if ``self.kernel`` is neither a numpy array nor a
        pyopencl array, or if the mode is not supported.
    :raises NotImplementedError: for mode ``"constant"`` without textures.
    """
    self.mode = mode or "reflect"
    option_array_names = {
        "allocate_input_array": "data_in",
        "allocate_output_array": "data_out",
        "allocate_tmp_array": "data_tmp",
    }
    # Nonseparable transforms do not need tmp array
    if not (self.separable):
        self.extra_options["allocate_tmp_array"] = False
    # Allocate arrays; options that are switched off leave the
    # attribute set to None.
    for option_name, array_name in option_array_names.items():
        if self.extra_options[option_name]:
            value = parray.empty(self.queue, self.shape, np.float32)
            value.fill(np.float32(0.0))
        else:
            value = None
        setattr(self, array_name, value)

    if isinstance(self.kernel, np.ndarray):
        self.d_kernel = parray.to_device(self.queue, self.kernel)
    else:
        if not (isinstance(self.kernel, parray.Array)):
            raise ValueError(
                "kernel must be either numpy array or pyopencl array")
        self.d_kernel = self.kernel
    self._old_input_ref = None
    self._old_output_ref = None
    if self.use_textures:
        self._allocate_textures()

    self._c_modes_mapping = {
        "periodic": 2,
        "wrap": 2,
        "nearest": 1,
        "replicate": 1,
        "reflect": 0,
        "constant": 3,
    }
    mp = self._c_modes_mapping
    # Validate and look up with the same lower-cased key; looking up the
    # raw ``self.mode`` raised KeyError for e.g. "Reflect" even though
    # validation accepted it.
    mode_key = self.mode.lower()
    if mode_key not in mp:
        raise ValueError(""" Mode %s is not available for textures. Available modes are: %s """
                         % (self.mode, str(mp.keys())))
    # TODO
    if not (self.use_textures) and mode_key == "constant":
        raise NotImplementedError(
            "mode='constant' is not implemented without textures yet")
    # NOTE(review): in the flattened source this assignment follows a
    # bare "#"; it is treated as live code here - confirm.
    self._c_conv_mode = mp[mode_key]
def test_nan_arithmetic(ctx_factory):
    """Check that NaNs end up in the same places on host and device.

    Two float32 vectors with randomly placed NaNs are multiplied with
    NumPy and with ``cl_array``; the NaN masks of both products must be
    identical.
    """
    queue = cl.CommandQueue(ctx_factory())

    def make_nan_contaminated_vector(size):
        from random import randrange

        vec = np.random.randn(size).astype(np.float32)
        # size // 10 random positions (repeats possible) become NaN.
        nan_positions = [randrange(0, size) for _ in range(size // 10)]
        vec[nan_positions] = float('nan')
        return vec

    size = 1 << 20

    a = make_nan_contaminated_vector(size)
    a_gpu = cl_array.to_device(queue, a)
    b = make_nan_contaminated_vector(size)
    b_gpu = cl_array.to_device(queue, b)

    host_product = a * b
    device_product = (a_gpu * b_gpu).get()

    assert (np.isnan(host_product) == np.isnan(device_product)).all()
def process(self, ibuf):
    """Evaluate the lag states of *ibuf* on the device.

    :param ibuf: numpy array (uploaded first) or device array. A second
        axis, if present, must have length 1 or 2; length 2 selects the
        ``eval_state_2`` kernel, otherwise ``eval_state_1`` is used.
    :returns: host copy of the output buffer.
    :raises ValueError: if the second axis has any other length.
    """
    if isinstance(ibuf, np.ndarray):
        ibuf = cla.to_device(self._queue, ibuf)

    n_samples = ibuf.shape[0]

    two_column = False
    if len(ibuf.shape) > 1:
        n_columns = ibuf.shape[1]
        if n_columns == 2:
            two_column = True
        elif n_columns != 1:
            raise ValueError('invalid dimensionality')

    # Samples skipped at the start: largest lag plus one window length.
    largest_lag = self._nlags * self._lag_step + self._lag_base
    offset = largest_lag + self._win_length
    count = n_samples - offset

    n_rows = (count + self._interval - 1) // self._interval
    obuf = cla.empty(self._queue, (n_rows, self._nlags // 64),
                     dtype=splice_point)

    if two_column:
        kernel = self._program.eval_state_2
    else:
        kernel = self._program.eval_state_1

    ev = kernel(self._queue, (self._nlags, ), (64, ),
                ibuf.data, obuf.data, np.int32(offset),
                self._win_length, self._lag_base, self._lag_step,
                self._interval, np.int32(count),
                wait_for=None)
    ev.wait()
    return obuf.get()
def solve(self, A, B, x0=None, tol=10e-6, iters=300):
    r"""
    Solve a linear system of equations iteratively on the OpenCL device.

    Each iteration evaluates the residual with the ``r`` kernel, tests
    convergence, and otherwise updates the solution with the
    ``dot_mat_vec`` and ``minres`` kernels.
    @param A Linear system matrix.
    @param B Linear system independent term.
    @param x0 Initial aproximation of the solution.
    @param tol Relative error tolerance, compared against
    \$ \sqrt{(R \cdot R) / (B \cdot B)} \$ where R is the residual.
    @param iters Maximum number of iterations (at least 1).
    @return Tuple (solution, relative residual, index of the last
    iteration executed).
    @throw ValueError If iters is smaller than 1.
    """
    # The returned residual and iteration index are only assigned inside
    # the loop, so zero iterations cannot produce a result.
    if iters < 1:
        raise ValueError("iters must be at least 1")
    # Create/set OpenCL buffers
    self.setBuffers(A, B, x0)
    # Get dimensions for OpenCL execution
    n = np.uint32(len(B))
    gSize = (clUtils.globalSize(n), )
    # Squared norm of B, the reference for the relative residual.
    # NOTE(review): to_device is called with (context, queue, array), and
    # enqueue_read_buffer is used below; both look like a legacy pyopencl
    # API -- confirm against the installed pyopencl version.
    B_cl = cl_array.to_device(self.context, self.queue, B)
    bnorm2 = self.dot(B_cl, B_cl).get()
    FreeCAD.Console.PrintMessage(bnorm2)
    FreeCAD.Console.PrintMessage("\n")
    # Iterate while the result converges or maximum number
    # of iterations is reached.
    for i in range(iters):
        # Compute residues
        kernelargs = (self.A, self.B, self.X0, self.R.data, n)
        self.program.r(self.queue, gSize, None, *kernelargs)
        # Test if the final result has been reached
        rnorm2 = self.dot(self.R, self.R).get()
        FreeCAD.Console.PrintMessage("\t")
        FreeCAD.Console.PrintMessage(rnorm2)
        FreeCAD.Console.PrintMessage("\n")
        if np.sqrt(rnorm2 / bnorm2) <= tol:
            break
        # Iterate
        kernelargs = (self.A, self.R.data, self.AR.data, n)
        self.program.dot_mat_vec(self.queue, gSize, None, *kernelargs)
        AR_R = self.dot(self.AR, self.R).get()
        AR_AR = self.dot(self.AR, self.AR).get()
        kernelargs = (self.A, self.R.data, self.X, self.X0,
                      AR_R, AR_AR, n)
        self.program.minres(self.queue, gSize, None, *kernelargs)
        # Swap variables so X0 holds the newest iterate
        self.X, self.X0 = self.X0, self.X
    # Return result computed
    x = np.zeros((n), dtype=np.float32)
    cl.enqueue_read_buffer(self.queue, self.X0, x).wait()
    return (x, np.sqrt(rnorm2 / bnorm2), i)
def test_divide_inplace_array(ctx_factory):
    """Test inplace division of arrays."""
    from itertools import product
    from pyopencl.characterize import has_double_support

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = (np.uint8, np.uint16, np.uint32,
              np.int8, np.int16, np.int32,
              np.float32, np.complex64)
    if has_double_support(queue.device):
        dtypes += (np.float64, np.complex128)

    numerators = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    denominators = [10] * 10

    for dtype_a, dtype_b in product(dtypes, repeat=2):
        print(dtype_a, dtype_b)
        a = np.array(numerators).astype(dtype_a)
        b = np.array(denominators).astype(dtype_b)

        a_gpu = cl_array.to_device(queue, a)
        b_gpu = cl_array.to_device(queue, b)

        # ensure the same behavior as inplace numpy.ndarray division
        try:
            a_gpu /= b_gpu
        except TypeError:
            # pass for now, as numpy casts differently for in-place and
            # out-place true_divide
            pass
            # with np.testing.assert_raises(TypeError):
            #     a /= b
        else:
            a /= b
            difference = np.abs(a_gpu.get() - a)
            assert (difference < 1e-3).all()
            assert a_gpu.dtype is a.dtype
def test_ones_matrix_arange_vector():
    """Multiply a ``clsingle.ones`` matrix by arange(inp_size).

    Every output element is expected to be approximately the sum of the
    input vector.
    """
    host_input = np.arange(inp_size).astype(np.float32)
    inp_layer = pycl_array.to_device(clsingle.queue, host_input)
    matrix = clsingle.ones((out_size, inp_size))
    out_layer = clsingle.zeros(out_size)

    global_size = (out_size, TS)
    local_size = (WPT, TS)
    code.program.matrix_vector_mul(clsingle.queue, global_size, local_size,
                                   inp_size, RESET_OUTPUT,
                                   inp_layer.data, matrix.data,
                                   out_layer.data)

    expected = pytest.approx(np.sum(inp_layer.get()))
    result = out_layer.get()
    for row in range(out_size):
        assert result[row] == expected
def shuffle(x_data, rows, cols):
    """
    Run the shuffle kernel on ``x_data`` with a random permutation of the
    row indices.

    The second global work dimension is half the number of row indices
    (integer division); an odd sized row count will not have 1 row shuffled.

    :param x_data: argument forwarded unchanged to ``shuffle_krnl``
    :param rows: number of row indices to permute
    :param cols: size of the first global work dimension
    :return: device array holding the permuted row indices
    """
    permutation = np.arange(rows, dtype=cltypes.uint)
    np.random.shuffle(permutation)
    swaps_g = array.to_device(queue, permutation, allocator=read_only_arr)

    global_size = (cols, len(permutation) // 2)
    shuffle_krnl(queue, global_size, None, x_data, swaps_g.data).wait()
    return swaps_g
def cg_solve(self, x, iters):
    """
    Run up to *iters* conjugate-gradient style iterations on the device.

    The right-hand side is built by ``self.operator_rhs`` from ``self.data``.
    Each iteration applies ``self.operator_lhs``, adds
    ``self.reco_par["lambd"]`` times the search direction, and writes the new
    iterate to ``x[i + 1]``. The relative squared residual is appended to
    ``self.res`` before the loop and after every iteration.

    NOTE(review): assumes the first axis of *x* has at least ``iters + 1``
    entries -- confirm against the caller.

    :return: ``x`` as a host array; when the residual drops below
        ``self.reco_par["tol"]`` in iteration ``i`` only the first ``i + 1``
        entries are returned.
    """
    x = clarray.to_device(self.queue, np.require(x, requirements="C"))
    buffer_shape = (self.NScan, 1, self.NSlice, self.dimY, self.dimX)
    b = clarray.empty(self.queue, buffer_shape, DTYPE, "C")
    Ax = clarray.empty(self.queue, buffer_shape, DTYPE, "C")
    data = clarray.to_device(self.queue, self.data)
    self.operator_rhs(b, data)

    # The initial residual and search direction are the right-hand side.
    res = b
    p = res
    delta = np.linalg.norm(res.get())**2 / np.linalg.norm(b.get())**2
    self.res.append(delta)
    print("Initial Residuum: ", delta)

    for i in range(iters):
        self.operator_lhs(Ax, p)
        Ax = Ax + self.reco_par["lambd"] * p
        res_dot_res = clarray.vdot(res, res)
        alpha = (res_dot_res / (clarray.vdot(p, Ax))).real.get()
        x[i + 1] = (x[i] + alpha * p)
        res_new = res - alpha * Ax

        delta = (np.linalg.norm(res_new.get())**2
                 / np.linalg.norm(b.get())**2)
        self.res.append(delta)
        if delta < self.reco_par["tol"]:
            print("Converged after %i iterations to %1.3e." % (i, delta))
            return x.get()[:i + 1, ...]
        if not np.mod(i, 1):
            print("Residuum at iter %i : %1.3e" % (i, delta), end='\r')

        beta = (clarray.vdot(res_new, res_new)
                / clarray.vdot(res, res)).real.get()
        p = res_new + beta * p
        res, res_new = res_new, res
    return x.get()
def test_outoforderqueue_reductions(ctx_factory):
    """Compare sum, dot and all() reductions on an out-of-order queue with numpy."""
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(
            context,
            properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    # 0/1 values to avoid accumulated rounding error
    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32'))
    # all<5 looks true until near the end
    a[800000] = 10
    a_gpu = cl_array.to_device(queue, a)

    total = cl_array.sum(a_gpu).get()
    dotted = cl_array.dot(a_gpu, 3 - a_gpu).get()
    all_small = (a_gpu < 5).all().get()

    assert total == a.sum()
    assert dotted == a.dot(3 - a)
    assert all_small == 0
def test_adj_inplace(self):
    """Check adjointness of ``self.symgrad.fwd`` and ``self.symgrad.adj``.

    The inner product of the forward output with ``symdivin`` (components
    3:6 weighted by two) must match the inner product of ``symgradin`` with
    the negated adjoint output.
    """
    dev_grad_in = clarray.to_device(self.queue, self.symgradin)
    dev_div_in = clarray.to_device(self.queue, self.symdivin)
    dev_grad_out = clarray.zeros_like(dev_div_in)
    dev_div_out = clarray.zeros_like(dev_grad_in)

    dev_grad_out.add_event(self.symgrad.fwd(dev_grad_out, dev_grad_in))
    dev_div_out.add_event(self.symgrad.adj(dev_div_out, dev_div_in))

    host_grad = dev_grad_out.get()
    host_div = dev_div_out.get()

    n_elem = self.symgradin.size
    first_part = np.vdot(host_grad[..., :3].flatten(),
                         self.symdivin[..., :3].flatten()) / n_elem * 4
    second_part = 2 * np.vdot(host_grad[..., 3:6].flatten(),
                              self.symdivin[..., 3:6].flatten()) / n_elem * 4
    lhs = first_part + second_part
    rhs = np.vdot(self.symgradin.flatten(),
                  -host_div.flatten()) / n_elem * 4

    gap = lhs - rhs
    print("Adjointness: %.2e +1j %.2e" % (gap.real, gap.imag))
    np.testing.assert_allclose(lhs, rhs, rtol=RTOL, atol=ATOL)
def test_1d_out_of_place(self, ctx):
    """Transform 32 complex64 samples into a separate output array and compare with numpy's FFT."""
    queue = cl.CommandQueue(ctx)

    host_input = np.arange(32, dtype=np.complex64)
    device_input = cla.to_device(queue, host_input)
    device_output = cla.zeros_like(device_input)

    FFT(ctx, queue, device_input, device_output).enqueue()

    result = device_output.get()
    assert np.allclose(result, np.fft.fft(host_input))
def setupClVariables(self):
    """Create the zero-initialised host arrays and their device copies.

    The membrane coordinate and normal-vector arrays hold one float64 per
    detection angle step; the contour center is a single ``double2`` value.
    All device copies are created on ``self.managementQueue``.
    """
    self.nrOfDetectionAngleSteps = self.configReader.nrOfDetectionAngleSteps

    def zeroed_pair():
        # One host array of zeros plus its copy on the device.
        host = np.zeros(shape=self.nrOfDetectionAngleSteps, dtype=np.float64)
        return host, cl_array.to_device(self.managementQueue, host)

    (self.host_mostRecentMembraneCoordinatesX,
     self.dev_mostRecentMembraneCoordinatesX) = zeroed_pair()
    (self.host_mostRecentMembraneCoordinatesY,
     self.dev_mostRecentMembraneCoordinatesY) = zeroed_pair()
    (self.host_mostRecentMembraneNormalVectorsX,
     self.dev_mostRecentMembraneNormalVectorsX) = zeroed_pair()
    (self.host_mostRecentMembraneNormalVectorsY,
     self.dev_mostRecentMembraneNormalVectorsY) = zeroed_pair()

    self.host_contourCenter = np.zeros(1, cl.array.vec.double2)
    self.dev_mostRecentContourCenter = cl_array.to_device(
        self.managementQueue, self.host_contourCenter)
def __init__(self, data: Union[cl.array.Array, list, np.ndarray],
             gpu: bool = False) -> None:
    """Store *data* as float32 on the host or, when requested, on the device.

    A list or NumPy array is converted to a float32 NumPy array and
    uploaded to the device when *gpu* is true. A pyopencl array is stored
    as-is and always marks the instance as living on the gpu. Any other
    type raises ``TypeError``.
    """
    self._gpu: bool = gpu

    if isinstance(data, cl.array.Array):
        # Already on the device: keep it and force the gpu flag.
        self._data: cl.array.Array = data
        self._gpu = True
        return

    if isinstance(data, list):
        host_data = np.array(data, dtype=np.float32)
    elif isinstance(data, np.ndarray):
        if data.dtype == np.float32:
            host_data = data
        else:
            # NOTE: The NumPy array has to be converted into a list first.
            #       Otherwise, the operations on cpu and gpu produce
            #       different results. This behavior can be caused by many
            #       reasons including OpenCL and even the operating system
            #       itself. Some research is needed to figure out cause and
            #       eliminate extra work for rebuilding the array.
            host_data = np.array(data.tolist(), np.float32)
    else:
        raise TypeError(
            "Expected `list`, `np.ndarray`, or `pyopencl.array.Array` got "
            f"`{type(data)}`")

    if self._gpu:
        self._data = clarray.to_device(QUEUE, host_data)
    else:
        self._data = host_data
def __init__(self, ctx, queue, shape, coeffs):
    '''
    Create context for the Cyclic Reduction Solver
    that solves a "near-toeplitz"
    tridiagonal system with diagonals:
    a = (_, ai, ai .... an)
    b[:] = (b1, bi, bi, bi... bn)
    c[:] = (c1, ci, ci, ... _)

    Parameters
    ----------
    ctx: PyOpenCL context
    queue: PyOpenCL command queue
    shape: Sequence unpacked as (nz, ny, nx); nx must be a power of 2.
    coeffs: A list of coefficients that make up the tridiagonal matrix:
        [b1, c1, ai, bi, ci, an, bn]

    Raises
    ------
    AssertionError: if nx is not a power of 2.
    '''
    self.ctx = ctx
    self.queue = queue
    self.device = self.ctx.devices[0]
    self.platform = self.device.platform
    self.nz, self.ny, self.nx = shape
    self.coeffs = coeffs

    # check that system_size is a power of 2:
    # (builtin int is used because the np.int alias was removed from NumPy)
    log2_nx = np.log2(self.nx)
    assert int(log2_nx) == log2_nx, "nx must be a power of 2"

    # compute coefficients a, b, etc., and upload each one to the device
    (a, b, c, k1, k2,
     b_first, k1_first, k1_last) = self._precompute_coefficients()
    self.a_d = cl_array.to_device(queue, a)
    self.b_d = cl_array.to_device(queue, b)
    self.c_d = cl_array.to_device(queue, c)
    self.k1_d = cl_array.to_device(queue, k1)
    self.k2_d = cl_array.to_device(queue, k2)
    self.b_first_d = cl_array.to_device(queue, b_first)
    self.k1_first_d = cl_array.to_device(queue, k1_first)
    self.k1_last_d = cl_array.to_device(queue, k1_last)

    self.forward_reduction, self.back_substitution = kernels.get_funcs(
        self.ctx, 'kernels.cl',
        'globalForwardReduction', 'globalBackSubstitution')
def clfftn(data):
    """
    OpenCL FFT of *data*.

    A normalized plan is built for ``data.shape`` on the queue returned by
    ``clinit()``; the forward transform is executed on the device copy of
    *data*, and that same device array is read back and returned.
    """
    clear_first_arg_caches()
    _, queue = clinit()
    plan = Plan(data.shape, normalize=True, queue=queue)

    # Upload, run the forward transform on the device buffer, then download.
    device_data = cl_array.to_device(queue, data)
    plan.execute(device_data.data)
    return device_data.get()
def mhd_gamma_calc(queue, G, P, loc=Loci.CENT, out=None):
    """Find relativistic gamma-factor w.r.t. normal observer.

    Computes ``sqrt(1 + dot2geom2(g3, u, u))`` where ``u`` is the ``U3VEC``
    slice of *P* and ``g3`` is the ``[1:, 1:]`` block of ``G.gcov`` at *loc*.
    *out* is used as scratch space for the dot product; the returned array
    is the new one produced by ``clm.sqrt``.
    """
    global g3

    slices = G.slices
    shapes = G.shapes

    # NOTE(review): the module-level cache is filled on the first call only,
    # so later calls reuse that first loc and queue -- confirm this is
    # intended.
    if g3 is None:
        metric_block = G.gcov[loc.value, 1:, 1:].copy()
        g3 = cl_array.to_device(queue, metric_block)

    if out is None:
        out = cl_array.empty(queue, shapes.grid_scalar, dtype=np.float64)

    _evt, _ = G.dot2geom2(queue, g=g3, u=P[slices.U3VEC],
                          v=P[slices.U3VEC], out=out)
    return clm.sqrt(1. + out)
def test_zero_size_array(ctx_factory, empty_shape):
    """Check that arrays of shape *empty_shape* mirror numpy's flags, shapes and strides."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if queue.device.platform.name == "Intel(R) OpenCL":
        pytest.xfail("size-0 arrays fail on Intel CL")

    lhs = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    rhs = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    rhs.fill(1)
    c = lhs + rhs
    c_host = c.get()
    # The upload only has to succeed; its result is not inspected.
    cl_array.to_device(queue, c_host)

    contiguity_flags = ("c_contiguous", "f_contiguous")
    for flag in contiguity_flags:
        assert getattr(c.flags, flag) == getattr(c_host.flags, flag)

    for order in "CF":
        c_flat = c.reshape(-1, order=order)
        c_host_flat = c_host.reshape(-1, order=order)
        assert c_flat.shape == c_host_flat.shape
        assert c_flat.strides == c_host_flat.strides
        for flag in contiguity_flags:
            assert (getattr(c_flat.flags, flag)
                    == getattr(c_host_flat.flags, flag))
def __init__(self, ary, backend=None):
    """Wrap *ary* for the selected backend.

    For the 'opencl' backend the working precision is taken from
    ``get_config().use_double``, floating-point input is flagged for
    conversion, and the result of ``self._get_data()`` is uploaded to the
    device as ``self.dev``. For any other backend ``self.dev`` is the
    array itself.
    """
    self.backend = get_backend(backend)
    self.data = ary
    self._convert = False
    if self.backend == 'opencl':
        use_double = get_config().use_double
        self._dtype = np.float64 if use_double else np.float32
        # np.floating matches every floating-point dtype; the former
        # np.float alias was removed from NumPy.
        if np.issubdtype(self.data.dtype, np.floating):
            self._convert = True
        from pyopencl.array import to_device
        from .opencl import get_queue
        self.q = get_queue()
        self.dev = to_device(self.q, self._get_data())
    else:
        self.dev = self.data
def project_metaballs_naive(metaballs, shape, pixel_size, offset=None, z_step=None, queue=None,
                            out=None, block=False):
    """Project a list of :class:`.MetaBall` on an image plane with *shape*, *pixel_size*. *z_step*
    is the physical step in the z-dimension, if not specified it is the same as *pixel_size*.
    *offset* is the physical spatial body offset as (y, x). Use OpenCL *queue* and *out* pyopencl
    Array instance for returning the result. If *block* is True, wait for the kernel to finish.
    """
    def get_extrema(sgn):
        # Extreme z position over all metaballs, padded by the diameter
        # plus one pixel in the direction given by *sgn*.
        func = np.max if sgn > 0 else np.min
        x_ps = util.make_tuple(pixel_size)[1]
        res = [(ball.position[2] + sgn * (2 * ball.radius + x_ps)).simplified.magnitude
               for ball in metaballs]

        return func(res)

    if offset is None:
        offset = (0, 0) * q.m
    if not queue:
        queue = cfg.OPENCL.queue
    if out is None:
        out = cl_array.Array(queue, shape, cfg.PRECISION.np_float)

    # NOTE(review): assumes MetaBall.pack() returns a bytes object -- confirm.
    packed = b''.join([body.pack() for body in metaballs])
    # frombuffer replaces the deprecated binary mode of np.fromstring; the
    # copy keeps the host array writable, as the fromstring result was.
    data = np.frombuffer(packed, dtype=np.float32).copy()
    data = cl_array.to_device(queue, data)

    n, m = shape
    ps = util.make_tuple(pixel_size.simplified.magnitude)
    z_step = ps[1] if z_step is None else z_step.simplified.magnitude
    z_range = get_extrema(-1), get_extrema(1)
    offset = g_util.make_vfloat2(*offset.simplified.magnitude[::-1])

    # Launch on the caller's queue: the input and output arrays were created
    # on it, so using the global default queue here would ignore *queue*.
    ev = cfg.OPENCL.programs['geometry'].naive_metaballs(queue,
                                                         (m, n),
                                                         None,
                                                         out.data,
                                                         data.data,
                                                         np.int32(len(metaballs)),
                                                         offset,
                                                         g_util.make_vfloat2(*z_range),
                                                         cfg.PRECISION.np_float(z_step),
                                                         g_util.make_vfloat2(*ps[::-1]),
                                                         np.int32(True))
    if block:
        ev.wait()

    return out
def test_hankel_01_complex(ctx_factory, ref_src):
    """Compare ``clmath.hankel_01`` against a pyfmmlib or scipy reference."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if not has_double_support(ctx.devices[0]):
        pytest.skip(
            "no double precision support--cannot test complex bessel function")

    n = 10**6
    np.random.seed(11)
    # Magnitudes from 1e-5 to 1e2 with random complex phases.
    phases = np.exp(1j * 2 * np.pi * np.random.rand(n))
    z = np.logspace(-5, 2, n) * phases

    if ref_src == "pyfmmlib":
        pyfmmlib = pytest.importorskip("pyfmmlib")
        h0_ref, h1_ref = pyfmmlib.hank103_vec(z, ifexpon=1)
    elif ref_src == "scipy":
        spec = pytest.importorskip("scipy.special")
        h0_ref = spec.hankel1(0, z)
        h1_ref = spec.hankel1(1, z)
    else:
        raise ValueError("ref_src")

    z_dev = cl_array.to_device(queue, z)
    h0_dev, h1_dev = clmath.hankel_01(z_dev)

    rel_err_h0 = np.abs(h0_dev.get() - h0_ref) / np.abs(h0_ref)
    rel_err_h1 = np.abs(h1_dev.get() - h1_ref) / np.abs(h1_ref)

    max_rel_err_h0 = np.max(rel_err_h0)
    max_rel_err_h1 = np.max(rel_err_h1)
    print("H0", max_rel_err_h0)
    print("H1", max_rel_err_h1)

    assert max_rel_err_h0 < 4e-13
    assert max_rel_err_h1 < 2e-13

    # Debugging aid: flip the condition to plot the error over |z|.
    if 0:
        import matplotlib.pyplot as pt
        magnitude = np.abs(z)
        pt.loglog(magnitude, rel_err_h0)
        pt.loglog(magnitude, rel_err_h1)
        pt.show()
def step(self, delta_time):
    """Advance the simulation by ``delta_time * self.dt``.

    Does nothing while the module-level ``pause`` flag is set. Otherwise
    the body kernel is run once per galaxy on the shared GL buffers, and
    afterwards the galaxy centres themselves are moved on the host using
    their pairwise attraction.
    """
    if pause:
        return

    # Pack (x, y, z, mass) of every galaxy centre for the kernel.
    packed_centers = np.ndarray((self.galaxy_count, 4), dtype=np.float32)
    for idx in range(self.galaxy_count):
        packed_centers[idx][:3] = self.galaxies[idx].position
        packed_centers[idx][3] = self.galaxies[idx].mass
    centers_buffer = clarray.to_device(cl_queue, packed_centers)

    # Make sure GL is done with the buffers before OpenCL acquires them.
    gl.glFlush()
    gl.glFinish()

    for galaxy in self.galaxies:
        shared_buffers = [
            galaxy.body_positions_cl_buffer,
            galaxy.body_velocities_cl_buffer,
        ]
        cl.enqueue_acquire_gl_objects(cl_queue, shared_buffers)
        kernel_step(cl_queue, (galaxy.body_count, ), None,
                    galaxy.body_positions_cl_buffer,
                    galaxy.body_velocities_cl_buffer,
                    centers_buffer.data,
                    np.uint(galaxy.body_count),
                    np.uint(self.galaxy_count),
                    np.float32(self.dt * delta_time),
                    np.float32(self.G))
        cl.enqueue_release_gl_objects(cl_queue, shared_buffers)
        cl_queue.finish()

    # Snapshot of the centres so every force uses pre-update positions.
    centers = [
        mathutils.Vector((g.position.x, g.position.y, g.position.z, g.mass))
        for g in self.galaxies
    ]
    for i in range(self.galaxy_count):
        this_galaxy = self.galaxies[i]
        f = mathutils.Vector((0, 0, 0))
        for j in self.others(i):
            delta_pos = mathutils.Vector(centers[j] - centers[i]).xyz
            # NOTE(review): this clamped value is never used; the force
            # below divides by the unclamped squared length -- confirm
            # whether the clamp was meant to be applied.
            length = max(1.0, delta_pos.length_squared)
            f += delta_pos.normalized() * self.G * centers[i][3] * centers[
                j][3] / delta_pos.length_squared
        this_galaxy.velocity += f * delta_time * self.dt
        this_galaxy.position += this_galaxy.velocity * delta_time * self.dt
def setUp(self):
    """Build the image-space operator and random complex test inputs.

    OpenCL is set up through ``pyqmri.pyqmri._setupOCL`` with a
    non-streamed GPU configuration; the kernel source matching ``DTYPE``
    is compiled for every context, and the first program is used.
    """
    parser = tmpArgs()
    parser.streamed = False
    parser.devices = -1
    parser.use_GPU = True

    par = {}
    pyqmri.pyqmri._setupOCL(parser, par)
    setupPar(par)

    if DTYPE == np.complex128:
        kernel_name = 'kernels/OpenCL_Kernels_double.c'
    else:
        kernel_name = 'kernels/OpenCL_Kernels.c'
    kernel_path = resource_filename('pyqmri', kernel_name)

    programs = []
    for ctx in par["ctx"]:
        with open(kernel_path) as kernel_file:
            programs.append(Program(ctx, kernel_file.read()))
    prg = programs[0]

    self.op = pyqmri.operator.OperatorImagespace(
        par, prg,
        DTYPE=DTYPE,
        DTYPE_real=DTYPE_real)

    def complex_randn(*shape):
        # Real part is drawn first, then the imaginary part.
        return np.random.randn(*shape) + 1j * np.random.randn(*shape)

    volume = (par["NSlice"], par["dimY"], par["dimX"])
    self.opinfwd = complex_randn(par["unknowns"], *volume).astype(DTYPE)
    self.opinadj = complex_randn(par["NScan"], 1, *volume).astype(DTYPE)
    self.model_gradient = complex_randn(
        par["unknowns"], par["NScan"], *volume).astype(DTYPE)

    self.queue = par["queue"][0]
    self.grad_buf = clarray.to_device(self.queue, self.model_gradient)
def transfer(thickness, refractive_index, wavelength, exponent=False, queue=None, out=None,
             check=True, block=False):
    """Transfer *thickness* (can be either a numpy or pyopencl array) with *refractive_index* and
    given *wavelength*. If *exponent* is True, compute the exponent of the function without
    applying the wavenumber. Use command *queue* for computation and *out* pyopencl array. If
    *block* is True, wait for the kernel to finish. If *check* is True, the function is checked
    for aliasing artefacts. Returned *out* array is different from the input one because of the
    pyopencl.clmath behavior.
    """
    if queue is None:
        queue = cfg.OPENCL.queue

    if isinstance(thickness, cl_array.Array):
        thickness_mem = thickness
    else:
        host_thickness = thickness.simplified.magnitude.astype(cfg.PRECISION.np_float)
        thickness_mem = cl_array.to_device(queue, host_thickness)

    if out is None:
        out = cl_array.Array(queue, thickness_mem.shape, cfg.PRECISION.np_cplx)

    global_size = thickness_mem.shape[::-1]

    if not (exponent or check):
        # Neither the raw exponent nor the sampling check is needed, so the
        # transfer function is computed by a single kernel.
        ev = cfg.OPENCL.programs['physics'].transfer(
            queue, global_size, None,
            out.data, thickness_mem.data,
            cfg.PRECISION.np_cplx(refractive_index),
            cfg.PRECISION.np_float(wavelength.simplified.magnitude))
    else:
        wavenumber = cfg.PRECISION.np_float(2 * np.pi / wavelength.simplified.magnitude)
        ev = cfg.OPENCL.programs['physics'].transmission_add(
            queue, global_size, None,
            out.data, thickness_mem.data,
            cfg.PRECISION.np_cplx(refractive_index),
            wavenumber, np.int32(1))
        if check and not is_wavefield_sampling_ok(out, queue=queue):
            LOG.error('Insufficient transmission function sampling')
        if not exponent:
            # Apply the exponent
            out = clmath.exp(out, queue=queue)

    if block:
        ev.wait()

    return out
def test_outoforderqueue_copy(ctx_factory):
    """Check ``Array.copy`` followed by an in-place multiply on an out-of-order queue."""
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(
            context,
            properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    a = np.random.rand(10**6).astype(np.dtype('float32'))
    a_gpu = cl_array.to_device(queue, a)
    c_gpu = a_gpu**2 - 7
    # testing that this waits for and creates events
    b_gpu = c_gpu.copy()
    b_gpu *= 10
    queue.finish()

    device_result = b_gpu.get()
    host_result = 10 * (a**2 - 7)
    mean_error = np.abs(device_result - host_result).mean()
    assert mean_error < 1e-5
def probabilities(self):
    """Gets the squared absolute value of each of the amplitudes.

    A zeroed float32 array with ``2**num_qubits`` entries is filled by the
    ``calculate_probabilities`` kernel from ``self.buffer`` and returned as
    a host array.
    """
    n_states = 2**self.num_qubits
    host_zeros = np.zeros(n_states, dtype=np.float32)
    out = pycl_array.to_device(self.queue, host_zeros)

    program.calculate_probabilities(
        self.queue, out.shape, None,
        self.buffer.data, out.data)
    return out.get()
def _init_reference_field(self, scale_ref=1):
    """Compute and store the reference object field and far field.

    The masked entries of every object subfield are zeroed, the full-field
    propagator is run, the far field is scaled in place by *scale_ref*, and
    copies of both fields are kept. The reference far field is also
    uploaded to the device.
    """
    # clear object patches
    subfields = self.object_multiareafield.subfields
    masks = self.object_multiareafield.subfields_masks
    for subfield, mask in zip(subfields, masks):
        np.copyto(subfield.field, 0, where=mask)

    # obtain reference field
    self.propagator_object_to_farfield.propagator_full_field.propagate()
    self.object_field_ref = self.object_multiareafield.field.copy()

    self.far_field.field *= scale_ref
    self.far_field_ref = self.far_field.copy()
    self.cl_far_field_ref = cla.to_device(
        self.cl_queue,
        self.far_field_ref.field,
        allocator=self.cl_allocator)
def test_grad_outofplace(self):
    """Compare ``self.grad.fwdoop`` with forward differences computed by numpy.

    The numpy reference uses ``np.diff`` along the last three axes; the
    difference along axis -3 is scaled by ``self.dz``, and the last entry
    along each differenced axis stays zero.
    """
    source = self.gradin

    diff_x = np.zeros_like(source)
    diff_y = np.zeros_like(source)
    diff_z = np.zeros_like(source)
    diff_x[..., :-1] = np.diff(source, axis=-1)
    diff_y[..., :-1, :] = np.diff(source, axis=-2)
    diff_z[:, :-1, ...] = np.diff(source, axis=-3) * self.dz
    expected = np.stack((diff_x, diff_y, diff_z), axis=-1)

    device_input = clarray.to_device(self.queue, self.gradin)
    result = self.grad.fwdoop(device_input).get()

    # Only all but the last component of the device output is compared.
    np.testing.assert_allclose(result[..., :-1], expected,
                               rtol=RTOL, atol=ATOL)
def test_outoforderqueue_clmath(ctx_factory):
    """Check chained ``clmath`` calls on an out-of-order queue against numpy."""
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(
            context,
            properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    a = np.random.rand(10**6).astype(np.dtype('float32'))
    a_gpu = cl_array.to_device(queue, a)
    # testing that clmath functions wait for and create events
    sine_gpu = clmath.sin(a_gpu * 5)
    b_gpu = clmath.fabs(sine_gpu)
    queue.finish()

    device_result = b_gpu.get()
    host_result = np.abs(np.sin(a * 5))
    mean_error = np.abs(device_result - host_result).mean()
    assert mean_error < 1e-5
def test_identity_matrix_random_vector():
    """Multiply an identity matrix by a ``clsingle.random`` vector.

    Each of the first ``out_size`` outputs is expected to approximately
    equal the matching input element.
    """
    inp_layer = clsingle.random(inp_size)

    # Row i is True only in column i, giving an out_size x out_size identity.
    identity_rows = [np.arange(out_size) == row for row in range(out_size)]
    host_matrix = np.array(identity_rows).astype(np.float32)
    matrix = pycl_array.to_device(clsingle.queue, host_matrix)
    out_layer = clsingle.ones(out_size)

    global_size = (out_size, TS)
    local_size = (WPT, TS)
    code.program.matrix_vector_mul(clsingle.queue, global_size, local_size,
                                   inp_size, RESET_OUTPUT,
                                   inp_layer.data, matrix.data,
                                   out_layer.data)

    result = out_layer.get()
    source = inp_layer.get()
    for idx in range(out_size):
        assert result[idx] == pytest.approx(source[idx])