def test_sum(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 200000
    for dtype in [np.float32, np.complex64]:
        a_gpu = general_clrand(queue, (n,), dtype)

        a = a_gpu.get()

        for slc in [
                slice(None),
                slice(1000, 3000),
                slice(1000, -3000),
                slice(1000, None),
                slice(1000, None, 3),
                slice(1000, 1000),
                ]:
            sum_a = np.sum(a[slc])

            if sum_a:
                ref_divisor = abs(sum_a)
            else:
                ref_divisor = 1

            if slc.step is None:
                sum_a_gpu = cl_array.sum(a_gpu[slc]).get()
                assert abs(sum_a_gpu - sum_a) / ref_divisor < 1e-4

            sum_a_gpu_2 = cl_array.sum(a_gpu, slice=slc).get()
            assert abs(sum_a_gpu_2 - sum_a) / ref_divisor < 1e-4
def test_sum(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 200000
    for dtype in [np.float32, np.complex64]:
        a_gpu = general_clrand(queue, (n,), dtype)

        a = a_gpu.get()

        for slc in [
                slice(None),
                slice(1000, 3000),
                slice(1000, -3000),
                slice(1000, None),
                slice(1000, None, 3),
                ]:
            sum_a = np.sum(a[slc])

            if slc.step is None:
                sum_a_gpu = cl_array.sum(a_gpu[slc]).get()
                assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4

            sum_a_gpu_2 = cl_array.sum(a_gpu, slice=slc).get()
            assert abs(sum_a_gpu_2 - sum_a) / abs(sum_a) < 1e-4
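# A minimal, self-contained sketch of the pattern the test_sum variants above
# exercise: reduce on the device with pyopencl.array.sum and compare against
# numpy.sum on the host. This is an illustration, not part of the original
# test suite; it assumes a default OpenCL device can be created.
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host_a = np.random.rand(200000).astype(np.float32)
dev_a = cl_array.to_device(queue, host_a)

dev_sum = cl_array.sum(dev_a).get()   # .get() pulls the 0-d result to the host
assert abs(dev_sum - host_a.sum()) / abs(host_a.sum()) < 1e-4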
def _preserve_power(self, real, imag, speckles, stage):
    assert stage in ('in', 'out'), "unrecognized _preserve_power stage %s" % stage

    if stage == 'in':
        # get m0 and the speckle power from the components.
        # m0 is a float2 which stores the (0,0) component of re and im.
        # replace the (0,0) component with the average of the surrounding neighbors
        # to prevent envelope distortion due to non-zero average magnetization.
        self.get_m0.execute(self.queue, (1,), real.data, imag.data, self.m0_1.data)
        self.make_speckle(real, imag, speckles)
        self.power_in = cla.sum(speckles).get() - (self.m0_1.get()[0])**2
        self.replace_dc_component1.execute(self.queue, (1,), speckles.data, speckles.data, np.int32(self.N))

    if stage == 'out':
        # preserve the total amount of speckle power outside the (0,0) component.
        # put m0_1 back as the (0,0) component so that the average magnetization
        # is not affected by the rescaling.
        self.get_m0.execute(self.queue, (1,), real.data, imag.data, self.m0_2.data)
        self.make_speckle(real, imag, speckles)
        self.power_out = cla.sum(speckles).get() - (self.m0_2.get()[0])**2
        ratio = (np.sqrt(self.power_in / self.power_out)).astype(np.float32)
        self.scalar_multiply(ratio, real)
        self.scalar_multiply(ratio, imag)
        self.replace_dc_component2.execute(self.queue, (1,), real.data, imag.data, self.m0_1.data)
def sum(ary, backend=None):
    if backend is None:
        backend = ary.backend
    if backend == 'cython':
        return np.sum(ary.dev)
    if backend == 'opencl':
        import pyopencl.array as gpuarray
        return gpuarray.sum(ary.dev).get()
    if backend == 'cuda':
        import pycuda.gpuarray as gpuarray
        return gpuarray.sum(ary.dev).get()
def pol_vor_rho(self, Y, px, py):
    '''return: polarization, vorticity, density at given Y, px, py'''
    self.prg.polarization_on_sf(self.queue, (self.size_sf,), None,
                                self.d_pol.data, self.d_vor.data, self.d_rho.data,
                                self.d_smu, self.d_umu, self.d_omegaY, self.d_etas,
                                np.float32(Y), np.float32(px), np.float32(py),
                                np.int32(self.size_sf)).wait()

    polarization = cl_array.sum(self.d_pol).get()
    vorticity = cl_array.sum(self.d_vor).get()
    density = cl_array.sum(self.d_rho).get()
    return polarization, vorticity, density
def get_divergence_error(vector):
    for mu in range(3):
        fft.idft(vector[mu], vector_x[mu])

    derivs.divergence(queue, vector_x, div)
    derivs(queue, fx=vector_x[0], pdx=pdx[0])
    derivs(queue, fx=vector_x[1], pdy=pdx[1])
    derivs(queue, fx=vector_x[2], pdz=pdx[2])

    norm = sum([clm.fabs(pdx[mu]) for mu in range(3)])

    max_err = cla.max(clm.fabs(div)) / cla.max(norm)
    avg_err = cla.sum(clm.fabs(div)) / cla.sum(norm)

    return max_err, avg_err
def get_total_energy_and_entropy_on_gpu(self, tau, d_ev):
    NX, NY, NZ = self.cfg.NX, self.cfg.NY, self.cfg.NZ
    self.kernel_bulk.total_energy_and_entropy(self.queue, (NX, NY, NZ), None,
                                              self.a_ed.data, self.a_entropy.data,
                                              d_ev, self.eos_table,
                                              np.float32(tau)).wait()
    volum = tau * self.cfg.DX * self.cfg.DY * self.cfg.DZ
    e_total = cl_array.sum(self.a_ed).get() * volum
    s_total = cl_array.sum(self.a_entropy).get() * volum
    self.energy.append(e_total)
    self.entropy.append(s_total)
def __call__(self, im, nrays, nsamples, ray_step, seed_pt, cutoff, thresh):
    nrays = int(nrays)
    nsamples = int(nsamples)
    cutoff = np.int32(cutoff)
    arrays = self.setup_arrays(nrays, nsamples, cutoff)
    prog = self.build_program(nrays, nsamples, ray_step)

    prog.sample_rays(self.queue, (nsamples, nrays), None,
                     arrays.scratch.data, im,
                     np.float32(seed_pt[0]), np.float32(seed_pt[1]))

    # take the region in the cutoff zone
    cla.take(arrays.scratch, arrays.idx, out=arrays.pre_cutoff)

    # plt.imshow(self.pre_cutoff.get())
    # plt.show()

    self.square_array(arrays.pre_cutoff, arrays.pre_cutoff_squared)

    inside_mean = cla.sum(arrays.pre_cutoff).get() / (cutoff * nrays)
    inside_sumsq = cla.sum(arrays.pre_cutoff_squared).get() / (cutoff * nrays)
    inside_std = np.sqrt(inside_sumsq - inside_mean ** 2)
    normed_thresh = inside_std * thresh

    prog.scan_boundary(self.queue, (nrays,), None,
                       arrays.result.data, arrays.scratch.data,
                       np.float32(normed_thresh))

    # print normed_thresh
    # plt.figure()
    # plt.hold(True)
    # plt.imshow(arrays.scratch.get())
    # plt.plot(np.arange(0, nrays), arrays.result.get())
    # plt.show()

    return arrays.result.get()
def mean(t: Tensor) -> np.float32:
    """The mean of the values in a tensor."""
    if t.gpu:
        return clarray.sum(t._data).get().flat[0] / t._data.size
    return np.mean(t._data)
def sum(t: Tensor) -> np.float32:
    """The sum of the values in a tensor."""
    if t.gpu:
        return clarray.sum(t._data).get().flat[0]
    return np.sum(t._data)
def _calcResidual(self, step_out, tmp_results, step_in, data):
    f_new = clarray.vdot(tmp_results["DADA"], tmp_results["DAd"]) + clarray.sum(
        self.lambd * clmath.log(
            1 + clarray.vdot(tmp_results["gradx"], tmp_results["gradx"])))
    # TODO: calculate on GPU
    f_new = np.linalg.norm(f_new.get())
    grad_f = np.linalg.norm(tmp_results["gradFx"].get())

    # TODO: datacosts calculate or get from outside!!!!
    # datacost = 0  # self._fval_init
    # TODO: calculate on GPU
    datacost = 2 * np.linalg.norm(tmp_results["Ax"] - data) ** 2
    # datacost = 2 * np.linalg.norm(data - b) ** 2
    # self._FT.FFT(b, clarray.to_device(
    #     self._queue[0], (self._step_val[:, None, ...] *
    #                      self.par["C"]))).wait()
    # b = b.get()
    # datacost = 2 * np.linalg.norm(data - b) ** 2

    # TODO: calculate on GPU
    L2Cost = np.linalg.norm(step_out["x"].get()) / (2.0 * self.delta)
    regcost = self.lambd * np.sum(
        np.abs(
            clmath.log(
                1 + clarray.vdot(tmp_results["gradx"], tmp_results["gradx"])
            ).get()))

    costs = datacost + L2Cost + regcost
    return costs, f_new, grad_f
def gs_mod_gpu(idata, itera=10, osize=256):
    cut = osize // 2

    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue, dtype=complex128)  # does not work with "complex128"

    src = str(
        Template(KERNEL).render(
            double_support=all(has_double_support(dev) for dev in devices),
            amd_double_support=all(
                has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue, ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    mask = exp(2.j * pi * random(idata.shape))
    mask[512 - cut:512 + cut, 512 - cut:512 + cut] = 0

    idata_gpu = cl_array.to_device(
        queue, ifftshift(idata + mask).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    # note: to_device takes the queue as its first argument
    error_gpu = cl_array.to_device(queue, zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data, fdata_gpu.data)

    e = 1000
    ea = 1000

    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        #~ prg.norm1(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data, error_gpu.data, int32(cut))
        norm1 = prg.norm1
        norm1.set_scalar_arg_dtypes([None, None, None, int32])
        norm1(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data,
              error_gpu.data, int32(cut))

        e = sqrt(cl_array.sum(error_gpu).get()) / (2 * cut)

        #~ if e > ea:
        #~     break
        #~ ea = e

        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()
    fdata = ifftshift(fdata)
    fdata = exp(1.j * angle(fdata))
    return fdata
def get_divergence_errors(hij):
    max_errors = []
    avg_errors = []
    for i in range(1, 4):
        for mu in range(3):
            fft.idft(hij[tensor_id(i, mu + 1)], vector_x[mu])

        derivs.divergence(queue, vector_x, div)
        derivs(queue, fx=vector_x[0], pdx=pdx[0])
        derivs(queue, fx=vector_x[1], pdy=pdx[1])
        derivs(queue, fx=vector_x[2], pdz=pdx[2])

        norm = sum([clm.fabs(pdx[mu]) for mu in range(3)])

        max_errors.append(cla.max(clm.fabs(div)) / cla.max(norm))
        avg_errors.append(cla.sum(clm.fabs(div)) / cla.sum(norm))

    return np.array(max_errors), np.array(avg_errors)
def minZerrKernSHG_gpu(self):
    krn = self.progs.progs["minZerrSHG"].minZerrSHG
    krn.set_scalar_arg_dtypes((None, None, None, None, None, None, None, None, np.int32))
    krn.set_args(
        self.Esig_t_tau_p_cla.data,
        self.Et_cla.data,
        self.dZ_cla.data,
        self.X0_cla.data,
        self.X1_cla.data,
        self.X2_cla.data,
        self.X3_cla.data,
        self.X4_cla.data,
        self.N,
    )
    ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
    ev.wait()

    krn = self.progs.progs["normEsig"].normEsig
    krn.set_scalar_arg_dtypes((None, None, np.int32))
    krn.set_args(self.Esig_t_tau_p_cla.data, self.Esig_t_tau_norm_cla.data, self.N)
    ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_t_tau_p.shape, None)
    ev.wait()
    mx = cla.max(self.Esig_t_tau_norm_cla).get() * self.N * self.N
    # Esig_t_tau = self.Esig_t_tau_p_cla.get()
    # mx = ((Esig_t_tau*Esig_t_tau.conj()).real).max() * self.N*self.N

    X0 = cla.sum(self.X0_cla, queue=self.q).get() / mx
    X1 = cla.sum(self.X1_cla, queue=self.q).get() / mx
    X2 = cla.sum(self.X2_cla, queue=self.q).get() / mx
    X3 = cla.sum(self.X3_cla, queue=self.q).get() / mx
    X4 = cla.sum(self.X4_cla, queue=self.q).get() / mx

    root.debug("".join(("X0=", str(X0), ", type ", str(type(X0)))))
    root.debug(
        "".join(("Poly: ", str(X4), " x^4 + ", str(X3), " x^3 + ", str(X2),
                 " x^2 + ", str(X1), " x + ", str(X0))))

    # Polynomial in dZ (expansion of differential)
    X = np.array([X0, X1, X2, X3, X4]).astype(np.double)

    root.debug("".join(("Esig_t_tau_p norm max: ", str(mx / (self.N * self.N)))))
    return X
def _cl_count_complexes(self, weight):
    # Count all sampled complexes
    self._cl_tot_complex += cl_array.sum(self._cl_interspace,
                                         dtype=np.dtype(np.float32)) * weight
    self._cl_kernels.set_to_i32(np.int32(0), self._cl_hist)
    self._cl_kernels.histogram(self.queue, self._cl_red_interspace, self._cl_hist)
    self._cl_kernels.multiply_add(self._cl_hist, weight, self._cl_consistent_complexes)
    self.queue.finish()
def _ggr_spa_error(self, spa):
    # promote the available spins by value target*spa. store in the spa_buffer.
    # bound spa_buffer.
    # calculate the new total magnetization for this spa value. the difference
    # of total and desired is the error function.
    self.ggr_promote_spins(self.domains, self.available, self.spa_buffer, self.target * spa)
    self.bound(self.spa_buffer, self.spa_buffer)
    buffer_average = (cla.sum(self.spa_buffer).get()) / self.N2
    e = abs(buffer_average - self.goal_m)
    #print " %.6e, %.3e"%(spa,e)
    return e
def sum(*args, **kwargs):
    a = args[0]
    if a.ndim == 0 or not 'axis' in kwargs.keys():
        res = clarray.sum(a, queue=queue)  # np.sum(*args, **kwargs)
        if not isinstance(res, myclArray):
            res.__class__ = myclArray
            res.reinit()
        return res
    else:
        kwargs['prg2load'] = programs.sum
        return _sum(*args, **kwargs)
def alter_sum():
    ctx = cl_init()
    queue = cl.CommandQueue(ctx)

    n = 10**6
    a_gpu = cl_array.to_device(queue, np.random.randn(n).astype(np.float32))
    b_gpu = cl_array.to_device(queue, np.random.randn(n).astype(np.float32))

    cl_sum = cl_array.sum(a_gpu).get()
    numpy_sum = np.sum(a_gpu.get())
    print(cl_sum, numpy_sum)
def test_sum(ctx_getter):
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_gpu = clrand(context, queue, (200000,), np.float32)
    a = a_gpu.get()

    sum_a = np.sum(a)
    sum_a_gpu = cl_array.sum(a_gpu).get()

    assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
def radon_normest(queue, r_struct):
    img = clarray.to_device(
        queue, require(random.randn(*r_struct[1]), float32, 'F'))
    sino = clarray.zeros(queue, r_struct[2], dtype=float32, order='F')

    V = (radon(sino, img, r_struct, wait_for=img.events))

    for i in range(10):
        normsqr = float(clarray.sum(img).get())
        img /= normsqr
        sino.add_event(radon(sino, img, r_struct, wait_for=img.events))
        img.add_event(radon_ad(img, sino, r_struct, wait_for=sino.events))

    return sqrt(normsqr)
def test_sum(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 200000
    for dtype in [np.float32, np.complex64]:
        a_gpu = general_clrand(queue, (n,), dtype)

        a = a_gpu.get()

        sum_a = np.sum(a)
        sum_a_gpu = cl_array.sum(a_gpu).get()

        assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
def test_outoforderqueue_reductions(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")
    # 0/1 values to avoid accumulated rounding error
    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32'))
    a[800000] = 10  # all<5 looks true until near the end
    a_gpu = cl_array.to_device(queue, a)
    b1 = cl_array.sum(a_gpu).get()
    b2 = cl_array.dot(a_gpu, 3 - a_gpu).get()
    b3 = (a_gpu < 5).all().get()
    assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0
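# Hedged aside on the rounding comment in the test above: pyopencl.array.sum
# accepts a dtype argument, so a float32 array can be accumulated in float64
# to reduce round-off in large reductions. Standalone sketch, assuming a
# default OpenCL device is available.
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

a_gpu = cl_array.to_device(queue, np.random.rand(10**6).astype(np.float32))
total64 = cl_array.sum(a_gpu, dtype=np.float64).get()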
def check_convergence(self):
    # calculate the difference of the previous domains (self.incoming) and the
    # current domains (self.domains).
    self.array_diff(self.incoming, self.domains, self.domain_diff)

    # sum the difference array and divide by the area of the simulation as a
    # metric of how much the two domain configurations differ.
    self.power = (cla.sum(self.domain_diff).get()) / self.N2
    self.powerlist.append(self.power)

    # set the convergence condition
    if self.power > self.converged_at:
        self.converged = False
    if self.power <= self.converged_at:
        self.converged = True

    if 'converged' in self.returnables_list and self.converged:
        self.returnables['converged'] = self.domains.get()
def test_sum(ctx_getter):
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_gpu = clrand(context, queue, (200000,))
    a = a_gpu.get()

    sum_a = numpy.sum(a)

    from pycuda.reduction import get_sum_kernel
    sum_a_gpu = cl_array.sum(a_gpu).get()

    assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
def sum(q, a, axis=None, out=None, keepdims=False):
    if axis is None or a.ndim <= 1:
        out_shape = (1,) * a.ndim if keepdims else ()
        return clarray.sum(a).reshape(out_shape)

    if axis < 0:
        axis += 2
    if axis > 1:
        raise ValueError('invalid axis')

    if a.flags.c_contiguous:
        m, n = a.shape
        lda = a.shape[1]
        transA = True if axis == 0 else False
        sum_axis, out_axis = (m, n) if axis == 0 else (n, m)
    else:
        n, m = a.shape
        lda = a.shape[0]
        transA = False if axis == 0 else True
        sum_axis, out_axis = (n, m) if axis == 0 else (m, n)

    ones = clarray.empty(q, (sum_axis,), a.dtype).fill(1.0)

    if keepdims:
        out_shape = (1, out_axis) if axis == 0 else (out_axis, 1)
    else:
        out_shape = (out_axis,)

    if out is None:
        out = clarray.zeros(q, out_shape, a.dtype)
    else:
        assert out.dtype == a.dtype
        assert out.size >= out_axis

    if a.dtype == np.float32:
        gemv = clblaswrap.sgemv
    elif a.dtype == np.float64:
        gemv = clblaswrap.dgemv
    else:
        raise TypeError('Unsupported array type: %s' % str(a.dtype))

    alpha = 1.0
    beta = 0.0
    ev = gemv(q, transA, m, n, alpha, a, lda, ones, 1, beta, out, 1)
    ev.wait()
    return out
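# Host-side, NumPy-only illustration of the trick the axis-wise sum above
# relies on: summing a matrix along an axis is a matrix-vector product with a
# vector of ones, which is what the clBLAS gemv call computes on the device.
import numpy as np

A = np.arange(6, dtype=np.float32).reshape(2, 3)
assert np.allclose(A @ np.ones(3, dtype=np.float32), A.sum(axis=1))    # axis=1
assert np.allclose(A.T @ np.ones(2, dtype=np.float32), A.sum(axis=0))  # axis=0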
def find_contacts(self, predict=True):
    """Call the find_contacts kernel.

    Assumes that cell_centers, cell_dirs, cell_lens, cell_rads, cell_sqs,
    cell_dcenters, cell_dlens, cell_dangs, sorted_ids, and sq_inds are current
    on the device.

    Calculates cell_n_cts, ct_frs, ct_tos, ct_dists, ct_pts, ct_norms,
    ct_reldists, and n_cts.
    """
    if predict:
        centers = self.pred_cell_centers_dev
        dirs = self.pred_cell_dirs_dev
        lens = self.pred_cell_lens_dev
    else:
        centers = self.cell_centers_dev
        dirs = self.cell_dirs_dev
        lens = self.cell_lens_dev

    self.program.find_plane_contacts(
        self.queue, (self.n_cells,), None,
        numpy.int32(self.max_cells), numpy.int32(self.max_contacts),
        numpy.int32(self.n_planes),
        self.plane_pts_dev.data, self.plane_norms_dev.data, self.plane_coeffs_dev.data,
        centers.data, dirs.data, lens.data, self.cell_rads_dev.data,
        self.cell_n_cts_dev.data, self.ct_frs_dev.data, self.ct_tos_dev.data,
        self.ct_dists_dev.data, self.ct_pts_dev.data, self.ct_norms_dev.data,
        self.ct_reldists_dev.data, self.ct_stiff_dev.data).wait()

    self.program.find_contacts(
        self.queue, (self.n_cells,), None,
        numpy.int32(self.max_cells), numpy.int32(self.n_cells),
        numpy.int32(self.grid_x_min), numpy.int32(self.grid_x_max),
        numpy.int32(self.grid_y_min), numpy.int32(self.grid_y_max),
        numpy.int32(self.n_sqs), numpy.int32(self.max_contacts),
        centers.data, dirs.data, lens.data, self.cell_rads_dev.data,
        self.cell_sqs_dev.data, self.sorted_ids_dev.data, self.sq_inds_dev.data,
        self.cell_n_cts_dev.data, self.ct_frs_dev.data, self.ct_tos_dev.data,
        self.ct_dists_dev.data, self.ct_pts_dev.data, self.ct_norms_dev.data,
        self.ct_reldists_dev.data, self.ct_stiff_dev.data,
        self.ct_overlap_dev.data).wait()

    # set dtype to int32 so we don't overflow the int32 when summing
    #self.n_cts = self.cell_n_cts_dev.get().sum(dtype=numpy.int32)
    self.n_cts = cl_array.sum(self.cell_n_cts_dev[0:self.n_cells]).get()
def test_sum(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 200000
    for dtype in [np.float32, np.complex64]:
        a_gpu = general_clrand(queue, (n,), dtype)

        a = a_gpu.get()

        for slc in [
                slice(None),
                slice(1000, 3000),
                slice(1000, -3000),
                slice(1000, None),
                ]:
            sum_a = np.sum(a[slc])
            sum_a_gpu = cl_array.sum(a_gpu[slc]).get()

            assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
def _sum(a, axis=None, dtype=None, out=None, prg2load=programs.sum):
    # Transpose first to shift the target axis to the end.
    # Do not transpose if the axis already is the end.
    if axis == None:
        res = clarray.sum(a, queue=queue)
        if not isinstance(res, myclArray):
            res.__class__ = myclArray
            res.reinit()
        return res
    olddims = np.array(a.shape, dtype=np.uint32)
    replaces = np.append(np.delete(np.arange(a.ndim), axis, 0), [axis], 0).astype(np.uint32)
    if axis != a.ndim - 1:
        clolddims = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=olddims)
        clreplaces = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=replaces)
        cltrresult = cl.Buffer(ctx, mf.READ_WRITE, a.nbytes)
        program = programs.transpose(a.dtype, a.ndim)
        program.mitransp(queue, (a.size,), None, clolddims, clreplaces, a.data, cltrresult)
    else:
        cltrresult = a.data
    program = prg2load(a.dtype, a.shape[axis])
    # Sum over the last axis
    result = empty(tuple(olddims[replaces[:-1]]), a.dtype)
    program.misum(queue, (int(a.size // a.shape[axis]),), None, cltrresult, result.data)
    return result
def _gpu_search(self):
    """Method that actually performs the exhaustive search on the GPU"""

    # make shortcuts
    d = self.data
    g = self.gpu_data
    q = self.queue
    k = g['k']

    # initalize the total number of sampled complexes
    tot_complexes = cl_array.sum(g['interspace'], dtype=np.float32)

    # initialize time
    time0 = _time()

    # loop over all rotations
    for n in xrange(g['nrot']):

        # rotate the scanning chain object
        k.rotate_image3d(q, g['sampler'], g['im_lsurf'],
                         self.rotations[n], g['lsurf'], d['im_center'])

        # perform the FFTs and calculate the clashing and interaction volume
        k.rfftn(q, g['lsurf'], g['ft_lsurf'])
        k.c_conj_multiply(q, g['ft_lsurf'], g['ft_rcore'], g['ft_clashvol'])
        k.irfftn(q, g['ft_clashvol'], g['clashvol'])

        k.c_conj_multiply(q, g['ft_lsurf'], g['ft_rsurf'], g['ft_intervol'])
        k.irfftn(q, g['ft_intervol'], g['intervol'])

        # determine at every position if the conformation is a proper complex
        k.touch(q, g['clashvol'], g['max_clash'],
                g['intervol'], g['min_interaction'], g['interspace'])

        if self.distance_restraints:
            k.fill(q, g['restspace'], 0)

            # determine the space that is consistent with a number of
            # distance restraints
            k.distance_restraint(q, g['restraints'], self.rotations[n], g['restspace'])

            # get the accessible interaction space also consistent with a
            # certain number of distance restraints
            k.multiply(q, g['restspace'], g['interspace'], g['access_interspace'])

        # calculate the total number of complexes, while taking into
        # account orientational/rotational bias
        tot_complexes += cl_array.sum(g['interspace'], dtype=np.float32) \
            * np.float32(self.weights[n])

        # take at every position in space the maximum number of consistent
        # restraints for later visualization
        cl_array.maximum(g['best_access_interspace'], g['access_interspace'],
                         g['best_access_interspace'])

        # calculate the number of accessable complexes consistent with
        # EXACTLY N distance restraints
        k.histogram(q, g['access_interspace'], g['subhists'],
                    self.weights[n], d['nrestraints'])

        # Count the violations of each restraint for all complexes
        # consistent with EXACTLY N restraints
        k.count_violations(q, g['restraints'], self.rotations[n],
                           g['access_interspace'], g['viol_counter'], self.weights[n])

        # inform user
        if _stdout.isatty():
            self._print_progress(n, g['nrot'], time0)

    # wait for calculations to finish
    self.queue.finish()

    # transfer the data from GPU to CPU
    # get the number of accessible complexes and reduce the subhistograms
    # to the final histogram
    access_complexes = g['subhists'].get().sum(axis=0)
    # account for the fact that we are counting the number of accessible
    # complexes consistent with EXACTLY N restraints
    access_complexes[0] = tot_complexes.get() - sum(access_complexes[1:])
    d['accessible_complexes'] = access_complexes
    d['accessible_interaction_space'] = g['best_access_interspace'].get()

    # get the violation submatrices and reduce it to the final violation matrix
    d['violations'] = g['viol_counter'].get().sum(axis=0)
def decode_OpenCL_belief_propagation(self, received_blocks, buffer_in=False, return_buffer=False):
    # Set up OpenCL
    if buffer_in:
        channel_values_buffer = received_blocks
    else:
        channel_values_buffer = cl_array.to_device(self.queue,
                                                   received_blocks.astype(np.float64))
    varnode_output_buffer = cl_array.empty(self.queue, received_blocks.shape,
                                           dtype=np.float64)

    self.send_prog(self.queue, received_blocks.shape, None,
                   channel_values_buffer.data,
                   self.inbox_memory_start_varnodes_buffer.data,
                   self.degree_varnode_nr_buffer.data,
                   self.target_memorycells_varnodes_buffer.data,
                   self.checknode_inbox_buffer.data)
    self.queue.finish()

    syndrome_zero = False
    i_num = 1

    while (i_num < self.imax) and (not syndrome_zero):
        local_size = None

        self.checknode_update_prog(self.queue,
                                   (self.degree_checknode_nr.shape[0],
                                    received_blocks[:, np.newaxis].shape[-1]), None,
                                   self.checknode_inbox_buffer.data,
                                   self.inbox_memory_start_checknodes_buffer.data,
                                   self.degree_checknode_nr_buffer.data,
                                   self.target_memorycells_checknodes_buffer.data,
                                   self.varnode_inbox_buffer.data)
        self.queue.finish()

        self.varnode_update_prog(self.queue, received_blocks.shape, None,
                                 channel_values_buffer.data,
                                 self.varnode_inbox_buffer.data,
                                 self.inbox_memory_start_varnodes_buffer.data,
                                 self.degree_varnode_nr_buffer.data,
                                 self.target_memorycells_varnodes_buffer.data,
                                 self.checknode_inbox_buffer.data)

        self.calc_syndrom_prog(self.queue,
                               (self.degree_checknode_nr.shape[0],
                                received_blocks[:, np.newaxis].shape[-1]), None,
                               self.checknode_inbox_buffer.data,
                               self.inbox_memory_start_checknodes_buffer.data,
                               self.degree_checknode_nr_buffer.data,
                               self.syndrom_buffer.data)

        if cl_array.sum(self.syndrom_buffer).get() == 0:
            syndrome_zero = True

        i_num += 1

    self.varoutput_prog(self.queue, received_blocks.shape, None,
                        channel_values_buffer.data,
                        self.varnode_inbox_buffer.data,
                        self.inbox_memory_start_varnodes_buffer.data,
                        self.degree_varnode_nr_buffer.data,
                        varnode_output_buffer.data)
    self.queue.finish()

    if return_buffer:
        return varnode_output_buffer
    else:
        output_values = varnode_output_buffer.get()
        return output_values
def sum_cl(queue, a, axis=None):
    """ Sum of GPUArray elements in a given axis direction or all elements.

    Parameters
    ----------
    queue
        PyOpenCL queue.
    a : gpuarray
        GPUArray with elements to be operated on.
    axis : int
        Axis direction to sum through, all if None.

    Returns
    -------
    gpuarray
        GPUArray sum.

    Notes
    -----
    - This is temporary and not an efficient implementation.

    """
    if axis is not None:
        m, n = a.shape
        kernel = cl.Program(queue.context, """
        __kernel void sum0_cl(__global float *a, __global float *b, unsigned m, unsigned n)
        {
            int bid = get_group_id(0);
            int tid = get_local_id(1);
            int id  = get_global_id(1) * n + get_global_id(0);
            int stride = 0;

            __local float sum[32000 / sizeof(float)];
            sum[tid] = a[id];
            sum[m] = 0.;

            for (stride = 1; stride < m; stride *= 2)
            {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (tid % (2 * stride) == 0)
                {
                    sum[tid] += sum[tid + stride];
                }
            }
            b[bid] = sum[0];
        }

        __kernel void sum1_cl(__global float *a, __global float *b, unsigned m, unsigned n)
        {
            int bid = get_group_id(1);
            int tid = get_local_id(0);
            int id  = get_global_id(1) * n + get_global_id(0);
            int stride = 0;

            __local float sum[32000 / sizeof(float)];
            sum[tid] = a[id];
            sum[n] = 0.;

            for (stride = 1; stride < n; stride *= 2)
            {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (tid % (2 * stride) == 0)
                {
                    sum[tid] += sum[tid + stride];
                }
            }
            b[bid] = sum[0];
        }
        """).build()

        if axis == 0:
            b = cl_array.empty(queue, (1, n), dtype=float32)
            kernel.sum0_cl(queue, (n, m), (1, m), a.data, b.data, uint32(m), uint32(n))
        elif axis == 1:
            b = cl_array.empty(queue, (m, 1), dtype=float32)
            kernel.sum1_cl(queue, (n, m), (n, 1), a.data, b.data, uint32(m), uint32(n))
        return b
    else:
        return cl_array.sum(a)
def _evaluate(self, valuation, cache):
    if id(self) not in cache:
        op = self.ops[0]._evaluate(valuation, cache)
        cache[id(self)] = clarray.sum(
            op, dtype=np.dtype('float32')) / np.float32(op.size)
    return cache[id(self)]
def spec(filename, extra):
    cuantas = extra[0]
    OPEN_IMAGE = extra[1]
    if(OPEN_IMAGE == True):
        a = Image.open(filename)
        Nx, Ny = a.size
    else:  # np array
        a = filename
        #Nx, Ny = a.shape
        Nx, Ny = a.size

    L = Nx * Ny

    points = []  # number of elements in the structure
    RESHAPE = extra[3]
    CONVERT = extra[2]

    if(CONVERT == True):
        #gray = a.convert('L')  # rgb 2 gray
        #arr = np.array(gray.getdata()).astype(np.int32)
        arr = np.array(filename.getdata()).astype(np.int32)
    else:
        if(RESHAPE == True):  # ARGHH
            arr = np.array(a).reshape(a.shape[0] * a.shape[1])
        else:
            arr = a

    alphaIm = np.zeros((Nx, Ny), dtype=np.float32)  # Nx rows x Ny columns

    l = 4  # (maximum window size - 1) / 2
    temp = map(lambda i: 2 * i + 1, range(l))
    temp = np.log(temp)
    measure = np.zeros(l * Ny).astype(np.int32)

    b = np.vstack((temp, np.ones((1, l)))).T
    AA = coo_matrix(np.kron(np.identity(Ny), b))

    prg = cl.Program(ctx, """
    __kernel void measure(__global float *alphaIm, __global int *img,
                          const int Nx, const int Ny, const int size)
    {
        int i = get_global_id(0);
        int j = get_global_id(1);

        // make histogram of region
        int hist[256];
        int t;
        for(t = 0; t < 256; t++)
            hist[t] = 0;

        int xi = max(i-size,0);
        int yi = max(j-size,0);
        int xf = min(i+size,Nx-1);
        int yf = min(j+size,Ny-1);

        int u, v;
        for(int u = xi; u <= xf; u++)
            for(int v = yi; v <= yf; v++)
                hist[img[u*Ny+v]]++;

        float res = 0;
        int s;
        float total = (yf-yi)*(xf-xi); // size of region
        for(s = 0; s <= 255; s++) {
            float v = hist[s]/total; // probability
            res += v*log2(v+0.0001);
        }
        alphaIm[i*Ny+j] = res;
    }
    """).build()

    #d = measure.shape[0]/2
    #ms = measure[0:l*d]

    img_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=arr)
    alphaIm_buf = cl.Buffer(ctx, mf.WRITE_ONLY, alphaIm.nbytes)
    sh = alphaIm.shape

    size = 8  # Window size
    prg.measure(queue, sh, None, alphaIm_buf, img_buf,
                np.int32(Nx), np.int32(Ny), np.int32(size))
    cl.enqueue_read_buffer(queue, alphaIm_buf, alphaIm).wait()

    maxim = np.max(alphaIm)
    minim = np.min(alphaIm)
    #print maxim, minim

    import matplotlib
    from matplotlib import pyplot as plt
    # Alpha image
    #plt.imshow(alphaIm, cmap=matplotlib.cm.gray)
    #plt.show()
    #return

    paso = (maxim - minim) / cuantas
    if(paso <= 0):
        # the alpha image is monofractal
        clases = np.array(map(lambda i: i + minim, np.zeros(cuantas))).astype(np.float32)
    else:
        clases = np.arange(minim, maxim, paso).astype(np.float32)

    # Window
    cant = int(np.floor(np.log(Nx)))

    # concatenate the image A as [[A,A],[A,A]]
    hs = np.hstack((alphaIm, alphaIm))
    alphaIm = np.vstack((hs, hs))

    prg = cl.Program(ctx, """
    __kernel void krnl(__global int *flag, __global float *clases,
                       __global float *alphaIm, const int sizeBlocks, const int Ny,
                       const int numBlocks_y, const int c, const int cuantas,
                       float minim, float maxim)
    {
        int i = get_global_id(0);
        int j = get_global_id(1);

        int xi = i*sizeBlocks;
        int xf = (i+1)*sizeBlocks-1;
        int yi = j*sizeBlocks;
        int yf = (j+1)*sizeBlocks-1;

        // calculate max and min for this subregion
        float maxx;
        float minn;
        int w, t;
        int first = 0;
        for(w = xi; w < xf; w++) {
            for(t = yi; t < yf; t++) {
                float v = alphaIm[w*Ny*2 + t];
                if (v >= clases[c] and v <= clases[c+1]) {
                    if(!first) {
                        first = 1;
                        maxx = minn = v;
                    }
                    if(v > maxx) maxx = v;
                    if(v < minn) minn = v;
                }
            }
        }

        float totalDif = maxim - minim;
        int nB = numBlocks_y; // num of subdivisions in the Z coordinate
        int l = floor(((maxx-minim)/totalDif)*nB)+1;
        int k = floor(((minn-minim)/totalDif)*nB)+1;

        flag[i*numBlocks_y + j] = l-k+1;
    }
    """).build()

    # Multifractal dimensions
    falpha = np.zeros(cuantas).astype(np.float32)

    clases_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=clases.astype(np.float32))
    alphaIm_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=alphaIm.astype(np.float32))

    for c in range(cuantas):
        N = np.zeros(cant + 1)
        # window sizes
        for k in range(1, cant + 2):
            sizeBlocks = 2 * k + 1
            numBlocks_x = int(np.floor(Nx / sizeBlocks))
            numBlocks_y = int(np.floor(Ny / sizeBlocks))

            flag = np.zeros((numBlocks_x, numBlocks_y)).astype(np.int32)
            flag_buf = cl.Buffer(ctx, mf.WRITE_ONLY, flag.nbytes)
            sh = flag.shape

            prg.krnl(queue, sh, None, flag_buf, clases_buf, alphaIm_buf,
                     np.int32(sizeBlocks), np.int32(Ny), np.int32(numBlocks_y),
                     np.int32(c), np.int32(cuantas), np.float32(minim), np.float32(maxim))
            cl.enqueue_read_buffer(queue, flag_buf, flag).wait()

            N[k - 1] = cla.sum(cla.to_device(queue, flag)).get()

        #print N
        # Hausdorff (box) dimension of the alpha distribution
        falpha[c] = -np.polyfit(map(lambda i: np.log((2 * i + 1)), range(1, cant + 2)),
                                np.log(map(lambda i: i + 1, N)), 1)[0]

    s = np.hstack((clases, falpha))
    return s
def one_iteration(self, iteration):
    # iterate through one cycle of the simulation.
    assert self.can_has_domains, "no domains set!"
    assert self.can_has_envelope, "no goal envelope set!"

    # first, copy the current state of the domain pattern to a holding buffer ("incoming")
    self.copy(self.domains, self.incoming)

    # now find the domain walls. modifications to the domain pattern due to
    # rescaling only take place in the walls
    self.findwalls.execute(self.queue, (self.N, self.N), self.domains.data,
                           self.allwalls.data, self.poswalls.data, self.negwalls.data,
                           np.int32(self.N))
    if 'walls1' in self.returnables_list: self.returnables['walls1'] = self.allwalls.get()
    if 'pos_walls1' in self.returnables_list: self.returnables['pos_walls1'] = self.poswalls.get()
    if 'neg_walls1' in self.returnables_list: self.returnables['neg_walls1'] = self.negwalls.get()

    # run the ising bias
    self.ising(self.domains, self.alpha)

    # rescale the domains. this operates on the class variables so no arguments are
    # passed. self.domains stores the rescaled real-valued domains. the rescaled
    # domains are bounded to the range +1 -1.
    self._rescale_speckle()
    self.bound(self.domains, self.domains)
    if 'bounded' in self.returnables_list: self.returnables['bounded'] = self.domains.get()

    # if making an ordering island, this is the command that enforces the border condition
    #if use_boundary and n > boundary_turn_on: self.enforce_boundary(self.domains, self.boundary, self.boundary_values)

    # so now we have self.incoming (old domains) and self.domains (rescaled domains).
    # we want to use self.walls to enforce changes to the domain pattern from rescaling
    # only within the walls. because updating can change wall location, also refind the walls.
    if self.only_walls:
        self.update_domains(self.domains, self.incoming, self.allwalls)
        self.findwalls.execute(self.queue, (self.N, self.N), self.domains.data,
                               self.allwalls.data, self.poswalls.data, self.negwalls.data,
                               np.int32(self.N))

    if iteration > self.m_turnon:
        self.findwalls.execute(self.queue, (self.N, self.N), self.domains.data,
                               self.allwalls.data, self.poswalls.data, self.negwalls.data,
                               np.int32(self.N))
        if 'walls2' in self.returnables_list: self.returnables['walls2'] = self.allwalls.get()
        if 'pos_walls2' in self.returnables_list: self.returnables['pos_walls2'] = self.poswalls.get()
        if 'neg_walls2' in self.returnables_list: self.returnables['neg_walls2'] = self.negwalls.get()

        # now attempt to adjust the net magnetization in real space to achieve
        # the target magnetization.
        net_m = cla.sum(self.domains).get()
        needed_m = self.goal_m - net_m

        if needed_m > 0:
            self.make_available1(self.available, self.negwalls, self.negpins, self.pospins)
            sites = cla.sum(self.available).get()
            spa = min([self.spa_max, needed_m / sites])

        if needed_m < 0:
            self.make_available1(self.available, self.poswalls, self.negpins, self.pospins)
            sites = cla.sum(self.available).get()
            spa = max([-1 * self.spa_max, needed_m / sites])

        self.promote_spins(self.domains, self.available, spa)
        if 'promoted' in self.returnables_list: self.returnables['promoted'] = self.domains.get()

        self.bound(self.domains, self.domains)

    if 'domains' in self.returnables_list: self.returnables['domains'] = self.domains.get()
# ge = p.globalError(4,4,x)
# print ge[0]
#
# a = []
#
# print numpy.sum(x[0])
#
# print time.clock() - start

import time

import pyopencl as cl  # Import the OpenCL GPU computing API
import pyopencl.array as pycl_array  # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object)
import numpy as np  # Import Numpy number tools

context = cl.create_some_context()  # Initialize the Context
queue = cl.CommandQueue(context)  # Instantiate a Queue

x = pycl_array.to_device(queue, np.random.rand(3920, 100).astype(np.float32))
# a = pycl_array.to_device(queue, np.random.rand(50000).astype(np.float32))
# b = pycl_array.to_device(queue, np.random.rand(50000).astype(np.float32))
y = np.random.rand(3920, 100).astype(np.float32)  # Create two random pyopencl arrays
# c = pycl_array.empty_like(a)  # Create an empty pyopencl destination array

start = time.clock()
d = []
for i in range(3920):
    d.append(pycl_array.sum(x[i]))
    # d = numpy.sum(y[i])

print(time.clock() - start)

# print("x: {}".format(x))
# print("d: {}".format(d))
# Print all three arrays, to show sum() worked
def decode_OpenCL(self, received_blocks, buffer_in=False, return_buffer=False):
    # Set up OpenCL
    if buffer_in:
        channel_values_buffer = received_blocks
    else:
        channel_values_buffer = cl_array.to_device(self.queue,
                                                   received_blocks.astype(np.int32))
    varnode_output_buffer = cl_array.empty(self.queue, received_blocks.shape,
                                           dtype=np.int32)

    self.send_prog(self.queue, received_blocks.shape, None,
                   channel_values_buffer.data,
                   self.inbox_memory_start_varnodes_buffer.data,
                   self.degree_varnode_nr_buffer.data,
                   self.target_memorycells_varnodes_buffer.data,
                   self.checknode_inbox_buffer.data)
    #self.queue.finish()

    self.first_iter_prog(self.queue,
                         (self.degree_checknode_nr.shape[0],
                          received_blocks[:, np.newaxis].shape[-1]), None,
                         self.checknode_inbox_buffer.data,
                         self.inbox_memory_start_checknodes_buffer.data,
                         self.degree_checknode_nr_buffer.data,
                         self.target_memorycells_checknodes_buffer.data,
                         self.varnode_inbox_buffer.data,
                         self.cardinality_T_channel,
                         self.cardinality_T_decoder_ops,
                         self.Trellis_checknode_vector_a_buffer.data)

    syndrome_zero = False
    i_num = 1

    while (i_num < self.imax) and (not syndrome_zero):
        local_size = None  # (1000, 1)

        self.varnode_update_prog(self.queue, received_blocks.shape, local_size,
                                 channel_values_buffer.data,
                                 self.varnode_inbox_buffer.data,
                                 self.inbox_memory_start_varnodes_buffer.data,
                                 self.degree_varnode_nr_buffer.data,
                                 self.target_memorycells_varnodes_buffer.data,
                                 self.checknode_inbox_buffer.data,
                                 self.cardinality_T_channel,
                                 self.cardinality_T_decoder_ops,
                                 i_num - 1,
                                 self.Trellis_varnode_vector_a_buffer.data)
        #self.queue.finish()

        self.checknode_update_prog(self.queue,
                                   (self.degree_checknode_nr.shape[0],
                                    received_blocks[:, np.newaxis].shape[-1]), None,
                                   self.checknode_inbox_buffer.data,
                                   self.inbox_memory_start_checknodes_buffer.data,
                                   self.degree_checknode_nr_buffer.data,
                                   self.target_memorycells_checknodes_buffer.data,
                                   self.varnode_inbox_buffer.data,
                                   self.cardinality_T_channel,
                                   self.cardinality_T_decoder_ops,
                                   i_num - 1,
                                   self.Trellis_checknode_vector_a_buffer.data)
        #self.queue.finish()

        self.calc_syndrom_prog(self.queue,
                               (self.degree_checknode_nr.shape[0],
                                received_blocks[:, np.newaxis].shape[-1]), None,
                               self.checknode_inbox_buffer.data,
                               self.inbox_memory_start_checknodes_buffer.data,
                               self.degree_checknode_nr_buffer.data,
                               self.cardinality_T_decoder_ops,
                               self.syndrom_buffer.data)
        #self.queue.finish()

        if cl_array.sum(self.syndrom_buffer).get() == 0:
            syndrome_zero = True

        i_num += 1

    self.varoutput_prog(self.queue, received_blocks.shape, None,
                        channel_values_buffer.data,
                        self.varnode_inbox_buffer.data,
                        self.inbox_memory_start_varnodes_buffer.data,
                        self.degree_varnode_nr_buffer.data,
                        self.cardinality_T_channel,
                        self.cardinality_T_decoder_ops,
                        i_num - 1,
                        self.Trellis_varnode_vector_a_buffer.data,
                        varnode_output_buffer.data)
    self.queue.finish()

    if return_buffer:
        return varnode_output_buffer
    else:
        output_values = varnode_output_buffer.get()
        return output_values
def _gpu_init(self):
    q = self.queue

    # Move arrays to GPU
    self._cl_rcore = cl_array.to_device(q, self._rcore.astype(np.float32))
    self._cl_rsurf = cl_array.to_device(q, self._rsurf.astype(np.float32))
    self._cl_lcore = cl_array.to_device(q, self._lcore.astype(np.float32))

    # Make the rotations float16 arrays
    self._cl_rotations = np.zeros((self.rotations.shape[0], 16), dtype=np.float32)
    self._cl_rotations[:, :9] = self.rotations.reshape(-1, 9)

    # Allocate arrays
    # Float32
    self._cl_shape = tuple(self._shape)
    arr_names = 'rot_lcore clashvol intervol tmp'.split()
    for arr_name in arr_names:
        setattr(self, '_cl_' + arr_name,
                cl_array.zeros(q, self._cl_shape, dtype=np.float32))

    # Int32
    arr_names = 'interspace red_interspace restspace access_interspace'.split()
    for arr_name in arr_names:
        setattr(self, '_cl_' + arr_name,
                cl_array.zeros(q, self._cl_shape, dtype=np.int32))

    # Boolean
    arr_names = 'not_clashing interacting'.split()
    for arr_name in arr_names:
        setattr(self, '_cl_' + arr_name,
                cl_array.zeros(q, self._cl_shape, dtype=np.int32))

    # Complex64
    self._ft_shape = tuple([self._shape[0] // 2 + 1] + list(self._shape)[1:])
    arr_names = 'lcore lcore_conj rcore rsurf tmp'.split()
    for arr_name in arr_names:
        setattr(self, '_cl_ft_' + arr_name,
                cl_array.empty(q, self._ft_shape, dtype=np.complex64))

    # Restraints arrays
    self._cl_rrestraints = np.zeros((self._nrestraints, 4), dtype=np.float32)
    self._cl_rrestraints[:, :3] = self._rrestraints
    self._cl_rrestraints = cl_array.to_device(q, self._cl_rrestraints)
    self._cl_lrestraints = np.zeros((self._nrestraints, 4), dtype=np.float32)
    self._cl_lrestraints[:, :3] = self._lrestraints
    self._cl_lrestraints = cl_array.to_device(q, self._cl_lrestraints)
    self._cl_mindis = cl_array.to_device(q, self._mindis.astype(np.float32))
    self._cl_maxdis = cl_array.to_device(q, self._maxdis.astype(np.float32))
    self._cl_mindis2 = cl_array.to_device(q, self._mindis.astype(np.float32) ** 2)
    self._cl_maxdis2 = cl_array.to_device(q, self._maxdis.astype(np.float32) ** 2)
    self._cl_rot_lrestraints = cl_array.zeros_like(self._cl_rrestraints)
    self._cl_restraints_center = cl_array.zeros_like(self._cl_rrestraints)

    # kernels
    self._kernel_constants = {'interaction_cutoff': 10,
                              'nrestraints': self._nrestraints,
                              'shape_x': self._shape[2],
                              'shape_y': self._shape[1],
                              'shape_z': self._shape[0],
                              'llength': self._llength,
                              'nreceptor_coor': 0,
                              'nligand_coor': 0,
                              }

    # Counting arrays
    self._cl_hist = cl_array.zeros(self.queue, self._nrestraints, dtype=np.int32)
    self._cl_consistent_complexes = cl_array.zeros(self.queue, self._nrestraints,
                                                   dtype=np.float32)
    self._cl_viol_hist = cl_array.zeros(self.queue,
                                        (self._nrestraints, self._nrestraints),
                                        dtype=np.int32)
    self._cl_violations = cl_array.zeros(self.queue,
                                         (self._nrestraints, self._nrestraints),
                                         dtype=np.float32)

    # Conversions
    self._cl_grid_max_clash = np.float32(self._grid_max_clash)
    self._cl_grid_min_interaction = np.float32(self._grid_min_interaction)
    self._CL_ZERO = np.int32(0)

    # Occupancy analysis
    self._cl_occ_grid = {}
    if self.occupancy_analysis:
        for i in xrange(self.interaction_restraints_cutoff, self._nrestraints + 1):
            self._cl_occ_grid[i] = cl_array.zeros(self.queue, self._cl_shape,
                                                  dtype=np.float32)

    # Interaction analysis
    if self._interaction_analysis:
        shape = (self._lselect.shape[0], self._rselect.shape[0])
        self._cl_interaction_hist = cl_array.zeros(self.queue, shape, dtype=np.int32)
        self._cl_interaction_matrix = {}
        for i in xrange(self._nrestraints + 1 - self.interaction_restraints_cutoff):
            self._cl_interaction_matrix[i] = cl_array.zeros(self.queue, shape,
                                                            dtype=np.float32)

    # Coordinate arrays
    self._cl_rselect = np.zeros((self._rselect.shape[0], 4), dtype=np.float32)
    self._cl_rselect[:, :3] = self._rselect
    self._cl_rselect = cl_array.to_device(q, self._cl_rselect)
    self._cl_lselect = np.zeros((self._lselect.shape[0], 4), dtype=np.float32)
    self._cl_lselect[:, :3] = self._lselect
    self._cl_lselect = cl_array.to_device(q, self._cl_lselect)
    self._cl_rot_lselect = cl_array.zeros_like(self._cl_lselect)

    # Update kernel constants
    self._kernel_constants['nreceptor_coor'] = self._cl_rselect.shape[0]
    self._kernel_constants['nligand_coor'] = self._cl_lselect.shape[0]

    self._cl_kernels = Kernels(q.context, self._kernel_constants)
    self._cl_rfftn = pyclfft.RFFTn(q.context, self._shape)
    self._cl_irfftn = pyclfft.iRFFTn(q.context, self._shape)

    # Initial calculations
    self._cl_rfftn(q, self._cl_rcore, self._cl_ft_rcore)
    self._cl_rfftn(q, self._cl_rsurf, self._cl_ft_rsurf)

    self._cl_tot_complex = cl_array.sum(self._cl_interspace, dtype=np.dtype(np.float32))
def _sum_OCL(a, dtype=None, queue=None, slice=None):
    return cl_array.sum(a=a, dtype=dtype, queue=queue, slice=slice).get(queue=queue)
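# Hedged usage sketch for the wrapper above: recent PyOpenCL versions accept a
# slice keyword in pyopencl.array.sum, so a sub-range can be reduced without
# materializing the sliced array first. Assumes a default OpenCL device.
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

a_gpu = cl_array.arange(queue, 0, 10, 1, dtype=np.float32)
partial = cl_array.sum(a_gpu, queue=queue, slice=slice(2, 8)).get()  # 2+3+...+7 == 27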
def find_contacts(self, predict=True):
    """Call the find_contacts kernel.

    Assumes that cell_centers, cell_dirs, cell_lens, cell_rads, cell_sqs,
    cell_dcenters, cell_dlens, cell_dangs, sorted_ids, and sq_inds are current
    on the device.

    Calculates cell_n_cts, ct_frs, ct_tos, ct_dists, ct_pts, ct_norms,
    ct_reldists, and n_cts.
    """
    if predict:
        centers = self.pred_cell_centers_dev
        dirs = self.pred_cell_dirs_dev
        lens = self.pred_cell_lens_dev
    else:
        centers = self.cell_centers_dev
        dirs = self.cell_dirs_dev
        lens = self.cell_lens_dev

    self.program.find_plane_contacts(
        self.queue, (self.n_cells,), None,
        numpy.int32(self.max_cells), numpy.int32(self.max_contacts),
        numpy.int32(self.n_planes),
        self.plane_pts_dev.data, self.plane_norms_dev.data, self.plane_coeffs_dev.data,
        centers.data, dirs.data, lens.data, self.cell_rads_dev.data,
        self.cell_n_cts_dev.data, self.ct_frs_dev.data, self.ct_tos_dev.data,
        self.ct_dists_dev.data, self.ct_pts_dev.data, self.ct_norms_dev.data,
        self.ct_reldists_dev.data, self.ct_stiff_dev.data).wait()

    self.program.find_contacts(
        self.queue, (self.n_cells,), None,
        numpy.int32(self.max_cells), numpy.int32(self.n_cells),
        numpy.int32(self.grid_x_min), numpy.int32(self.grid_x_max),
        numpy.int32(self.grid_y_min), numpy.int32(self.grid_y_max),
        numpy.int32(self.n_sqs), numpy.int32(self.max_contacts),
        centers.data, dirs.data, lens.data, self.cell_rads_dev.data,
        self.cell_sqs_dev.data, self.sorted_ids_dev.data, self.sq_inds_dev.data,
        self.cell_n_cts_dev.data, self.ct_frs_dev.data, self.ct_tos_dev.data,
        self.ct_dists_dev.data, self.ct_pts_dev.data, self.ct_norms_dev.data,
        self.ct_reldists_dev.data, self.ct_stiff_dev.data,
        self.ct_overlap_dev.data).wait()

    # set dtype to int32 so we don't overflow the int32 when summing
    #self.n_cts = self.cell_n_cts_dev.get().sum(dtype=numpy.int32)
    self.n_cts = cl_array.sum(self.cell_n_cts_dev).get()
def test_spectral_poisson(ctx_factory, grid_shape, proc_shape, h, dtype, timing=False):
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)
    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (3, 5, 7)
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))
    dk = tuple(2 * np.pi / Li for Li in L)

    if h == 0:
        def get_evals_2(k, dx):
            return - k**2

        derivs = ps.SpectralCollocator(fft, dk)
    else:
        from pystella.derivs import SecondCenteredDifference
        get_evals_2 = SecondCenteredDifference(h).get_eigenvalues
        derivs = ps.FiniteDifferencer(mpi, h, dx, stream=False)

    solver = ps.SpectralPoissonSolver(fft, dk, dx, get_evals_2)

    pencil_shape = tuple(ni + 2*h for ni in rank_shape)

    statistics = ps.FieldStatistics(mpi, 0, rank_shape=rank_shape,
                                    grid_size=np.product(grid_shape))

    fx = cla.empty(queue, pencil_shape, dtype)
    rho = clr.rand(queue, rank_shape, dtype)
    rho -= statistics(rho)["mean"]
    lap = cla.empty(queue, rank_shape, dtype)
    rho_h = rho.get()

    for m_squared in (0, 1.2, 19.2):
        solver(queue, fx, rho, m_squared=m_squared)
        fx_h = fx.get()
        if h > 0:
            fx_h = fx_h[h:-h, h:-h, h:-h]

        derivs(queue, fx=fx, lap=lap)

        diff = np.fabs(lap.get() - rho_h - m_squared * fx_h)
        max_err = np.max(diff) / cla.max(clm.fabs(rho))
        avg_err = np.sum(diff) / cla.sum(clm.fabs(rho))

        max_rtol = 1e-12 if dtype == np.float64 else 1e-4
        avg_rtol = 1e-13 if dtype == np.float64 else 1e-5

        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"solution inaccurate for {h=}, {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        time = timer(lambda: solver(queue, fx, rho, m_squared=m_squared), ntime=10)

        if mpi.rank == 0:
            print(f"poisson took {time:.3f} ms for {grid_shape=}, {proc_shape=}")
def pyopencl_mean(x_gpu_in):
    return cl_array.sum(x_gpu_in) / float(x_gpu_in.size)
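# Note on the helper above: cl_array.sum returns a 0-d device array, so the
# mean it computes still lives on the GPU until .get() is called. Hedged
# standalone sketch, assuming a default OpenCL device.
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

x_gpu_in = cl_array.to_device(queue, np.random.rand(1000).astype(np.float32))
mean_dev = cl_array.sum(x_gpu_in) / float(x_gpu_in.size)  # still a device array
mean_host = mean_dev.get()                                # NumPy value on the host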
def spec(filename, extra):
    cuantas = extra[0]
    OPEN_IMAGE = extra[1]
    if(OPEN_IMAGE == True):
        a = Image.open(filename)
        Nx, Ny = a.size
    else:  # np array
        a = filename
        #Nx, Ny = a.shape
        Nx, Ny = a.size

    L = Nx * Ny

    points = []  # number of elements in the structure
    RESHAPE = extra[3]
    CONVERT = extra[2]

    if(CONVERT == True):
        gray = a.convert('L')  # rgb 2 gray
        arr = np.array(gray.getdata()).astype(np.int32)
    else:
        if(RESHAPE == True):  # ARGHH
            arr = np.array(a).reshape(a.shape[0] * a.shape[1])
        else:
            arr = a

    alphaIm = np.zeros((Nx, Ny), dtype=np.double)  # Nx rows x Ny columns

    l = 4  # (maximum window size - 1) / 2
    temp = map(lambda i: 2 * i + 1, range(l))
    temp = np.log(temp)
    measure = np.zeros(l * Ny).astype(np.int32)

    b = np.vstack((temp, np.ones((1, l)))).T
    AA = coo_matrix(np.kron(np.identity(Ny), b))

    # which: which measure to take
    which = extra[4]

    prg = cl.Program(ctx, """
    int maxx(__global int *img, int x1, int y1, int x2, int y2, const int Ny) {
        int i, j;
        int maxim = 0;
        for(i = x1; i < x2; i++)
            for(j = y1; j < y2; j++)
                if(img[i*Ny + j] > maxim) maxim = img[i*Ny + j];
        return maxim;
    }
    int minn(__global int *img, int x1, int y1, int x2, int y2, const int Ny) {
        int i, j;
        int minim = 255;
        for(i = x1; i < x2; i++)
            for(j = y1; j < y2; j++)
                if(img[i*Ny + j] < minim) minim = img[i*Ny + j];
        return minim;
    }
    int summ(__global int *img, int x1, int y1, int x2, int y2, const int Ny) {
        int i, j;
        int summ = 0;
        for(i = x1; i < x2; i++)
            for(j = y1; j < y2; j++)
                summ += img[i*Ny + j];
        return summ;
    }
    int iso(__global int *img, int x1, int y1, int x2, int y2, const int Ny,
            const int x, const int y) {
        int i, j;
        int cant = 0;
        for(i = x1; i < x2; i++)
            for(j = y1; j < y2; j++)
                if(img[i*Ny + j] == img[x*Ny + y]) cant++;
        return cant;
    }

    __kernel void measure(__global int *dest, __global int *img, const int Nx,
                          const int Ny, const int l, int i, const int d,
                          const int which) {
        int j = get_global_id(0);
        int jim = (int)(j/l)+d;
        if(which == 0)
            dest[j] = maxx(img, max(i-((j%l)+1),0), max(jim-((j%l)+1),0),
                           min(i+(j%l)+1,Nx-1), min(jim+(j%l)+1,Ny-1), Ny) + 1;
        if(which == 1)
            dest[j] = minn(img, max(i-((j%l)+1),0), max(jim-((j%l)+1),0),
                           min(i+(j%l)+1,Nx-1), min(jim+(j%l)+1,Ny-1), Ny) + 1;
        if(which == 2)
            dest[j] = summ(img, max(i-((j%l)+1),0), max(jim-((j%l)+1),0),
                           min(i+(j%l)+1,Nx-1), min(jim+(j%l)+1,Ny-1), Ny) + 1;
        if(which == 3)
            dest[j] = iso(img, max(i-((j%l)+1),0), max(jim-((j%l)+1),0),
                          min(i+(j%l)+1,Nx-1), min(jim+(j%l)+1,Ny-1), Ny, i, j) + 1;
    }
    """).build()

    d = measure.shape[0]/2
    ms = measure[0:l*d]

    img_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=arr)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, ms.nbytes)
    sh = ms.shape

    for i in range(Nx):
        prg.measure(queue, sh, None, dest_buf, img_buf, np.int32(Nx), np.int32(Ny),
                    np.int32(l), np.int32(i), np.int32(0), np.int32(which))
        cl.enqueue_read_buffer(queue, dest_buf, measure[0:l*d]).wait()
        prg.measure(queue, sh, None, dest_buf, img_buf, np.int32(Nx), np.int32(Ny),
                    np.int32(l), np.int32(i), np.int32(d), np.int32(which))
        cl.enqueue_read_buffer(queue, dest_buf, measure[l*d:]).wait()

        # Instead of doing polyfits, a sparse linear system is constructed and solved
        bb = np.log(measure)
        z = linsolve.lsqr(AA, bb)[0]
        z = z.reshape(2, Ny, order='F')
        alphaIm[i] = z[0]

    maxim = np.max(alphaIm)
    minim = np.min(alphaIm)

    import matplotlib
    from matplotlib import pyplot as plt
    # Alpha image
    #plt.imshow(alphaIm, cmap=matplotlib.cm.gray)
    #plt.show()
    #return

    paso = (maxim - minim) / cuantas
    if(paso <= 0):
        # the alpha image is monofractal
        clases = np.array(map(lambda i: i + minim, np.zeros(cuantas))).astype(np.float32)
    else:
        clases = np.arange(minim, maxim, paso).astype(np.float32)

    # Window
    cant = int(np.floor(np.log(Nx)))

    # concatenate the image A as [[A,A],[A,A]]
    hs = np.hstack((alphaIm, alphaIm))
    alphaIm = np.vstack((hs, hs))

    prg = cl.Program(ctx, """
    __kernel void krnl(__global int *flag, __global float *clases,
                       __global float *alphaIm, const int sizeBlocks, const int Ny,
                       const int numBlocks_y, const int c, const int cuantas) {
        int i = get_global_id(0);
        int j = get_global_id(1);

        int xi = i*sizeBlocks;
        int xf = (i+1)*sizeBlocks-1;
        int yi = j*sizeBlocks;
        int yf = (j+1)*sizeBlocks-1;
        if(xf == xi) xf = xf+1;
        if(yf == yi) yf = yf+1;

        int f = 0;
        int s1 = xf-xi;
        int s2 = yf-yi;

        if(c != cuantas-1) {
            // f = 1 if any pixel in block is between clases[c] and clases[c+1]
            int w, t;
            for(w = xi; w < xf; w++) {
                for(t = yi; t < yf; t++) {
                    float b = alphaIm[w*Ny*2 + t];
                    if (b >= clases[c] and b < clases[c+1]) {
                        f = 1;
                        break;
                    }
                }
                if(f == 1) break;
            }
        } else {
            // f = 1 if any pixel in block is equal to clases[c]
            int w, t;
            for(w = xi; w < xf; w++) {
                for(t = yi; t < yf; t++) {
                    float b = alphaIm[w*Ny*2 + t];
                    if (b == clases[c]) { // !!
                        f = 1;
                        break;
                    }
                }
                if(f == 1) break;
            }
        }
        flag[i*numBlocks_y + j] = f;
    }
    """).build()

    # Multifractal dimensions
    falpha = np.zeros(cuantas).astype(np.float32)

    clases_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=clases.astype(np.float32))
    alphaIm_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=alphaIm.astype(np.float32))

    for c in range(cuantas):
        N = np.zeros(cant + 1)
        # window sizes
        for k in range(cant + 1):
            sizeBlocks = 2 * k + 1
            numBlocks_x = int(np.ceil(Nx / sizeBlocks))
            numBlocks_y = int(np.ceil(Ny / sizeBlocks))

            flag = np.zeros((numBlocks_x, numBlocks_y)).astype(np.int32)
            flag_buf = cl.Buffer(ctx, mf.WRITE_ONLY, flag.nbytes)
            sh = flag.shape

            prg.krnl(queue, sh, None, flag_buf, clases_buf, alphaIm_buf,
                     np.int32(sizeBlocks), np.int32(Ny), np.int32(numBlocks_y),
                     np.int32(c), np.int32(cuantas))
            cl.enqueue_read_buffer(queue, flag_buf, flag).wait()

            N[k] = cla.sum(cla.to_device(queue, flag)).get()

        # Hausdorff (box) dimension of the alpha distribution
        falpha[c] = -np.polyfit(map(lambda i: np.log(i*2+1), range(cant+1)),
                                np.log(map(lambda i: i+1, N)), 1)[0]

    s = np.hstack((clases, falpha))
    return s
def sum(self, a, dtype=None):
    import pyopencl.array as cl_array
    # .get() returns a 0-d array; indexing with [()] turns it into a scalar
    return cl_array.sum(
        a, dtype=dtype, queue=self._array_context.queue).get()[()]
def ggr_iteration(self, iteration):
    assert self.can_has_domains, "no domains set!"
    assert self.can_has_envelope, "no goal envelope set!"
    assert self.can_has_ggr, "must set ggr before running ggr_iteration"

    # get the target net magnetization
    net_m = cla.sum(self.domains).get()
    self.goal_m = self.plan[iteration + 1]
    needed_m = self.goal_m * self.N2 - net_m
    self.target = np.sign(needed_m).astype(np.float32)

    # copy the current domain pattern to self.incoming
    self.copy(self.domains, self.incoming)

    # find the domain walls. these get used in self.make_available. make the
    # correct sites available for modification
    self.findwalls.execute(self.queue, (self.N, self.N), self.domains.data,
                           self.allwalls.data, self.poswalls.data, self.negwalls.data,
                           np.int32(self.N))
    self.make_available1(self.available, self.allwalls, self.negpins, self.pospins)
    #if net_m > self.goal_m: self.make_available1(self.available, self.poswalls, self.negpins, self.pospins)
    #if net_m < self.goal_m: self.make_available1(self.available, self.negwalls, self.negpins, self.pospins)

    # run the ising bias
    self.ising(self.domains, self.alpha)

    # rescale the domains. this operates on the class variables so no arguments
    # are passed. self.domains stores the rescaled real-valued domains.
    # the rescaled domains are bounded to the range +1 -1.
    # change in the domain pattern is allowed to happen only in the walls.
    # enforce the recency condition to prevent domain splittings (basically,
    # make it hard to revert changes from long ago)
    self._rescale_speckle()
    self.bound(self.domains, self.domains)
    self.only_in_walls(self.domains, self.incoming, self.available)
    self.recency.execute(self.queue, (self.N2,),
                         self.whenflipped.data, self.domains.data, self.incoming.data,
                         self.recency_need.data, self.target, np.int32(iteration)).wait()

    # since the domains have been updated, refind the walls
    self.findwalls.execute(self.queue, (self.N, self.N), self.domains.data,
                           self.allwalls.data, self.poswalls.data, self.negwalls.data,
                           np.int32(self.N))

    # now adjust the magnetization so that it reaches the target
    net_m = cla.sum(self.domains).get()
    needed_m = self.goal_m * self.N2 - net_m
    self.target = np.sign(needed_m).astype(np.float32)
    if net_m > 0: self.make_available2(self.available, self.poswalls, self.domains, self.target)
    if net_m < 0: self.make_available2(self.available, self.negwalls, self.domains, self.target)

    # now we need to run an optimizer to find the correct value for spa. this should
    # result in an update to the class variable self.optimized_spa. optimized_spa is a
    # class variable because spa changes slowly so using the old value as the starting
    # point in the optimization gives a speed up.
    opt_out = fminbound(self._ggr_spa_error, -1, 1, full_output=1)
    self.optimized_spa = opt_out[0]

    # use the optimized spa value to actually promote the spins in self.domains
    self.ggr_promote_spins(self.domains, self.available, self.domains,
                           self.target * self.optimized_spa)
    self.bound(self.domains, self.domains)
    m_out = (cla.sum(self.domains).get()) / self.N2

    # update the whenflipped data, which records the iteration when each pixel changed sign
    self.update_whenflipped.execute(self.queue, (self.N2,), self.whenflipped.data,
                                    self.domains.data, self.incoming.data,
                                    np.int32(iteration))

    print("%.4d, %.3f, %.3f, %.3f" % (iteration, self.goal_m, m_out, self.optimized_spa))
    self.ggr_tracker[iteration] = iteration, self.optimized_spa, m_out

    # set a flag to save output. it is better to check m_out than to trust that
    # the self.goal_m asked for in self.plan has actually been achieved
    if m_out < self.next_crossing:
        print("***")
        self.checkpoint = True
        self.crossed = self.next_crossing
        try:
            self.crossings = self.crossings[:-1]
            self.next_crossing = self.crossings[-1]
        except IndexError:
            self.next_crossing = 0.01
    else:
        self.checkpoint = False