def __init__(self, module, use_constant_memory):
    """
    Store the rendered module and fill in the fixed transform parameters
    together with the constant tables for the forward and inverse transforms.

    :param module: kept as ``self.module``.
    :param use_constant_memory: kept as ``self.use_constant_memory``.
    """
    self.module = module
    self.use_constant_memory = use_constant_memory

    # Transformed elements.
    self.transform_length = 512
    self.elem_dtype = numpy.dtype('complex128')
    self.elem_ctype = dtypes.ctype(self.elem_dtype)

    # Polynomial coefficients.
    self.polynomial_length = 1024
    self.polynomial_dtype = numpy.int32
    self.polynomial_ctype = dtypes.ctype(self.polynomial_dtype)

    self.threads_per_transform = 64

    # Temporary storage.
    self.temp_dtype = numpy.dtype('float64')
    self.temp_ctype = dtypes.ctype(self.temp_dtype)
    self.temp_length = 576

    # Twiddle tables: entry [row, col] is
    # exp(-/+ 2j * pi / transform_length * row * col).
    rows, cols = 8, 64
    forward_step = -2j * numpy.pi / self.transform_length
    inverse_step = 2j * numpy.pi / self.transform_length
    forward_twiddles = numpy.empty((rows, cols), numpy.complex128)
    inverse_twiddles = numpy.empty((rows, cols), numpy.complex128)
    for row in range(rows):
        for col in range(cols):
            forward_twiddles[row, col] = numpy.exp(forward_step * row * col)
            inverse_twiddles[row, col] = numpy.exp(inverse_step * row * col)

    # Coefficients appended after the flattened twiddles; the inverse table
    # carries them divided by the transform length.
    half_indices = numpy.arange(self.polynomial_length // 2)
    coeffs = numpy.exp(-2j * numpy.pi * half_indices / self.polynomial_length / 2)

    self.cdata_fw = numpy.concatenate([forward_twiddles.flatten(), coeffs])
    self.cdata_inv = numpy.concatenate(
        [inverse_twiddles.flatten(), coeffs / self.transform_length])

    self.cdata_fw_ctype = dtypes.ctype(self.cdata_fw.dtype)
    self.cdata_inv_ctype = dtypes.ctype(self.cdata_inv.dtype)
def nonlinear_no_potential(dtype, U, nu):
    """
    Create a module with two functions (suffixes ``0`` and ``1``), one per component.

    Function ``comp`` takes ``psi0``, ``psi1`` and ``t`` and returns
    ``psi_comp * (U[comp, 0] * norm(psi0) + U[comp, 1] * norm(psi1)) - psi_(1 - comp) * nu``.

    :param dtype: complex data type of the ``psi`` arguments.
    :param U: array indexed as ``U[comp, other]``; rendered as C constants.
    :param nu: coefficient rendered as a C constant of the matching real type.
    """
    complex_dtype = dtype
    real_dtype = dtypes.real_for(dtype)

    render_kwds = dict(
        mul=functions.mul(complex_dtype, real_dtype),
        norm=functions.norm(complex_dtype),
        U=U,
        nu=dtypes.c_constant(nu, real_dtype),
        c_ctype=dtypes.ctype(complex_dtype),
        s_ctype=dtypes.ctype(real_dtype))

    return Module.create(
        """
        %for comp in (0, 1):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            ${c_ctype} psi0, ${c_ctype} psi1, ${s_ctype} t)
        {
            return (
                ${mul}(psi${comp}, (
                    ${dtypes.c_constant(U[comp, 0])} * ${norm}(psi0) +
                    ${dtypes.c_constant(U[comp, 1])} * ${norm}(psi1)
                    ))
                - ${mul}(psi${1 - comp}, ${nu})
                );
        }
        %endfor
        """,
        render_kwds=render_kwds)
def get_nonlinear_wrapper(components, c_dtype, nonlinear_module, dt):
    """
    Wrap ``nonlinear_module`` so that every component function returns its
    result multiplied by the complex constant ``(0, -dt)``.

    :param components: number of ``psi`` arguments and of generated functions.
    :param c_dtype: complex data type of the ``psi`` arguments.
    :param nonlinear_module: module whose functions ``<prefix><comp>`` are called
        with all ``psi`` values followed by ``V`` and ``t``.
    :param dt: value rendered as a C constant of the matching real type.
    """
    real_dtype = dtypes.real_for(c_dtype)

    render_kwds = dict(
        components=components,
        c_ctype=dtypes.ctype(c_dtype),
        s_ctype=dtypes.ctype(real_dtype),
        mul=functions.mul(c_dtype, c_dtype),
        dt=dtypes.c_constant(dt, real_dtype),
        nonlinear=nonlinear_module)

    return Module.create(
        """
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            %for pcomp in range(components):
            ${c_ctype} psi${pcomp},
            %endfor
            ${s_ctype} V, ${s_ctype} t)
        {
            ${c_ctype} nonlinear = ${nonlinear}${comp}(
                %for pcomp in range(components):
                psi${pcomp},
                %endfor
                V, t);
            return ${mul}(
                COMPLEX_CTR(${c_ctype})(0, -${dt}),
                nonlinear);
        }
        %endfor
        """,
        render_kwds=render_kwds)
def kspacegaussian_filter_CL(ksp, sigma):
    """
    Apply a Gaussian weighting to a 3D k-space array on an OpenCL device.

    Every element is multiplied by
    ``exp(-2 * pi**2 * ((i * sigma[0])**2 + (j * sigma[1])**2 + (k * sigma[2])**2))``,
    where ``i``, ``j``, ``k`` are the centred index along each axis divided
    by that axis' length.

    :param ksp: 3D array; only ``ksp.shape[0:3]`` is used for the geometry.
        NOTE(review): the kernel is compiled for ``complex64`` elements, so
        ``ksp`` is assumed to already have that dtype -- confirm at call sites.
    :param sigma: sequence with (at least) three filter widths.
    :returns: the weighted k-space data as a host array.

    Changes relative to the previous revision:

    * the input is now uploaded with ``to_device``; before, the kernel ran on
      an uninitialised ``empty_like`` buffer and ``ksp`` was never read;
    * the row index wraps with ``>=`` instead of ``>``, so a row index equal
      to ``dim2`` is folded back to 0;
    * the trailing FFT block was removed: its result was never returned and
      its normalisation ``x / a * b * c`` was a precedence error;
    * the thread is released when the function finishes.
    """
    from reikna import cluda
    from reikna.cluda import functions, dtypes

    sz = np.array(ksp.shape)
    dtype = np.complex64
    ftype = np.float32
    api = cluda.ocl_api()
    thr = api.Thread.create()
    FACTOR = 1.0

    try:
        # Dimensions and sigmas are baked into the source with %-formatting;
        # the ${...} placeholders are rendered later from render_kwds.
        program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ${ultype} x = (${ultype}) get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;
  const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ${ultype} idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
  i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
  if((SIZE_T)j >= dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
  j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  //Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x) , (double) dim3);
  ${ftype} k = (${ftype}) pre_k;
  k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight;
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
            render_kwds=dict(
                ctype=dtypes.ctype(dtype),
                ftype=dtypes.ctype(ftype),
                ultype=dtypes.ctype(np.uint64),
                exp=functions.exp(ftype)),
            fast_math=True)

        gauss_kernel = program.gauss_kernel

        # Upload the actual data; the kernel filters it in place.
        data_dev = thr.to_device(ksp)
        gauss_kernel(data_dev, data_dev, global_size=int(sz[0] * sz[1] * sz[2]))
        ksp_out = data_dev.get()
    finally:
        thr.release()

    return ksp_out
def get_nonlinear_wrapper(state_dtype, grid_dims, drift, diffusion=None):
    """
    Create a module combining a drift term and (optionally) diffusion terms.

    For each of ``drift.components`` components a function is generated that
    takes ``grid_dims`` integer indices, all ``psi_*`` values, the ``dW_*``
    noise values (only when ``diffusion`` is given), ``t`` and ``dt``.
    It returns ``drift_comp(...) * dt`` plus, for every noise source,
    ``diffusion_comp_ncomp(...) * dW_ncomp``.

    :param state_dtype: data type of the state (``psi``) values and of the result.
    :param grid_dims: number of index arguments.
    :param drift: object providing ``components`` and ``module``.
    :param diffusion: ``None``, or an object providing ``dtype``,
        ``noise_sources`` and ``module``.
    """
    real_dtype = dtypes.real_for(state_dtype)
    # Without diffusion the noise type is never used by the generated
    # functions; it defaults to the real type.
    noise_dtype = diffusion.dtype if diffusion is not None else real_dtype

    render_kwds = dict(
        grid_dims=grid_dims,
        s_ctype=dtypes.ctype(state_dtype),
        r_ctype=dtypes.ctype(real_dtype),
        n_ctype=dtypes.ctype(noise_dtype),
        mul_sr=functions.mul(state_dtype, real_dtype),
        mul_sn=functions.mul(state_dtype, noise_dtype),
        drift=drift,
        diffusion=diffusion)

    return Module.create(
        """
        <%
            components = drift.components
            idx_args = ["idx_" + str(dim) for dim in range(grid_dims)]
            psi_args = ["psi_" + str(comp) for comp in range(components)]
            if diffusion is not None:
                dW_args = ["dW_" + str(ncomp) for ncomp in range(diffusion.noise_sources)]
        %>
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${s_ctype} ${prefix}${comp}(
            %for idx in idx_args:
            const int ${idx},
            %endfor
            %for psi in psi_args:
            const ${s_ctype} ${psi},
            %endfor
            %if diffusion is not None:
            %for dW in dW_args:
            const ${n_ctype} ${dW},
            %endfor
            %endif
            const ${r_ctype} t,
            const ${r_ctype} dt)
        {
            return
                ${mul_sr}(${drift.module}${comp}(
                    ${", ".join(idx_args)}, ${", ".join(psi_args)}, t), dt)
                %if diffusion is not None:
                %for ncomp in range(diffusion.noise_sources):
                + ${mul_sn}(${diffusion.module}${comp}_${ncomp}(
                    ${", ".join(idx_args)}, ${", ".join(psi_args)}, t), ${dW_args[ncomp]})
                %endfor
                %endif
                ;
        }
        %endfor
        """,
        render_kwds=render_kwds)
def fft512(use_constant_memory=False):
    """
    Render the ``fft512`` template definition and wrap it in an ``FFT512`` object.

    :param use_constant_memory: forwarded both to the template and to ``FFT512``.
    :returns: an ``FFT512`` instance built around the rendered module.
    """
    complex_dtype = numpy.complex128
    real_dtype = numpy.float64

    render_kwds = dict(
        elem_ctype=dtypes.ctype(complex_dtype),
        temp_ctype=dtypes.ctype(real_dtype),
        cdata_ctype=dtypes.ctype(complex_dtype),
        polar_unit=functions.polar_unit(real_dtype),
        mul=functions.mul(complex_dtype, complex_dtype),
        use_constant_memory=use_constant_memory,
    )
    module = Module(TEMPLATE.get_def('fft512'), render_kwds=render_kwds)
    return FFT512(module, use_constant_memory)
def get_nonlinear(dtype, interaction, tunneling):
    r"""
    Nonlinear module

    .. math::

        N(\psi_1, ... \psi_C)
        = \sum_{n=1}^{C} U_{jn} |\psi_n|^2 \psi_j
          - \nu_j \psi_{m_j}

    The generated functions also add the potential argument ``V`` to the
    sum that multiplies :math:`\psi_j`.

    ``interaction``: a symmetrical ``components x components`` array with interaction strengths.
    ``tunneling``: a list of (other_comp, coeff) pairs of tunnelling strengths.
    """
    complex_dtype = dtype
    real_dtype = dtypes.real_for(dtype)

    render_kwds = dict(
        components=interaction.shape[0],
        mul=functions.mul(complex_dtype, real_dtype),
        norm=functions.norm(complex_dtype),
        interaction=interaction,
        tunneling=tunneling,
        s_dtype=real_dtype,
        c_ctype=dtypes.ctype(complex_dtype),
        s_ctype=dtypes.ctype(real_dtype))

    return Module.create(
        """
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            %for pcomp in range(components):
            ${c_ctype} psi${pcomp},
            %endfor
            ${s_ctype} V, ${s_ctype} t)
        {
            return (
                ${mul}(psi${comp}, (
                    %for other_comp in range(components):
                    + ${dtypes.c_constant(interaction[comp, other_comp], s_dtype)} *
                        ${norm}(psi${other_comp})
                    %endfor
                    + V
                    ))
                - ${mul}(
                    psi${tunneling[comp][0]},
                    ${dtypes.c_constant(tunneling[comp][1], s_dtype)})
                );
        }
        %endfor
        """,
        render_kwds=render_kwds)
def uniform_integer(bijection, dtype, low, high=None):
    """
    Generates uniformly distributed integer numbers in the interval ``[low, high)``.
    If ``high`` is ``None``, the bounds become ``low = 0, high = low + 1``,
    i.e. the passed ``low`` value itself is included in the interval.
    Supported dtypes: any numpy integers.
    The sampler is marked as deterministic when the size of the interval
    divides the range of the raw generator (``2**32`` or ``2**64``).
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    if high is not None:
        assert low < high - 1
    else:
        low, high = 0, low + 1

    dtype = dtypes.normalize_type(dtype)
    ctype = dtypes.ctype(dtype)

    # Both bounds must be representable in the requested type.
    bits = dtype.itemsize * 8
    if dtype.kind == 'i':
        lower_limit = -2**(bits - 1)
        upper_limit = 2**(bits - 1)
    else:
        lower_limit = 0
        upper_limit = 2**bits
    assert low >= lower_limit
    assert high < upper_limit

    # Pick the narrowest raw generator that covers the interval.
    num = high - low
    raw_dtype = numpy.dtype('uint32' if num <= 2**32 else 'uint64')
    raw_func = bijection.raw_functions[raw_dtype]
    max_num = 2**(raw_dtype.itemsize * 8)
    raw_ctype = dtypes.ctype(dtypes.normalize_type(raw_dtype))

    render_kwds = dict(
        bijection=bijection,
        dtype=dtype,
        ctype=ctype,
        raw_ctype=raw_ctype,
        raw_func=raw_func,
        max_num=max_num,
        num=num,
        low=low)
    module = Module(TEMPLATE.get_def("uniform_integer"), render_kwds=render_kwds)

    return Sampler(bijection, module, dtype, deterministic=(max_num % num == 0))
def test_dtype_support(thr, dtype):
    """
    Check that a kernel computing ``(a * b) / b`` for the given dtype
    compiles and reproduces ``a``.

    The test is skipped when the thread's device reports that it does not
    support ``dtype``.
    """
    if not thr.device_params.supports_dtype(dtype):
        pytest.skip()

    length = 256

    program = thr.compile(
        """
        KERNEL void test(
            GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b)
        {
          const SIZE_T i = get_global_id(0);
          ${ctype} temp = ${mul}(a[i], b[i]);
          dest[i] = ${div}(temp, b[i]);
        }
        """,
        render_kwds=dict(
            ctype=dtypes.ctype(dtype),
            dtype=dtype,
            mul=functions.mul(dtype, dtype),
            div=functions.div(dtype, dtype)))
    kernel = program.test

    # we need results to fit even in unsigned char
    a = get_test_array(length, dtype, high=8)
    b = get_test_array(length, dtype, no_zeros=True, high=8)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    dest_dev = thr.empty_like(a_dev)
    kernel(dest_dev, a_dev, b_dev, global_size=length)

    assert diff_is_negligible(thr.from_device(dest_dev), a)
def gamma(bijection, dtype, shape=1, scale=1):
    """
    Generates random numbers from the gamma distribution

    .. math::
      P(x) = x^{k-1} \\frac{e^{-x/\\theta}}{\\theta^k \\Gamma(k)},

    where :math:`k` is ``shape``, and :math:`\\theta` is ``scale``.
    Supported dtypes: ``float(32/64)``.
    Internally uses a uniform sampler on ``[0, 1)`` and a Box-Muller
    normal sampler with zero mean and unit deviation.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    uniform_sampler = uniform_float(bijection, dtype, low=0, high=1)
    normal_sampler = normal_bm(bijection, dtype, mean=0, std=1)

    render_kwds = dict(
        dtype=dtype,
        ctype=dtypes.ctype(dtype),
        bijection=bijection,
        shape=shape,
        scale=dtypes.c_constant(scale, dtype),
        uf=uniform_sampler,
        nbm=normal_sampler)
    module = Module(TEMPLATE.get_def("gamma"), render_kwds=render_kwds)

    return Sampler(bijection, module, dtype)
def __init__(self, ff_elem, module, use_constant_memory):
    """
    Store the finite-field element description and the rendered module, and
    fill in the fixed transform parameters and constant tables.

    :param ff_elem: kept as ``self.ff``; its ``module`` attribute is used as
        the C type of elements, temporaries and constant data.
    :param module: kept as ``self.module``.
    :param use_constant_memory: kept as ``self.use_constant_memory``.
    """
    self.ff = ff_elem
    self.module = module
    self.use_constant_memory = use_constant_memory

    # Transformed elements.
    self.transform_length = 1024
    self.elem_dtype = numpy.dtype('uint64')
    self.elem_ctype = ff_elem.module

    # Polynomial coefficients.
    self.polynomial_length = 1024
    self.polynomial_dtype = numpy.dtype('int32')
    self.polynomial_ctype = dtypes.ctype(self.polynomial_dtype)

    self.threads_per_transform = 128

    # Temporary storage.
    self.temp_dtype = numpy.dtype('uint64')
    self.temp_ctype = ff_elem.module
    self.temp_length = 1024

    # Constant data: twiddle factors followed by the "sqrt" twiddles, with
    # separate tables for the forward and the inverse direction.
    forward, inverse, forward_sqrt, inverse_sqrt = gen_twiddle_ref()
    self.cdata_fw = numpy.concatenate([forward, forward_sqrt])
    self.cdata_inv = numpy.concatenate([inverse, inverse_sqrt])
    self.cdata_fw_ctype = ff_elem.module
    self.cdata_inv_ctype = ff_elem.module
def vonmises(bijection, dtype, mu=0, kappa=1):
    """
    Generates random numbers from the von Mises distribution

    .. math::
      P(x) = \\frac{\\exp(\\kappa \\cos(x - \\mu))}{2 \\pi I_0(\\kappa)},

    where :math:`\\mu` is the mode, :math:`\\kappa` is the dispersion,
    and :math:`I_0` is the modified Bessel function of the first kind.
    Supported dtypes: ``float(32/64)``.
    Internally uses a uniform sampler on ``[0, 1)``.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    uniform_sampler = uniform_float(bijection, dtype, low=0, high=1)

    render_kwds = dict(
        dtype=dtype,
        ctype=dtypes.ctype(dtype),
        bijection=bijection,
        mu=dtypes.c_constant(mu, dtype),
        kappa=kappa,
        uf=uniform_sampler)
    module = Module(TEMPLATE.get_def("vonmises"), render_kwds=render_kwds)

    return Sampler(bijection, module, dtype)
def uniform_float(bijection, dtype, low=0, high=1):
    """
    Generates uniformly distributed floating-points numbers in the interval ``[low, high)``.
    Supported dtypes: ``float(32/64)``.
    A fixed number of counters is used in each thread.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    assert low < high

    # Double precision output consumes 64-bit raw integers, anything else 32-bit ones.
    if dtypes.is_double(dtype):
        bitness = 64
    else:
        bitness = 32

    render_kwds = dict(
        bijection=bijection,
        ctype=dtypes.ctype(dtype),
        raw_func='get_raw_uint' + str(bitness),
        raw_max=dtypes.c_constant(2 ** bitness, dtype),
        size=dtypes.c_constant(high - low, dtype),
        low=dtypes.c_constant(low, dtype))
    module = Module(TEMPLATE.get_def("uniform_float"), render_kwds=render_kwds)

    return Sampler(bijection, module, dtype, deterministic=True)
def uniform_float(bijection, dtype, low=0, high=1):
    """
    Generates uniformly distributed floating-points numbers in the interval ``[low, high)``.
    Supported dtypes: ``float(32/64)``.
    A fixed number of counters is used in each thread.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    assert low < high

    # Width of the raw integers drawn from the bijection.
    bitness = 64 if dtypes.is_double(dtype) else 32

    # All numeric parameters are passed to the template as C constants of ``dtype``.
    interval_size = dtypes.c_constant(high - low, dtype)
    interval_start = dtypes.c_constant(low, dtype)
    raw_range = dtypes.c_constant(2**bitness, dtype)

    module = Module(
        TEMPLATE.get_def("uniform_float"),
        render_kwds=dict(
            bijection=bijection,
            ctype=dtypes.ctype(dtype),
            raw_func='get_raw_uint' + str(bitness),
            raw_max=raw_range,
            size=interval_size,
            low=interval_start))

    return Sampler(bijection, module, dtype, deterministic=True)
def uniform_integer(bijection, dtype, low, high=None):
    """
    Generates uniformly distributed integer numbers in the interval ``[low, high)``.
    If ``high`` is ``None``, the bounds become ``low = 0, high = low + 1``,
    i.e. the passed ``low`` value itself is included in the interval.
    Supported dtypes: any numpy integers.
    The sampler is marked as deterministic when the size of the interval
    divides the range of the raw generator (``2**32`` or ``2**64``).
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    if high is not None:
        assert low < high - 1
    else:
        low, high = 0, low + 1

    dtype = dtypes.normalize_type(dtype)
    ctype = dtypes.ctype(dtype)

    # Range check of the bounds against the target integer type.
    type_bits = dtype.itemsize * 8
    signed = dtype.kind == 'i'
    assert low >= (-2 ** (type_bits - 1) if signed else 0)
    assert high < (2 ** (type_bits - 1) if signed else 2 ** type_bits)

    num = high - low
    if num > 2 ** 32:
        raw_dtype = numpy.dtype('uint64')
    else:
        raw_dtype = numpy.dtype('uint32')

    max_num = 2 ** (raw_dtype.itemsize * 8)

    module = Module(
        TEMPLATE.get_def("uniform_integer"),
        render_kwds=dict(
            bijection=bijection,
            dtype=dtype,
            ctype=ctype,
            raw_ctype=dtypes.ctype(dtypes.normalize_type(raw_dtype)),
            raw_func=bijection.raw_functions[raw_dtype],
            max_num=max_num,
            num=num,
            low=low))

    return Sampler(bijection, module, dtype, deterministic=(max_num % num == 0))
def get_nonlinear_wrapper(c_dtype, nonlinear_module, dt):
    """
    Wrap a two-component ``nonlinear_module`` so that each of its functions
    (suffixes ``0`` and ``1``) returns its result multiplied by the complex
    constant ``(0, -dt)``.

    :param c_dtype: complex data type of ``psi0`` and ``psi1``.
    :param nonlinear_module: module whose functions ``<prefix><comp>`` are
        called as ``(psi0, psi1, t)``.
    :param dt: value rendered as a C constant of the matching real type.
    """
    real_dtype = dtypes.real_for(c_dtype)

    render_kwds = dict(
        c_ctype=dtypes.ctype(c_dtype),
        s_ctype=dtypes.ctype(real_dtype),
        mul=functions.mul(c_dtype, c_dtype),
        dt=dtypes.c_constant(dt, real_dtype),
        nonlinear=nonlinear_module)

    return Module.create(
        """
        %for comp in (0, 1):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            ${c_ctype} psi0, ${c_ctype} psi1, ${s_ctype} t)
        {
            ${c_ctype} nonlinear = ${nonlinear}${comp}(psi0, psi1, t);
            return ${mul}(
                COMPLEX_CTR(${c_ctype})(0, -${dt}),
                nonlinear);
        }
        %endfor
        """,
        render_kwds=render_kwds)
def normal_bm(bijection, dtype, mean=0, std=1):
    """
    Generates normally distributed random numbers with the mean ``mean`` and
    the standard deviation ``std`` using Box-Muller transform.
    Supported dtypes: ``float(32/64)``, ``complex(64/128)``.
    Produces two random numbers per call for real types and one number for complex types.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.

    .. note::

        In case of a complex ``dtype``, ``std`` refers to the standard deviation of the
        complex numbers (same as ``numpy.std()`` returns), not real and imaginary components
        (which will be normally distributed with the standard deviation ``std / sqrt(2)``).
        Consequently, while ``mean`` is of type ``dtype``, ``std`` must be real.
    """
    complex_result = dtypes.is_complex(dtype)

    # Work out the matching real/complex pair regardless of which one was given.
    if complex_result:
        c_dtype = dtype
        r_dtype = dtypes.real_for(dtype)
    else:
        c_dtype = dtypes.complex_for(dtype)
        r_dtype = dtype

    uniform_sampler = uniform_float(bijection, r_dtype, low=0, high=1)

    render_kwds = dict(
        complex_res=complex_result,
        r_dtype=r_dtype,
        r_ctype=dtypes.ctype(r_dtype),
        c_dtype=c_dtype,
        c_ctype=dtypes.ctype(c_dtype),
        polar_unit=functions.polar_unit(r_dtype),
        bijection=bijection,
        mean=mean,
        std=std,
        uf=uniform_sampler)
    module = Module(TEMPLATE.get_def("normal_bm"), render_kwds=render_kwds)

    return Sampler(
        bijection, module, dtype,
        deterministic=uniform_sampler.deterministic,
        randoms_per_call=1 if complex_result else 2)
def normal_bm(bijection, dtype, mean=0, std=1):
    """
    Generates normally distributed random numbers with the mean ``mean`` and
    the standard deviation ``std`` using Box-Muller transform.
    Supported dtypes: ``float(32/64)``, ``complex(64/128)``.
    Produces two random numbers per call for real types and one number for complex types.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.

    .. note::

        In case of a complex ``dtype``, ``std`` refers to the standard deviation of the
        complex numbers (same as ``numpy.std()`` returns), not real and imaginary components
        (which will be normally distributed with the standard deviation ``std / sqrt(2)``).
        Consequently, while ``mean`` is of type ``dtype``, ``std`` must be real.
    """
    is_complex = dtypes.is_complex(dtype)
    r_dtype = dtypes.real_for(dtype) if is_complex else dtype
    c_dtype = dtype if is_complex else dtypes.complex_for(dtype)

    # The transform consumes uniform numbers of the real type.
    uf = uniform_float(bijection, r_dtype, low=0, high=1)

    module = Module(
        TEMPLATE.get_def("normal_bm"),
        render_kwds=dict(
            complex_res=is_complex,
            r_dtype=r_dtype,
            r_ctype=dtypes.ctype(r_dtype),
            c_dtype=c_dtype,
            c_ctype=dtypes.ctype(c_dtype),
            polar_unit=functions.polar_unit(r_dtype),
            bijection=bijection,
            mean=mean,
            std=std,
            uf=uf))

    if is_complex:
        randoms_per_call = 1
    else:
        randoms_per_call = 2

    return Sampler(
        bijection, module, dtype,
        deterministic=uf.deterministic,
        randoms_per_call=randoms_per_call)
def philox(bitness, counter_words, rounds=10):
    """
    A CBRNG based on a low number of slow rounds (multiplications).

    :param bitness: ``32`` or ``64``, corresponds to the size of generated random integers.
    :param counter_words: ``2`` or ``4``, number of integers generated in one go.
    :param rounds: ``1`` to ``12``, the more rounds, the better randomness is achieved.
        Default values are big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """
    # Constants keyed by word size.
    W_CONSTANTS = {
        64: [
            numpy.uint64(0x9E3779B97F4A7C15),  # golden ratio
            numpy.uint64(0xBB67AE8584CAA73B)  # sqrt(3)-1
        ],
        32: [
            numpy.uint32(0x9E3779B9),  # golden ratio
            numpy.uint32(0xBB67AE85)  # sqrt(3)-1
        ]
    }

    # Multipliers keyed by (word size, number of counter words).
    M_CONSTANTS = {
        (64, 2): [numpy.uint64(0xD2B74407B1CE6E93)],
        (64, 4): [numpy.uint64(0xD2E7470EE14C6C93), numpy.uint64(0xCA5A826395121157)],
        (32, 2): [numpy.uint32(0xD256D193)],
        (32, 4): [numpy.uint32(0xD2511F53), numpy.uint32(0xCD9E8D57)]
    }

    assert 1 <= rounds <= 12

    if bitness == 32:
        word_dtype = dtypes.normalize_type(numpy.uint32)
    else:
        word_dtype = dtypes.normalize_type(numpy.uint64)

    # The key is half as long as the counter.
    key_words = counter_words // 2
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    render_kwds = dict(
        word_dtype=word_dtype,
        word_ctype=dtypes.ctype(word_dtype),
        key_words=key_words,
        counter_words=counter_words,
        key_ctype=key_ctype,
        counter_ctype=counter_ctype,
        rounds=rounds,
        w_constants=W_CONSTANTS[bitness],
        m_constants=M_CONSTANTS[(bitness, counter_words)])
    module = Module(TEMPLATE.get_def("philox"), render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
def check_kernel_sampler(thr, sampler, extent=None, mean=None, std=None):
    """
    Fill an array with numbers drawn from ``sampler`` inside a custom kernel
    and pass it to ``check_distribution``.

    Each of the 10000 virtual threads creates its own key from its index,
    starts from counter 0 and calls the sampler 100 times.

    :param thr: thread used to compile and run the kernel.
    :param sampler: sampler under test (provides ``bijection``, ``module``,
        ``dtype`` and ``randoms_per_call``).
    :param extent: forwarded to ``check_distribution``.
    :param mean: forwarded to ``check_distribution``.
    :param std: forwarded to ``check_distribution``.
    """
    threads = 10000
    calls_per_thread = 100
    key_seed = 456

    bijection = sampler.bijection
    keygen = KeyGenerator.create(bijection, seed=key_seed)

    render_kwds = dict(
        size=threads,
        batch=calls_per_thread,
        ctype=dtypes.ctype(sampler.dtype),
        bijection=bijection,
        keygen=keygen,
        sampler=sampler)

    rng_kernel = thr.compile_static(
        """
        KERNEL void test(GLOBAL_MEM ${ctype} *dest, int ctr_start)
        {
            VIRTUAL_SKIP_THREADS;
            const VSIZE_T idx = virtual_global_id(0);

            ${bijection.module}Key key = ${keygen.module}key_from_int(idx);
            ${bijection.module}Counter ctr = ${bijection.module}make_counter_from_int(ctr_start);
            ${bijection.module}State st = ${bijection.module}make_state(key, ctr);

            ${sampler.module}Result res;

            for(int j = 0; j < ${batch}; j++)
            {
                res = ${sampler.module}sample(&st);

                %for i in range(sampler.randoms_per_call):
                dest[j * ${size * sampler.randoms_per_call} + ${size * i} + idx] = res.v[${i}];
                %endfor
            }

            ${bijection.module}Counter next_ctr = ${bijection.module}get_next_unused_counter(st);
        }
        """,
        'test', threads,
        render_kwds=render_kwds)

    # Layout matches the kernel's indexing: (call, random within call, thread).
    dest_dev = thr.array(
        (calls_per_thread, sampler.randoms_per_call, threads), sampler.dtype)
    rng_kernel(dest_dev, numpy.int32(0))

    check_distribution(dest_dev.get(), extent=extent, mean=mean, std=std)
def philox(bitness, counter_words, rounds=10):
    """
    A CBRNG based on a low number of slow rounds (multiplications).

    :param bitness: ``32`` or ``64``, corresponds to the size of generated random integers.
    :param counter_words: ``2`` or ``4``, number of integers generated in one go.
    :param rounds: ``1`` to ``12``, the more rounds, the better randomness is achieved.
        Default values are big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """
    assert 1 <= rounds <= 12

    W_CONSTANTS = {
        64: [
            numpy.uint64(0x9E3779B97F4A7C15),  # golden ratio
            numpy.uint64(0xBB67AE8584CAA73B),  # sqrt(3)-1
        ],
        32: [
            numpy.uint32(0x9E3779B9),  # golden ratio
            numpy.uint32(0xBB67AE85),  # sqrt(3)-1
        ],
    }

    M_CONSTANTS = {
        (64, 2): [numpy.uint64(0xD2B74407B1CE6E93)],
        (64, 4): [numpy.uint64(0xD2E7470EE14C6C93), numpy.uint64(0xCA5A826395121157)],
        (32, 2): [numpy.uint32(0xD256D193)],
        (32, 4): [numpy.uint32(0xD2511F53), numpy.uint32(0xCD9E8D57)],
    }

    numpy_word_type = numpy.uint32 if bitness == 32 else numpy.uint64
    word_dtype = dtypes.normalize_type(numpy_word_type)

    # Half as many key words as counter words.
    key_words = counter_words // 2
    key_dtype, key_ctype, counter_dtype, counter_ctype = create_struct_types(
        word_dtype, key_words, counter_words)

    module = Module(
        TEMPLATE.get_def("philox"),
        render_kwds=dict(
            word_dtype=word_dtype,
            word_ctype=dtypes.ctype(word_dtype),
            key_words=key_words,
            counter_words=counter_words,
            key_ctype=key_ctype,
            counter_ctype=counter_ctype,
            rounds=rounds,
            w_constants=W_CONSTANTS[bitness],
            m_constants=M_CONSTANTS[(bitness, counter_words)]))

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
def check_kernel_sampler(thr, sampler, extent=None, mean=None, std=None):
    """
    Draw numbers from ``sampler`` inside a custom kernel and validate the
    result with ``check_distribution``.

    The kernel is started with 10000 virtual threads; each thread builds a
    key from its index, starts at counter 0 and samples 100 times.

    :param thr: thread used to compile and run the kernel.
    :param sampler: sampler under test (provides ``bijection``, ``module``,
        ``dtype`` and ``randoms_per_call``).
    :param extent: forwarded to ``check_distribution``.
    :param mean: forwarded to ``check_distribution``.
    :param std: forwarded to ``check_distribution``.
    """
    size = 10000
    batch = 100
    seed = 456

    bijection = sampler.bijection
    keygen = KeyGenerator.create(bijection, seed=seed)

    kernel_source = """
        KERNEL void test(GLOBAL_MEM ${ctype} *dest, int ctr_start)
        {
            VIRTUAL_SKIP_THREADS;
            const VSIZE_T idx = virtual_global_id(0);

            ${bijection.module}Key key = ${keygen.module}key_from_int(idx);
            ${bijection.module}Counter ctr = ${bijection.module}make_counter_from_int(ctr_start);
            ${bijection.module}State st = ${bijection.module}make_state(key, ctr);

            ${sampler.module}Result res;

            for(int j = 0; j < ${batch}; j++)
            {
                res = ${sampler.module}sample(&st);

                %for i in range(sampler.randoms_per_call):
                dest[j * ${size * sampler.randoms_per_call} + ${size * i} + idx] = res.v[${i}];
                %endfor
            }

            ${bijection.module}Counter next_ctr = ${bijection.module}get_next_unused_counter(st);
        }
        """

    rng_kernel = thr.compile_static(
        kernel_source, 'test', size,
        render_kwds=dict(
            size=size,
            batch=batch,
            ctype=dtypes.ctype(sampler.dtype),
            bijection=bijection,
            keygen=keygen,
            sampler=sampler))

    # One row per sampler call, one sub-row per random number of that call.
    output_shape = (batch, sampler.randoms_per_call, size)
    dest = thr.array(output_shape, sampler.dtype)
    rng_kernel(dest, numpy.int32(0))
    samples = dest.get()

    check_distribution(samples, extent=extent, mean=mean, std=std)
def gamma(bijection, dtype, shape=1, scale=1):
    """
    Generates random numbers from the gamma distribution

    .. math::
      P(x) = x^{k-1} \\frac{e^{-x/\\theta}}{\\theta^k \\Gamma(k)},

    where :math:`k` is ``shape``, and :math:`\\theta` is ``scale``.
    Supported dtypes: ``float(32/64)``.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    # Helper samplers handed to the template: uniform on [0, 1) and
    # Box-Muller normal with zero mean and unit standard deviation.
    helpers = dict(
        uf=uniform_float(bijection, dtype, low=0, high=1),
        nbm=normal_bm(bijection, dtype, mean=0, std=1))

    module = Module(
        TEMPLATE.get_def("gamma"),
        render_kwds=dict(
            dtype=dtype,
            ctype=dtypes.ctype(dtype),
            bijection=bijection,
            shape=shape,
            scale=dtypes.c_constant(scale, dtype),
            **helpers))

    return Sampler(bijection, module, dtype)
def _build_plan(self, plan_factory, device_params, result, phase):
    """
    Build a plan with a single element-wise computation that maps every
    ``phase`` value to ``((unsigned int)phase + interv // 2) / interv``,
    where ``interv = 2**32 // self._mspace_size``.

    :param plan_factory: callable returning an empty plan.
    :param device_params: not used by this method.
    :param result: output array (annotated as ``'o'``).
    :param phase: input array (annotated as ``'i'``).
    :returns: the filled plan.
    """
    plan = plan_factory()

    parameters = [
        Parameter('result', Annotation(result, 'o')),
        Parameter('phase', Annotation(phase, 'i')),
    ]
    snippet = """
        <%
            interv = 2**32 // mspace_size
            half_interv = interv // 2
        %>
        ${phase.ctype} phase = ${phase.load_same};
        ${result.store_same}(((unsigned int)phase + ${half_interv}) / ${interv});
        """
    transformation = Transformation(
        parameters,
        snippet,
        render_kwds=dict(
            mspace_size=self._mspace_size,
            uint64=dtypes.ctype(numpy.uint64)),
        connectors=['result', 'phase'])

    computation = PureParallel.from_trf(transformation, guiding_array='result')
    plan.computation_call(computation, result, phase)

    return plan
def __init__(self, module):
    """
    Store the rendered module together with the C type names for 32- and
    64-bit unsigned integers and the modulus ``2**64 - 2**32 + 1`` rendered
    as a 64-bit unsigned C constant.

    :param module: kept as ``self.module``.
    """
    modulus_value = 2**64 - 2**32 + 1

    self.module = module
    self.u32 = dtypes.ctype(numpy.uint32)
    self.u64 = dtypes.ctype(numpy.uint64)
    self.modulus = dtypes.c_constant(modulus_value, numpy.uint64)
def kspaceepanechnikov_filter_CL2(ksp, sigma):
    """
    Filter 3D k-space data with an Epanechnikov kernel on the GPU and
    reconstruct the image.

    Steps:

    1. build an image-domain Epanechnikov kernel (``epanechnikov_kernel``)
       whose odd size per axis is derived from ``ceil(sigma) + 2``;
    2. embed it around the centre of a zero array shaped like ``ksp``;
    3. FFT + FFT-shift that array on the device and take the magnitude;
    4. multiply the real and imaginary parts of ``ksp`` by that magnitude;
    5. FFT + FFT-shift the product, divide by the number of elements and
       reverse all three axes.

    :param ksp: 3D k-space array.
        NOTE(review): the multiply kernel is compiled for ``complex64``
        data, so ``ksp`` is assumed to have that dtype -- confirm.
    :param sigma: sequence of three filter widths.
    :returns: the reconstructed array.

    Fixes relative to the previous revision:

    * the multiply kernel indexed with ``get_local_id``, which only covers
      the elements of one work-group; it now uses ``get_global_id``;
    * ``global_size`` was hard-coded to ``512 * 512 * 512``; it is now the
      number of elements of ``ksp``;
    * the slice bounds were floats (results of ``np.floor``); they are
      converted to integers before indexing;
    * the ``print`` statements and ``xrange`` were Python 2 only; the
      replacements run on Python 2 and 3;
    * the thread is released even when an error occurs.
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    clear_first_arg_caches()

    print((np.ceil(sigma[0]) + 2, np.ceil(sigma[1]) + 2, np.ceil(sigma[2]) + 2))
    print(sigma)

    # Kernel size per axis, forced to be odd so the kernel has a centre.
    fsiz = (np.ceil(sigma) + 2).astype(int)
    fsiz[fsiz % 2 == 0] += 1

    # Create image-domain Epanechikov kernel
    Kepa = epanechnikov_kernel(fsiz, sigma)

    # Place kernel at centre of ksp-sized matrix
    Kfilter = np.zeros(np.array(sz), dtype=np.complex64)
    kernel_shape = np.array(Kepa.shape)
    szmin = np.floor(
        np.array(sz) / 2.0 - np.floor(kernel_shape / 2.0) - 1).astype(int)
    szmax = szmin + kernel_shape
    print(" ".join(str(item) for item in (
        "Epa filter size ", sz, " image filter ", Kepa.shape,
        " szmin ", szmin, " szmax ", szmax)))
    region = (slice(szmin[0], szmax[0]),
              slice(szmin[1], szmax[1]),
              slice(szmin[2], szmax[2]))
    # The kernel goes into both the real and the imaginary part.
    Kfilter[region] = Kepa
    Kfilter[region].imag = Kepa

    api = any_api()
    thr = api.Thread.create()
    try:
        # Create fourier-domain Epanechnikov filter
        data_dev = thr.to_device(Kfilter)
        crfft = FFT(data_dev).compile(thr)
        cfftshift = FFTShift(data_dev).compile(thr)
        crfft(data_dev, data_dev)
        thr.synchronize()
        cfftshift(data_dev, data_dev)
        Fepanechnikov = np.abs(data_dev.get())
        thr.synchronize()
        del data_dev, crfft, cfftshift

        # Multiply Epanechnikov filter to real and imag ksp data
        program = thr.compile("""
KERNEL void multiply_them(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *a,
    GLOBAL_MEM ${ftype} *f)
{
  const SIZE_T i = get_global_id(0);
  dest[i].x = a[i].x * f[i];
  dest[i].y = a[i].y * f[i];
}""", render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype)))

        data_dev = thr.to_device(ksp)
        filter_dev = thr.to_device(Fepanechnikov)
        multiply_them = program.multiply_them
        multiply_them(data_dev, data_dev, filter_dev,
                      global_size=int(np.prod(np.array(sz))))
        thr.synchronize()
        del filter_dev, program

        # Recon
        cifft = FFT(data_dev).compile(thr)
        cfftshift = FFTShift(data_dev).compile(thr)
        cifft(data_dev, data_dev, inverse=0)
        thr.synchronize()
        cfftshift(data_dev, data_dev)
        thr.synchronize()
        result2 = data_dev.get() / np.prod(np.array(ksp.shape))
        result2 = result2[::-1, ::-1, ::-1]
    finally:
        thr.release()

    return result2
def gaussian_fourierkernel(siz, sigma_):
    """
    Create Gaussian Fourier filter kernel with GPU

    An array of ones of shape ``siz`` is weighted on the device with
    ``exp(-2 * pi**2 * ((i * sigma[0])**2 + (j * sigma[1])**2 + (k * sigma[2])**2))``,
    where ``i``, ``j``, ``k`` are the centred index along each axis divided
    by that axis' length.

    :param siz: three-element shape of the kernel.
    :param sigma_: a scalar (used for all three axes) or a sequence of two
        or three widths; a two-element sequence gets ``0.0`` for the third axis.
    :returns: ``complex64`` array of shape ``siz`` with the weights in both
        the real and the imaginary part.

    Fixes relative to the previous revision:

    * the body read the undefined local ``sigma`` instead of the parameter
      ``sigma_`` and therefore always failed with ``UnboundLocalError``;
    * a two-element ``sigma_`` was extended with ``sigma[2] = 0.0``, which
      raises ``IndexError``; the value is now appended to a copy, so the
      caller's sequence is left untouched;
    * the row index wraps with ``>=`` instead of ``>``, so a row index equal
      to ``dim2`` is folded back to 0;
    * the thread is released even when an error occurs.
    """
    if not hasattr(sigma_, "__len__"):
        sigma = np.ones(3) * sigma_
    else:
        sigma = [float(width) for width in sigma_]
        if len(sigma) == 2:
            sigma.append(0.0)

    sz = siz
    ctype = np.complex64
    ftype = np.float32
    api = any_api()
    thr = api.Thread.create()
    FACTOR = 1.0

    try:
        base = np.ones(siz, ctype)
        data_dev = thr.to_device(base)

        # Dimensions and sigmas are baked into the source with %-formatting;
        # the ${...} placeholders are rendered later from render_kwds.
        program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ulong x = get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;
  const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ulong idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
  i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
  if((SIZE_T)j >= dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
  j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  //Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x) , (double) dim3);
  ${ftype} k = (${ftype}) pre_k;
  k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight;
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
            render_kwds=dict(
                ctype=dtypes.ctype(ctype),
                ftype=dtypes.ctype(ftype),
                exp=functions.exp(ftype)),
            fast_math=True)

        gauss_kernel = program.gauss_kernel
        gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2])
        gfilter = data_dev.get()
        thr.synchronize()
    finally:
        thr.release()

    return gfilter
def kspacegaussian_filter_CL2(ksp, sigma):
    """
    Kspace gaussian filter and recon using GPU OpenCL

    1. GPU initialisation
    2. push KSP complex matrix to GPU
    3. declare Complex Gaussian GPU filter program
    4. Execute Gaussian GPU program
    5. GPU sync
    6. declare and execute FFT Recon
    7. Execute FFTshift
    8. Retrieve reconstructed complex image from GPU
    9. Reorganise image to standard (mimic numpy format): divide by the
       number of elements and reverse all three axes

    :param ksp: 3D k-space array.
        NOTE(review): the kernel is compiled for ``complex64`` data, so
        ``ksp`` is assumed to have that dtype -- confirm.
    :param sigma: sequence with (at least) three filter widths.
    :returns: the reconstructed array.

    Fixes relative to the previous revision:

    * the row index wraps with ``>=`` instead of ``>``, so a row index equal
      to ``dim2`` is folded back to 0;
    * a second, never used ``FFT`` object and the unused ``ultype`` local
      were removed;
    * the thread is released even when an error occurs.
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    api = any_api()
    thr = api.Thread.create()
    FACTOR = 1.0

    try:
        data_dev = thr.to_device(ksp)

        # Dimensions and sigmas are baked into the source with %-formatting;
        # the ${...} placeholders are rendered later from render_kwds.
        program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ulong x = get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;
  const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ulong idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
  i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
  if((SIZE_T)j >= dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
  j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  // Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x), (double)dim3);
  ${ftype} k = (${ftype}) pre_k;
  k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  // ${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  // ${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight;
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
            render_kwds=dict(
                ctype=dtypes.ctype(dtype),
                ftype=dtypes.ctype(ftype),
                exp=functions.exp(ftype)),
            fast_math=True)

        gauss_kernel = program.gauss_kernel
        gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2])
        thr.synchronize()

        # Recon
        cifft = FFT(data_dev).compile(thr)
        cfftshift = FFTShift(data_dev).compile(thr)
        cifft(data_dev, data_dev, inverse=0)
        thr.synchronize()
        cfftshift(data_dev, data_dev)
        thr.synchronize()
        result2 = data_dev.get() / np.prod(np.array(ksp.shape))
        result2 = result2[::-1, ::-1, ::-1]
    finally:
        thr.release()

    return result2
if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); //Account for large global index (stored as ulong) before performing modulus double pre_k=fmod((double)(x) , (double) dim3); ${ftype} k = (${ftype}) pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight; dest[idx].y = src[idx].y * weight; //(${ftype})k; // } """ % (N, N, N, sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype( ftype), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel r1 = np.ones((N, N, N)).astype(ftype) # /N r2 = np.ones((N, N, N)).astype(ftype) # /N a = r1 + 1j * r2 b = r1 - 1j * r2 a_dev = thr.to_device(a) #b_dev = thr.to_device(b) #c_dev= thr.to_device(b.ravel()) #sigma_dev = thr.to_device(sigma) dest_dev = thr.empty_like(a_dev)
def threefry(bitness, counter_words, rounds=20):
    """
    Threefry counter-based RNG: a bijection made of many fast rounds
    (bit rotations).

    :param bitness: ``32`` or ``64``, the size of the generated random integers.
    :param counter_words: ``2`` or ``4``, how many integers are generated in one go.
    :param rounds: ``1`` to ``72``; the more rounds, the better the randomness.
        The default value is big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """

    # These are the R_256 constants from the Threefish reference sources
    # with names changed to R_64x4...
    rotations_64x4 = numpy.array([
        [14, 52, 23, 5, 25, 46, 58, 32],
        [16, 57, 40, 37, 33, 12, 22, 32]]).T

    # Output from skein_rot_search: (srs64_B64-X1000)
    # Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
    # Start: Tue Mar 1 10:07:48 2011
    # rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
    #  4 rounds: minHW =  4  [  4  4  4  4 ]
    #  5 rounds: minHW =  8  [  8  8  8  8 ]
    #  6 rounds: minHW = 16  [ 16 16 16 16 ]
    #  7 rounds: minHW = 32  [ 32 32 32 32 ]
    #  8 rounds: minHW = 64  [ 64 64 64 64 ]
    #  9 rounds: minHW = 64  [ 64 64 64 64 ]
    # 10 rounds: minHW = 64  [ 64 64 64 64 ]
    # 11 rounds: minHW = 64  [ 64 64 64 64 ]
    rotations_64x2 = numpy.array([[16, 42, 12, 31, 16, 32, 24, 21]]).T

    # Output from skein_rot_search: (srs-B128-X5000.out)
    # Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
    # Start: Mon Aug 24 22:41:36 2009
    # ...
    # rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
    #  4 rounds: minHW =  3  [  3  3  3  3 ]
    #  5 rounds: minHW =  7  [  7  7  7  7 ]
    #  6 rounds: minHW = 12  [ 13 12 13 12 ]
    #  7 rounds: minHW = 22  [ 22 23 22 23 ]
    #  8 rounds: minHW = 31  [ 31 31 31 31 ]
    #  9 rounds: minHW = 32  [ 32 32 32 32 ]
    # 10 rounds: minHW = 32  [ 32 32 32 32 ]
    # 11 rounds: minHW = 32  [ 32 32 32 32 ]
    rotations_32x4 = numpy.array([
        [10, 11, 13, 23, 6, 17, 25, 18],
        [26, 21, 27, 5, 20, 11, 10, 20]]).T

    # Output from skein_rot_search (srs32x2-X5000.out)
    # Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
    # Start: Tue Jul 12 11:11:33 2011
    # rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format
    #  4 rounds: minHW =  4  [  4  4  4  4 ]
    #  5 rounds: minHW =  6  [  6  8  6  8 ]
    #  6 rounds: minHW =  9  [  9 12  9 12 ]
    #  7 rounds: minHW = 16  [ 16 24 16 24 ]
    #  8 rounds: minHW = 32  [ 32 32 32 32 ]
    #  9 rounds: minHW = 32  [ 32 32 32 32 ]
    # 10 rounds: minHW = 32  [ 32 32 32 32 ]
    # 11 rounds: minHW = 32  [ 32 32 32 32 ]
    rotations_32x2 = numpy.array([[13, 15, 26, 6, 17, 29, 16, 24]]).T

    # Lookup table keyed by (bitness, counter_words)
    ROTATION_CONSTANTS = {
        (64, 4): rotations_64x4,
        (64, 2): rotations_64x2,
        (32, 4): rotations_32x4,
        (32, 2): rotations_32x2,
    }

    # Taken from Skein
    PARITY_CONSTANTS = {
        64: numpy.uint64(0x1BD11BDAA9FC1A22),
        32: numpy.uint32(0x1BD11BDA),
    }

    assert 1 <= rounds <= 72

    if bitness == 32:
        raw_word_type = numpy.uint32
    else:
        raw_word_type = numpy.uint64
    word_dtype = dtypes.normalize_type(raw_word_type)

    # The key has as many words as the counter.
    key_words = counter_words
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("threefry")
    render_kwds = dict(
        word_dtype=word_dtype,
        word_ctype=dtypes.ctype(word_dtype),
        key_words=key_words,
        counter_words=counter_words,
        key_ctype=key_ctype,
        counter_ctype=counter_ctype,
        rounds=rounds,
        rotation_constants=ROTATION_CONSTANTS[(bitness, counter_words)],
        parity_constant=PARITY_CONSTANTS[bitness])
    module = Module(template_def, render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
def kspaceepanechnikov_filter(ksp, sigma):
    """Apply the Epanechnikov k-space filter on the GPU.

    Processing steps:

    1. Create a GPU thread and push the k-space matrix to the device.
    2. Compile the Epanechnikov weighting kernel and run it in place, one
       work item per voxel.
    3. Copy the filtered k-space back to the host.

    No FFT reconstruction or FFT shift is performed by this function; the
    returned array is still k-space data.

    :param ksp: 3-D complex k-space array.  The kernel is compiled for
        ``complex64`` elements -- NOTE(review): presumably ``ksp`` has to
        be ``complex64`` already; confirm against callers.
    :param sigma: sequence of three filter widths, ``sigma[n]`` being used
        for axis ``n`` of ``ksp``.
    :returns: the filtered k-space volume as a host array.
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32

    #api = cluda.ocl_api()
    api = any_api()
    thr = api.Thread.create()
    try:
        data_dev = thr.to_device(ksp)

        program = thr.compile("""
KERNEL void epan_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ulong x = get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  const ulong idx = x;

  // Normalised coordinate along axis 0
  ${ftype} i = (${ftype})((x / dim3) / dim2);
  i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1);

  // Normalised coordinate along axis 1.  The wrap test has to be >=,
  // otherwise the row with index dim2 is left unwrapped.
  ${ftype} j = (${ftype})(x / dim3);
  if((SIZE_T)j >= dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
  j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);

  // Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x), (double)dim3);
  ${ftype} k = (${ftype}) pre_k;
  k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} omega = (i*sigma[0]+j*sigma[1]+k*sigma[2]);
  ${ftype} omega3 = ((i*sigma[0])*(i*sigma[0])*(i*sigma[0])+(j*sigma[1])*(j*sigma[1])*(j*sigma[1])+(k*sigma[2])*(k*sigma[2])*(k*sigma[2]));
  // NOTE(review): omega3 is zero at the k-space centre (i = j = k = 0),
  // which makes the expression below 0/0 -- confirm the intended limit.
  ${ftype} weight = 0.423142 * fabs((4 * sin(omega) - 4 * omega * cos(omega)) / omega3);

  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight;
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2]),
            render_kwds=dict(ctype=dtypes.ctype(dtype),
                             ftype=dtypes.ctype(ftype)),
            fast_math=True)

        epan_kernel = program.epan_kernel
        # One work item per voxel; the kernel derives (i, j, k) from the
        # flat index, so the launch grid must be one-dimensional.
        epan_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2])

        # The device array is not callable; get() copies it to the host.
        result = data_dev.get()
    finally:
        # Release the GPU thread even when compilation or execution fails.
        thr.release()

    return result
def test_tempalloc(cluda_api, tempalloc_cls, pack):
    """
    Allocate a set of mutually dependent temporary arrays (twice, optionally
    packing the allocator in between), then write a distinct value into each
    array and assert that none of the arrays it is declared dependent on
    contains that value.
    """
    shape = (10000,)
    dtype = numpy.int32
    thr = cluda_api.Thread.create(
        temp_alloc={'cls': tempalloc_cls, 'pack_on_alloc': False})

    # Dependency graph for the test
    dependencies = {
        '_temp0': [],
        '_temp1': ['_temp9', '_temp8', '_temp3', '_temp5', '_temp4',
                   '_temp7', '_temp6', 'input'],
        '_temp10': ['output', '_temp7'],
        '_temp11': ['_temp7'],
        '_temp2': ['input'],
        '_temp3': ['_temp1', 'input'],
        '_temp4': ['_temp9', '_temp8', '_temp1', '_temp7', '_temp6'],
        '_temp5': ['_temp1'],
        '_temp6': ['_temp1', '_temp4'],
        '_temp7': ['_temp9', '_temp1', '_temp4', 'output', '_temp11',
                   '_temp10'],
        '_temp8': ['_temp1', '_temp4'],
        '_temp9': ['_temp1', '_temp4', '_temp7'],
        'input': ['_temp1', '_temp3', '_temp2'],
        'output': ['_temp10', '_temp7'],
    }

    program = thr.compile(
        """
    KERNEL void fill(GLOBAL_MEM ${ctype} *dest, ${ctype} val)
    {
      const SIZE_T i = get_global_id(0);
      dest[i] = val;
    }

    KERNEL void transfer(GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src)
    {
      const SIZE_T i = get_global_id(0);
      dest[i] = src[i];
    }
    """,
        render_kwds={'ctype': dtypes.ctype(dtype)})
    fill = program.fill
    transfer = program.transfer

    arrays = {}
    transfer_dest = thr.array(shape, dtype)
    ordered_names = sorted(dependencies.keys())

    def allocate_zeroed():
        # (Re)create every temporary array, passing along those of its
        # dependencies that already exist, and fill it with zeros.
        for arr_name in ordered_names:
            known_deps = [
                arrays[dep_name] for dep_name in dependencies[arr_name]
                if dep_name in arrays]
            arrays[arr_name] = thr.temp_array(
                shape, dtype, dependencies=known_deps)
            fill(arrays[arr_name], dtype(0), global_size=shape)

    # Allocate temporary arrays with dependencies
    allocate_zeroed()

    if pack:
        thr.temp_alloc.pack()

    # Second allocation pass, again filling the arrays with zeros
    allocate_zeroed()

    for number, arr_name in enumerate(ordered_names, start=1):
        val = dtype(number)
        fill(arrays[arr_name], val, global_size=shape)
        for dep_name in dependencies[arr_name]:
            # CUDA does not support get() for GPUArray with custom buffers,
            # So we need to transfer the data to a normal array first.
            transfer(transfer_dest, arrays[dep_name], global_size=shape)
            assert (transfer_dest.get() != val).all()
def test_tempalloc(cluda_api, tempalloc_cls, pack):
    """
    Exercise the temporary allocator: create the dependent temporary arrays
    twice (packing the allocator in between when ``pack`` is set), then fill
    each array with its own value and assert that this value does not show
    up in any array listed as its dependency.
    """
    shape = (10000, )
    dtype = numpy.int32
    alloc_params = dict(cls=tempalloc_cls, pack_on_alloc=False)
    thr = cluda_api.Thread.create(temp_alloc=alloc_params)

    # Dependency graph for the test
    dependencies = dict(
        _temp0=[],
        _temp1=['_temp9', '_temp8', '_temp3', '_temp5',
                '_temp4', '_temp7', '_temp6', 'input'],
        _temp10=['output', '_temp7'],
        _temp11=['_temp7'],
        _temp2=['input'],
        _temp3=['_temp1', 'input'],
        _temp4=['_temp9', '_temp8', '_temp1', '_temp7', '_temp6'],
        _temp5=['_temp1'],
        _temp6=['_temp1', '_temp4'],
        _temp7=['_temp9', '_temp1', '_temp4',
                'output', '_temp11', '_temp10'],
        _temp8=['_temp1', '_temp4'],
        _temp9=['_temp1', '_temp4', '_temp7'],
        input=['_temp1', '_temp3', '_temp2'],
        output=['_temp10', '_temp7'])

    ctype = dtypes.ctype(dtype)
    program = thr.compile("""
    KERNEL void fill(GLOBAL_MEM ${ctype} *dest, ${ctype} val)
    {
      const SIZE_T i = get_global_id(0);
      dest[i] = val;
    }

    KERNEL void transfer(GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src)
    {
      const SIZE_T i = get_global_id(0);
      dest[i] = src[i];
    }
    """, render_kwds=dict(ctype=ctype))
    fill, transfer = program.fill, program.transfer

    arrays = {}
    transfer_dest = thr.array(shape, dtype)
    names_in_order = sorted(dependencies.keys())

    # Two allocation passes; the allocator is packed after the first one
    # when requested.
    for pass_no in (0, 1):
        for key in names_in_order:
            existing = [arrays[d] for d in dependencies[key] if d in arrays]
            arrays[key] = thr.temp_array(shape, dtype, dependencies=existing)
            # Fill the freshly allocated array with zeros
            fill(arrays[key], dtype(0), global_size=shape)
        if pass_no == 0 and pack:
            thr.temp_alloc.pack()

    for position, key in enumerate(names_in_order):
        val = dtype(position + 1)
        fill(arrays[key], val, global_size=shape)
        for dep_key in dependencies[key]:
            # CUDA does not support get() for GPUArray with custom buffers,
            # So we need to transfer the data to a normal array first.
            transfer(transfer_dest, arrays[dep_key], global_size=shape)
            host_copy = transfer_dest.get()
            assert (host_copy != val).all()
def kspacegaussian_filter_CL2(ksp, sigma):
    """Gaussian-filter a k-space volume on the GPU (CUDA) and reconstruct it.

    Processing steps:

    1. Create a CUDA thread and push the k-space matrix to the device.
    2. Compile the Gaussian weighting kernel and run it in place, one
       work item per voxel.
    3. Run the FFT (called with ``inverse=0``) and then the FFT shift,
       both in place.
    4. Copy the result to the host, divide it by the number of voxels and
       reverse every axis.

    :param ksp: 3-D complex k-space array.  The kernel is compiled for
        ``complex64`` elements -- NOTE(review): presumably ``ksp`` has to
        be ``complex64`` already; confirm against callers.
    :param sigma: sequence of three filter widths, ``sigma[n]`` being used
        for axis ``n`` of ``ksp``.
    :returns: the filtered and reconstructed complex volume (host array).
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    FACTOR = 1.0  # extra scale applied to every weight inside the kernel

    #api = cluda.ocl_api()
    api = cuda_api()
    thr = api.Thread.create()
    try:
        data_dev = thr.to_device(ksp)

        program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ulong x = get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;
  const double TWOPISQ = 19.739208802178716; // 2*pi^2
  const ulong idx = x;

  // Normalised coordinate along axis 0
  ${ftype} i = (${ftype})((x / dim3) / dim2);
  i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1);

  // Normalised coordinate along axis 1.  The wrap test has to be >=,
  // otherwise the row with index dim2 is left unwrapped.
  ${ftype} j = (${ftype})(x / dim3);
  if((SIZE_T)j >= dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
  j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);

  //Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x) , (double) dim3);
  ${ftype} k = (${ftype}) pre_k;
  k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));

  dest[idx].x = src[idx].x * weight * factor;
  dest[idx].y = src[idx].y * weight * factor;
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
            render_kwds=dict(ctype=dtypes.ctype(dtype),
                             ftype=dtypes.ctype(ftype)),
            fast_math=True)

        gauss_kernel = program.gauss_kernel
        # The kernel derives (i, j, k) from get_global_id(0) alone, so it
        # has to be launched on a flat grid with one work item per voxel;
        # a 3-D grid would only ever touch a prefix of the volume.
        gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2])
        thr.synchronize()

        # Recon
        cifft = FFT(data_dev).compile(thr)
        cfftshift = FFTShift(data_dev).compile(thr)
        cifft(data_dev, data_dev, inverse=0)
        thr.synchronize()
        cfftshift(data_dev, data_dev)
        thr.synchronize()

        result2 = data_dev.get() / np.prod(np.array(ksp.shape))
    finally:
        # Release the GPU thread even when compilation or execution fails.
        thr.release()

    return result2[::-1, ::-1, ::-1]
def threefry(bitness, counter_words, rounds=20):
    """
    Construct the Threefry CBRNG, which relies on a big number of fast
    rounds (bit rotations).

    :param bitness: ``32`` or ``64``; size of the generated random integers.
    :param counter_words: ``2`` or ``4``; number of integers generated in one go.
    :param rounds: ``1`` to ``72``; more rounds mean better randomness.
        Default values are big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """

    ROTATION_CONSTANTS = {}

    # These are the R_256 constants from the Threefish reference sources
    # with names changed to R_64x4...
    ROTATION_CONSTANTS[(64, 4)] = numpy.array(
        [[14, 52, 23, 5, 25, 46, 58, 32],
         [16, 57, 40, 37, 33, 12, 22, 32]]).T

    # Output from skein_rot_search: (srs64_B64-X1000)
    # Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
    # Start: Tue Mar 1 10:07:48 2011
    # rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
    #  4 rounds: minHW =  4  [  4  4  4  4 ]
    #  5 rounds: minHW =  8  [  8  8  8  8 ]
    #  6 rounds: minHW = 16  [ 16 16 16 16 ]
    #  7 rounds: minHW = 32  [ 32 32 32 32 ]
    #  8 rounds: minHW = 64  [ 64 64 64 64 ]
    #  9 rounds: minHW = 64  [ 64 64 64 64 ]
    # 10 rounds: minHW = 64  [ 64 64 64 64 ]
    # 11 rounds: minHW = 64  [ 64 64 64 64 ]
    ROTATION_CONSTANTS[(64, 2)] = numpy.array(
        [[16, 42, 12, 31, 16, 32, 24, 21]]).T

    # Output from skein_rot_search: (srs-B128-X5000.out)
    # Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
    # Start: Mon Aug 24 22:41:36 2009
    # ...
    # rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
    #  4 rounds: minHW =  3  [  3  3  3  3 ]
    #  5 rounds: minHW =  7  [  7  7  7  7 ]
    #  6 rounds: minHW = 12  [ 13 12 13 12 ]
    #  7 rounds: minHW = 22  [ 22 23 22 23 ]
    #  8 rounds: minHW = 31  [ 31 31 31 31 ]
    #  9 rounds: minHW = 32  [ 32 32 32 32 ]
    # 10 rounds: minHW = 32  [ 32 32 32 32 ]
    # 11 rounds: minHW = 32  [ 32 32 32 32 ]
    ROTATION_CONSTANTS[(32, 4)] = numpy.array(
        [[10, 11, 13, 23, 6, 17, 25, 18],
         [26, 21, 27, 5, 20, 11, 10, 20]]).T

    # Output from skein_rot_search (srs32x2-X5000.out)
    # Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
    # Start: Tue Jul 12 11:11:33 2011
    # rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format
    #  4 rounds: minHW =  4  [  4  4  4  4 ]
    #  5 rounds: minHW =  6  [  6  8  6  8 ]
    #  6 rounds: minHW =  9  [  9 12  9 12 ]
    #  7 rounds: minHW = 16  [ 16 24 16 24 ]
    #  8 rounds: minHW = 32  [ 32 32 32 32 ]
    #  9 rounds: minHW = 32  [ 32 32 32 32 ]
    # 10 rounds: minHW = 32  [ 32 32 32 32 ]
    # 11 rounds: minHW = 32  [ 32 32 32 32 ]
    ROTATION_CONSTANTS[(32, 2)] = numpy.array(
        [[13, 15, 26, 6, 17, 29, 16, 24]]).T

    # Taken from Skein
    PARITY_CONSTANTS = {
        64: numpy.uint64(0x1BD11BDAA9FC1A22),
        32: numpy.uint32(0x1BD11BDA),
    }

    assert 1 <= rounds <= 72

    unnormalized = numpy.uint32 if bitness == 32 else numpy.uint64
    word_dtype = dtypes.normalize_type(unnormalized)

    # Keys are made of the same number of words as counters.
    key_words = counter_words
    key_dtype, key_ctype, counter_dtype, counter_ctype = create_struct_types(
        word_dtype, key_words, counter_words)

    threefry_def = TEMPLATE.get_def("threefry")
    kwds = {
        'word_dtype': word_dtype,
        'word_ctype': dtypes.ctype(word_dtype),
        'key_words': key_words,
        'counter_words': counter_words,
        'key_ctype': key_ctype,
        'counter_ctype': counter_ctype,
        'rounds': rounds,
        'rotation_constants': ROTATION_CONSTANTS[(bitness, counter_words)],
        'parity_constant': PARITY_CONSTANTS[bitness],
    }
    module = Module(threefry_def, render_kwds=kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
def get_ff_elem():
    """
    Return a ``FiniteFieldElement`` wrapping a module created from the
    ``ff_elem_def`` template definition, rendered with the C type name
    of ``numpy.uint64`` passed as ``u64``.
    """
    template_def = TEMPLATE.get_def('ff_elem_def')
    u64_ctype = dtypes.ctype(numpy.uint64)
    ff_module = Module(template_def, render_kwds={'u64': u64_ctype})
    return FiniteFieldElement(ff_module)