示例#1
0
文件: fft.py 项目: xbee/nufhe
    def __init__(self, module, use_constant_memory):
        """
        Keep the rendered module and precompute the transform's constant tables.

        :param module: stored as ``self.module`` without modification.
        :param use_constant_memory: stored as ``self.use_constant_memory``.

        Besides the two arguments, the instance records fixed lengths and
        dtypes (transform, polynomial and temporary storage) and the constant
        arrays ``cdata_fw`` and ``cdata_inv`` together with their C types.
        """
        self.module = module
        self.use_constant_memory = use_constant_memory

        # Transform elements.
        self.transform_length = 512
        self.elem_dtype = numpy.dtype('complex128')
        self.elem_ctype = dtypes.ctype(self.elem_dtype)

        # Polynomial coefficients.
        self.polynomial_length = 1024
        self.polynomial_dtype = numpy.int32
        self.polynomial_ctype = dtypes.ctype(self.polynomial_dtype)

        # Threads and temporary storage.
        self.threads_per_transform = 64
        self.temp_dtype = numpy.dtype('float64')
        self.temp_ctype = dtypes.ctype(self.temp_dtype)
        self.temp_length = 576

        # Twiddle tables: entry [row, col] is
        # exp(-/+ 2j * pi / transform_length * row * col).
        table_shape = (8, 64)
        forward_step = -2j * numpy.pi / self.transform_length
        inverse_step = 2j * numpy.pi / self.transform_length
        forward_twiddles = numpy.empty(table_shape, numpy.complex128)
        inverse_twiddles = numpy.empty(table_shape, numpy.complex128)
        for row in range(table_shape[0]):
            for col in range(table_shape[1]):
                forward_twiddles[row, col] = numpy.exp(forward_step * row * col)
                inverse_twiddles[row, col] = numpy.exp(inverse_step * row * col)

        half_indices = numpy.arange(self.polynomial_length // 2)
        coeffs = numpy.exp(
            -2j * numpy.pi * half_indices / self.polynomial_length / 2)

        # Each constant array is the flattened twiddle table followed by the
        # coefficients; the inverse coefficients are additionally divided by
        # the transform length.
        self.cdata_fw = numpy.concatenate([forward_twiddles.flatten(), coeffs])
        self.cdata_inv = numpy.concatenate(
            [inverse_twiddles.flatten(), coeffs / self.transform_length])
        self.cdata_fw_ctype = dtypes.ctype(self.cdata_fw.dtype)
        self.cdata_inv_ctype = dtypes.ctype(self.cdata_inv.dtype)
示例#2
0
def nonlinear_no_potential(dtype, U, nu):
    """
    Create a module with the nonlinear term for two components, without potential.

    The template renders one function per component ``comp`` in ``(0, 1)``.
    Each takes ``psi0``, ``psi1`` and ``t`` and returns
    ``mul(psi<comp>, U[comp, 0] * norm(psi0) + U[comp, 1] * norm(psi1))
    - mul(psi<1 - comp>, nu)``.

    :param dtype: complex dtype of the ``psi`` arguments; the matching real
        dtype is used for ``t`` and ``nu``.
    :param U: object indexed as ``U[comp, other]`` inside the template.
    :param nu: coefficient rendered as a C constant of the real dtype.
    :returns: the result of ``Module.create``.
    """
    complex_ctype = dtypes.ctype(dtype)
    real_dtype = dtypes.real_for(dtype)
    real_ctype = dtypes.ctype(real_dtype)

    source = """
        %for comp in (0, 1):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            ${c_ctype} psi0, ${c_ctype} psi1, ${s_ctype} t)
        {
            return (
                ${mul}(psi${comp}, (
                    ${dtypes.c_constant(U[comp, 0])} * ${norm}(psi0) +
                    ${dtypes.c_constant(U[comp, 1])} * ${norm}(psi1)
                    ))
                - ${mul}(psi${1 - comp}, ${nu})
                );
        }
        %endfor
        """
    kwds = {
        'mul': functions.mul(dtype, real_dtype),
        'norm': functions.norm(dtype),
        'U': U,
        'nu': dtypes.c_constant(nu, real_dtype),
        'c_ctype': complex_ctype,
        's_ctype': real_ctype,
    }
    return Module.create(source, render_kwds=kwds)
def get_nonlinear_wrapper(components, c_dtype, nonlinear_module, dt):
    """
    Wrap a nonlinear module so that its result is multiplied by ``(0, -dt)``.

    For every component the rendered function accepts ``components`` values
    ``psi<k>`` followed by ``V`` and ``t``, forwards them to the corresponding
    function of ``nonlinear_module`` and multiplies the result by the complex
    constant ``COMPLEX_CTR(c_ctype)(0, -dt)``.

    :param components: number of components (and of ``psi`` arguments).
    :param c_dtype: complex dtype of the ``psi`` values.
    :param nonlinear_module: module whose functions are called as ``<module><comp>``.
    :param dt: value rendered as a C constant of the real dtype.
    :returns: the result of ``Module.create``.
    """
    real_dtype = dtypes.real_for(c_dtype)
    source = """
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            %for pcomp in range(components):
            ${c_ctype} psi${pcomp},
            %endfor
            ${s_ctype} V, ${s_ctype} t)
        {
            ${c_ctype} nonlinear = ${nonlinear}${comp}(
                %for pcomp in range(components):
                psi${pcomp},
                %endfor
                V, t);
            return ${mul}(
                COMPLEX_CTR(${c_ctype})(0, -${dt}),
                nonlinear);
        }
        %endfor
        """
    kwds = {
        'components': components,
        'c_ctype': dtypes.ctype(c_dtype),
        's_ctype': dtypes.ctype(real_dtype),
        'mul': functions.mul(c_dtype, c_dtype),
        'dt': dtypes.c_constant(dt, real_dtype),
        'nonlinear': nonlinear_module,
    }
    return Module.create(source, render_kwds=kwds)
示例#4
0
def kspacegaussian_filter_CL(ksp, sigma):
    """
    Multiply 3-D k-space data by a Gaussian weight on an OpenCL device.

    Every element is scaled by
    ``exp(-2 * pi**2 * sum_d (sigma[d] * f_d)**2)``, where ``f_d`` is the
    element's index along axis ``d`` shifted by ``floor(dim_d / 2)`` and
    divided by ``dim_d``.

    :param ksp: 3-D array of k-space samples. It is converted to a contiguous
        ``complex64`` array before upload, because the kernel is compiled for
        that element type.
    :param sigma: sequence with three widths, one per axis.
    :returns: the weighted data as a ``complex64`` array shaped like ``ksp``.
    """
    from reikna import cluda
    from reikna.cluda import functions, dtypes
    sz = np.array(ksp.shape)
    dtype = np.complex64
    ftype = np.float32
    api = cluda.ocl_api()
    thr = api.Thread.create()
    FACTOR = 1.0
    # The dimensions and sigmas are baked into the source with %-formatting
    # first; the ${...} placeholders are filled in by the template renderer.
    program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ${ultype} x = (${ultype}) get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;
  const double TWOPISQ = 19.739208802178716; //6.283185307179586;  //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ${ultype} idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
      i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
      // ">=" so that a row index equal to dim2 is wrapped back to 0 as well
      if((SIZE_T)j >= dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
      j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  //Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x) , (double) dim3);
  ${ftype} k = (${ftype}) pre_k;
      k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight;

}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
        render_kwds=dict(ctype=dtypes.ctype(dtype),
                         ftype=dtypes.ctype(ftype), ultype=dtypes.ctype(np.uint64),
                         exp=functions.exp(ftype)), fast_math=True)

    gauss_kernel = program.gauss_kernel
    # Upload the actual samples; an empty device buffer would make the kernel
    # weight uninitialized memory instead of ``ksp``.
    data_dev = thr.to_device(np.ascontiguousarray(ksp, dtype=dtype))
    # The kernel works in place: each work item reads and writes its own element.
    gauss_kernel(data_dev, data_dev, global_size=int(sz[0] * sz[1] * sz[2]))
    return data_dev.get()
示例#5
0
def get_nonlinear_wrapper(state_dtype, grid_dims, drift, diffusion=None):
    """
    Create a module combining the drift term with optional diffusion terms.

    For each of ``drift.components`` components the rendered function takes
    ``grid_dims`` integer indices, one ``psi_<k>`` value per component, one
    ``dW_<n>`` value per noise source (only when ``diffusion`` is given),
    and finally ``t`` and ``dt``. It returns the drift function's value
    multiplied by ``dt`` plus, for every noise source, the diffusion
    function's value multiplied by the corresponding ``dW``.

    :param state_dtype: dtype of the ``psi`` values and of the result.
    :param grid_dims: number of index arguments.
    :param drift: object providing ``components`` and ``module``.
    :param diffusion: ``None``, or an object providing ``dtype``,
        ``noise_sources`` and ``module``.
    :returns: the result of ``Module.create``.
    """
    real_dtype = dtypes.real_for(state_dtype)
    # Without diffusion the noise type falls back to the real type.
    noise_dtype = real_dtype if diffusion is None else diffusion.dtype

    source = """
        <%
            components = drift.components
            idx_args = ["idx_" + str(dim) for dim in range(grid_dims)]
            psi_args = ["psi_" + str(comp) for comp in range(components)]
            if diffusion is not None:
                dW_args = ["dW_" + str(ncomp) for ncomp in range(diffusion.noise_sources)]
        %>
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${s_ctype} ${prefix}${comp}(
            %for idx in idx_args:
            const int ${idx},
            %endfor
            %for psi in psi_args:
            const ${s_ctype} ${psi},
            %endfor
            %if diffusion is not None:
            %for dW in dW_args:
            const ${n_ctype} ${dW},
            %endfor
            %endif
            const ${r_ctype} t,
            const ${r_ctype} dt)
        {
            return
                ${mul_sr}(${drift.module}${comp}(
                    ${", ".join(idx_args)}, ${", ".join(psi_args)}, t), dt)
                %if diffusion is not None:
                %for ncomp in range(diffusion.noise_sources):
                + ${mul_sn}(${diffusion.module}${comp}_${ncomp}(
                    ${", ".join(idx_args)}, ${", ".join(psi_args)}, t), ${dW_args[ncomp]})
                %endfor
                %endif
                ;
        }
        %endfor
        """
    kwds = {
        'grid_dims': grid_dims,
        's_ctype': dtypes.ctype(state_dtype),
        'r_ctype': dtypes.ctype(real_dtype),
        'n_ctype': dtypes.ctype(noise_dtype),
        'mul_sr': functions.mul(state_dtype, real_dtype),
        'mul_sn': functions.mul(state_dtype, noise_dtype),
        'drift': drift,
        'diffusion': diffusion,
    }
    return Module.create(source, render_kwds=kwds)
示例#6
0
文件: fft.py 项目: xbee/nufhe
def fft512(use_constant_memory=False):
    """
    Build the ``fft512`` module and wrap it in an ``FFT512`` object.

    :param use_constant_memory: passed both to the template and to ``FFT512``.
    :returns: ``FFT512(module, use_constant_memory)``.
    """
    complex_dtype = numpy.complex128
    real_dtype = numpy.float64

    template_def = TEMPLATE.get_def('fft512')
    kwds = {
        'elem_ctype': dtypes.ctype(complex_dtype),
        'temp_ctype': dtypes.ctype(real_dtype),
        'cdata_ctype': dtypes.ctype(complex_dtype),
        'polar_unit': functions.polar_unit(real_dtype),
        'mul': functions.mul(complex_dtype, complex_dtype),
        'use_constant_memory': use_constant_memory,
    }
    return FFT512(Module(template_def, render_kwds=kwds), use_constant_memory)
def get_nonlinear(dtype, interaction, tunneling):
    r"""
    Create the nonlinear module

    .. math::

        N(\psi_1, ... \psi_C)
        = \sum_{n=1}^{C} U_{jn} |\psi_n|^2 \psi_j
          - \nu_j \psi_{m_j}

    In the rendered code the potential argument ``V`` is also added to the
    sum that multiplies :math:`\psi_j`.

    ``dtype``: complex dtype of the ``psi`` values.
    ``interaction``: a symmetrical ``components x components`` array with interaction strengths.
    ``tunneling``: a list of (other_comp, coeff) pairs of tunnelling strengths.
    """
    complex_ctype = dtypes.ctype(dtype)
    real_dtype = dtypes.real_for(dtype)
    real_ctype = dtypes.ctype(real_dtype)

    source = """
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            %for pcomp in range(components):
            ${c_ctype} psi${pcomp},
            %endfor
            ${s_ctype} V, ${s_ctype} t)
        {
            return (
                ${mul}(psi${comp}, (
                    %for other_comp in range(components):
                    + ${dtypes.c_constant(interaction[comp, other_comp], s_dtype)} *
                        ${norm}(psi${other_comp})
                    %endfor
                    + V
                    ))
                - ${mul}(
                    psi${tunneling[comp][0]},
                    ${dtypes.c_constant(tunneling[comp][1], s_dtype)})
                );
        }
        %endfor
        """
    kwds = {
        # The number of components is taken from the interaction matrix.
        'components': interaction.shape[0],
        'mul': functions.mul(dtype, real_dtype),
        'norm': functions.norm(dtype),
        'interaction': interaction,
        'tunneling': tunneling,
        's_dtype': real_dtype,
        'c_ctype': complex_ctype,
        's_ctype': real_ctype,
    }
    return Module.create(source, render_kwds=kwds)
示例#8
0
def uniform_integer(bijection, dtype, low, high=None):
    """
    Generates uniformly distributed integer numbers in the interval ``[low, high)``.
    If ``high`` is ``None``, the bounds are replaced by ``0`` and ``low + 1``,
    so the given ``low`` is the largest value of the interval.
    If ``high`` is given, ``low < high - 1`` is required.
    Supported dtypes: any numpy integers.
    The returned sampler is marked deterministic when the range of the raw
    unsigned integer type is divisible by the size of the interval.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    # NOTE(review): the single-argument form yields ``[0, low]``, not ``[0, low)``
    # -- confirm which one is intended.
    if high is not None:
        assert low < high - 1
    else:
        low, high = 0, low + 1

    dtype = dtypes.normalize_type(dtype)
    ctype = dtypes.ctype(dtype)

    # Both bounds have to fit into the requested dtype.
    bits = dtype.itemsize * 8
    if dtype.kind == 'i':
        min_allowed, limit = -2**(bits - 1), 2**(bits - 1)
    else:
        min_allowed, limit = 0, 2**bits
    assert low >= min_allowed
    assert high < limit

    num = high - low
    # Intervals of up to 2**32 values are served from 32-bit raw integers.
    raw_dtype = numpy.dtype('uint32' if num <= 2**32 else 'uint64')

    raw_func = bijection.raw_functions[raw_dtype]
    max_num = 2**(raw_dtype.itemsize * 8)
    raw_ctype = dtypes.ctype(dtypes.normalize_type(raw_dtype))

    template_def = TEMPLATE.get_def("uniform_integer")
    kwds = {
        'bijection': bijection,
        'dtype': dtype,
        'ctype': ctype,
        'raw_ctype': raw_ctype,
        'raw_func': raw_func,
        'max_num': max_num,
        'num': num,
        'low': low,
    }
    module = Module(template_def, render_kwds=kwds)

    is_deterministic = (max_num % num == 0)
    return Sampler(bijection, module, dtype, deterministic=is_deterministic)
示例#9
0
def test_dtype_support(thr, dtype):
    """
    Compile and run a multiply-then-divide kernel for ``dtype``.

    The test is skipped when the thread's device reports that it does not
    support ``dtype``. Otherwise the kernel computes ``(a * b) / b`` element
    by element and the result is compared with ``a``.
    """
    if not thr.device_params.supports_dtype(dtype):
        pytest.skip()

    length = 256

    source = """
    KERNEL void test(
        GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b)
    {
      const SIZE_T i = get_global_id(0);
      ${ctype} temp = ${mul}(a[i], b[i]);
      dest[i] = ${div}(temp, b[i]);
    }
    """
    multiply = functions.mul(dtype, dtype)
    divide = functions.div(dtype, dtype)
    kwds = {
        'ctype': dtypes.ctype(dtype),
        'dtype': dtype,
        'mul': multiply,
        'div': divide,
    }
    kernel = thr.compile(source, render_kwds=kwds).test

    # we need results to fit even in unsigned char
    numerators = get_test_array(length, dtype, high=8)
    denominators = get_test_array(length, dtype, no_zeros=True, high=8)

    numerators_dev = thr.to_device(numerators)
    denominators_dev = thr.to_device(denominators)
    output_dev = thr.empty_like(numerators_dev)
    kernel(output_dev, numerators_dev, denominators_dev, global_size=length)

    assert diff_is_negligible(thr.from_device(output_dev), numerators)
示例#10
0
def gamma(bijection, dtype, shape=1, scale=1):
    """
    Generates random numbers from the gamma distribution

    .. math::
      P(x) = x^{k-1} \\frac{e^{-x/\\theta}}{\\theta^k \\Gamma(k)},

    where :math:`k` is ``shape``, and :math:`\\theta` is ``scale``.
    Supported dtypes: ``float(32/64)``.
    The module is rendered with a uniform sampler on ``[0, 1)`` and a
    Box-Muller normal sampler with zero mean and unit deviation.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    ctype = dtypes.ctype(dtype)
    uniform_sampler = uniform_float(bijection, dtype, low=0, high=1)
    normal_sampler = normal_bm(bijection, dtype, mean=0, std=1)

    template_def = TEMPLATE.get_def("gamma")
    kwds = {
        'dtype': dtype,
        'ctype': ctype,
        'bijection': bijection,
        'shape': shape,
        # ``scale`` is passed as a C constant, ``shape`` as the raw value.
        'scale': dtypes.c_constant(scale, dtype),
        'uf': uniform_sampler,
        'nbm': normal_sampler,
    }
    return Sampler(bijection, Module(template_def, render_kwds=kwds), dtype)
示例#11
0
文件: ntt.py 项目: stjordanis/nufhe
    def __init__(self, ff_elem, module, use_constant_memory):
        """
        Keep the rendered module and set up the transform's constant data.

        :param ff_elem: stored as ``self.ff``; its ``module`` attribute is used
            as the C type of elements, temporaries and constant data.
        :param module: stored as ``self.module``.
        :param use_constant_memory: stored as ``self.use_constant_memory``.
        """
        self.ff = ff_elem
        self.module = module
        self.use_constant_memory = use_constant_memory

        unsigned_64 = numpy.dtype('uint64')

        # Transform elements.
        self.transform_length = 1024
        self.elem_dtype = unsigned_64
        self.elem_ctype = ff_elem.module

        # Polynomial coefficients.
        self.polynomial_length = 1024
        self.polynomial_dtype = numpy.dtype('int32')
        self.polynomial_ctype = dtypes.ctype(self.polynomial_dtype)

        self.threads_per_transform = 128

        # Temporary storage.
        self.temp_dtype = numpy.dtype('uint64')
        self.temp_ctype = ff_elem.module
        self.temp_length = 1024

        # Constant data: the twiddle array followed by the "sqrt" twiddle
        # array, separately for the forward and the inverse direction.
        forward, inverse, forward_sqrt, inverse_sqrt = gen_twiddle_ref()
        self.cdata_fw = numpy.concatenate([forward, forward_sqrt])
        self.cdata_inv = numpy.concatenate([inverse, inverse_sqrt])
        self.cdata_fw_ctype = ff_elem.module
        self.cdata_inv_ctype = ff_elem.module
示例#12
0
def test_dtype_support(thr, dtype):
    """
    Check that a kernel operating on ``dtype`` compiles and gives sane results.

    Skipped when the device does not support ``dtype``; otherwise the kernel
    evaluates ``(a * b) / b`` per element and the output must match ``a``.
    """
    if not thr.device_params.supports_dtype(dtype):
        pytest.skip()

    element_count = 256

    kernel_source = """
    KERNEL void test(
        GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b)
    {
      const SIZE_T i = get_global_id(0);
      ${ctype} temp = ${mul}(a[i], b[i]);
      dest[i] = ${div}(temp, b[i]);
    }
    """
    mul_module = functions.mul(dtype, dtype)
    div_module = functions.div(dtype, dtype)
    render_kwds = {
        'ctype': dtypes.ctype(dtype),
        'dtype': dtype,
        'mul': mul_module,
        'div': div_module,
    }
    compiled = thr.compile(kernel_source, render_kwds=render_kwds)
    run_test = compiled.test

    # we need results to fit even in unsigned char
    lhs = get_test_array(element_count, dtype, high=8)
    rhs = get_test_array(element_count, dtype, no_zeros=True, high=8)

    lhs_dev = thr.to_device(lhs)
    rhs_dev = thr.to_device(rhs)
    result_dev = thr.empty_like(lhs_dev)
    run_test(result_dev, lhs_dev, rhs_dev, global_size=element_count)

    assert diff_is_negligible(thr.from_device(result_dev), lhs)
示例#13
0
def vonmises(bijection, dtype, mu=0, kappa=1):
    """
    Generates random numbers from the von Mises distribution

    .. math::
      P(x) = \\frac{\\exp(\\kappa \\cos(x - \\mu))}{2 \\pi I_0(\\kappa)},

    where :math:`\\mu` is the mode, :math:`\\kappa` is the dispersion,
    and :math:`I_0` is the modified Bessel function of the first kind.
    Supported dtypes: ``float(32/64)``.
    The module is rendered with a uniform sampler on ``[0, 1)``.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    ctype = dtypes.ctype(dtype)
    uniform_sampler = uniform_float(bijection, dtype, low=0, high=1)

    template_def = TEMPLATE.get_def("vonmises")
    kwds = {
        'dtype': dtype,
        'ctype': ctype,
        'bijection': bijection,
        # ``mu`` is passed as a C constant, ``kappa`` as the raw value.
        'mu': dtypes.c_constant(mu, dtype),
        'kappa': kappa,
        'uf': uniform_sampler,
    }
    return Sampler(bijection, Module(template_def, render_kwds=kwds), dtype)
示例#14
0
def uniform_float(bijection, dtype, low=0, high=1):
    """
    Generates uniformly distributed floating-points numbers in the interval ``[low, high)``.
    Requires ``low < high``.
    Supported dtypes: ``float(32/64)``.
    The returned sampler is always marked deterministic.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    assert low < high

    ctype = dtypes.ctype(dtype)

    # 64-bit raw integers for double precision, 32-bit ones otherwise.
    if dtypes.is_double(dtype):
        bitness = 64
    else:
        bitness = 32

    kwds = {
        'bijection': bijection,
        'ctype': ctype,
        'raw_func': 'get_raw_uint' + str(bitness),
        'raw_max': dtypes.c_constant(2 ** bitness, dtype),
        'size': dtypes.c_constant(high - low, dtype),
        'low': dtypes.c_constant(low, dtype),
    }
    module = Module(TEMPLATE.get_def("uniform_float"), render_kwds=kwds)

    return Sampler(bijection, module, dtype, deterministic=True)
示例#15
0
def uniform_float(bijection, dtype, low=0, high=1):
    """
    Generates uniformly distributed floating-points numbers in the interval ``[low, high)``.
    ``low`` must be smaller than ``high``.
    Supported dtypes: ``float(32/64)``.
    The sampler is created with ``deterministic=True``.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    assert low < high

    ctype = dtypes.ctype(dtype)

    # The width of the raw random integers follows the precision of ``dtype``.
    bitness = 32
    if dtypes.is_double(dtype):
        bitness = 64
    raw_func_name = 'get_raw_uint' + str(bitness)
    raw_max_constant = dtypes.c_constant(2**bitness, dtype)

    # The template receives the width of the interval and its lower bound
    # as C constants.
    size_constant = dtypes.c_constant(high - low, dtype)
    low_constant = dtypes.c_constant(low, dtype)

    template_def = TEMPLATE.get_def("uniform_float")
    kwds = {
        'bijection': bijection,
        'ctype': ctype,
        'raw_func': raw_func_name,
        'raw_max': raw_max_constant,
        'size': size_constant,
        'low': low_constant,
    }
    return Sampler(
        bijection, Module(template_def, render_kwds=kwds), dtype, deterministic=True)
示例#16
0
def uniform_integer(bijection, dtype, low, high=None):
    """
    Generates uniformly distributed integer numbers in the interval ``[low, high)``.
    If ``high`` is ``None``, the bounds are replaced by ``0`` and ``low + 1``,
    so the given ``low`` is the largest value of the interval.
    If ``high`` is given, ``low < high - 1`` is required.
    Supported dtypes: any numpy integers.
    The returned sampler is marked deterministic when the range of the raw
    unsigned integer type is divisible by the size of the interval.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    # NOTE(review): the single-argument form yields ``[0, low]``, not ``[0, low)``
    # -- confirm which one is intended.
    if high is not None:
        assert low < high - 1
    else:
        low, high = 0, low + 1

    dtype = dtypes.normalize_type(dtype)
    ctype = dtypes.ctype(dtype)

    # Range check against the limits of the requested dtype.
    total_bits = dtype.itemsize * 8
    signed = dtype.kind == 'i'
    value_bits = total_bits - 1 if signed else total_bits
    assert low >= (-2 ** value_bits if signed else 0)
    assert high < 2 ** value_bits

    num = high - low
    # Pick the narrowest raw integer type that covers the interval size.
    if num > 2 ** 32:
        raw_dtype = numpy.dtype('uint64')
    else:
        raw_dtype = numpy.dtype('uint32')

    raw_func = bijection.raw_functions[raw_dtype]
    max_num = 2 ** (raw_dtype.itemsize * 8)
    raw_ctype = dtypes.ctype(dtypes.normalize_type(raw_dtype))

    template_def = TEMPLATE.get_def("uniform_integer")
    kwds = {
        'bijection': bijection,
        'dtype': dtype,
        'ctype': ctype,
        'raw_ctype': raw_ctype,
        'raw_func': raw_func,
        'max_num': max_num,
        'num': num,
        'low': low,
    }
    module = Module(template_def, render_kwds=kwds)

    return Sampler(bijection, module, dtype, deterministic=(max_num % num == 0))
示例#17
0
def get_nonlinear_wrapper(c_dtype, nonlinear_module, dt):
    """
    Wrap a two-component nonlinear module, multiplying its result by ``(0, -dt)``.

    For ``comp`` in ``(0, 1)`` the rendered function takes ``psi0``, ``psi1``
    and ``t``, calls ``<nonlinear_module><comp>(psi0, psi1, t)`` and multiplies
    the result by the complex constant ``COMPLEX_CTR(c_ctype)(0, -dt)``.

    :param c_dtype: complex dtype of the ``psi`` values.
    :param nonlinear_module: module providing the wrapped functions.
    :param dt: value rendered as a C constant of the real dtype.
    :returns: the result of ``Module.create``.
    """
    real_dtype = dtypes.real_for(c_dtype)
    source = """
        %for comp in (0, 1):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            ${c_ctype} psi0, ${c_ctype} psi1, ${s_ctype} t)
        {
            ${c_ctype} nonlinear = ${nonlinear}${comp}(psi0, psi1, t);
            return ${mul}(
                COMPLEX_CTR(${c_ctype})(0, -${dt}),
                nonlinear);
        }
        %endfor
        """
    kwds = {
        'c_ctype': dtypes.ctype(c_dtype),
        's_ctype': dtypes.ctype(real_dtype),
        'mul': functions.mul(c_dtype, c_dtype),
        'dt': dtypes.c_constant(dt, real_dtype),
        'nonlinear': nonlinear_module,
    }
    return Module.create(source, render_kwds=kwds)
示例#18
0
def normal_bm(bijection, dtype, mean=0, std=1):
    """
    Generates normally distributed random numbers with the mean ``mean`` and
    the standard deviation ``std`` using Box-Muller transform.
    Supported dtypes: ``float(32/64)``, ``complex(64/128)``.
    Produces two random numbers per call for real types and one number for complex types.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.

    .. note::

        In case of a complex ``dtype``, ``std`` refers to the standard deviation of the
        complex numbers (same as ``numpy.std()`` returns), not real and imaginary components
        (which will be normally distributed with the standard deviation ``std / sqrt(2)``).
        Consequently, while ``mean`` is of type ``dtype``, ``std`` must be real.
    """
    complex_res = dtypes.is_complex(dtype)

    # Derive the matching real/complex counterpart of ``dtype``.
    if complex_res:
        c_dtype = dtype
        r_dtype = dtypes.real_for(dtype)
    else:
        r_dtype = dtype
        c_dtype = dtypes.complex_for(dtype)

    # The uniform source always works with the real dtype.
    uniform_sampler = uniform_float(bijection, r_dtype, low=0, high=1)

    template_def = TEMPLATE.get_def("normal_bm")
    kwds = {
        'complex_res': complex_res,
        'r_dtype': r_dtype,
        'r_ctype': dtypes.ctype(r_dtype),
        'c_dtype': c_dtype,
        'c_ctype': dtypes.ctype(c_dtype),
        'polar_unit': functions.polar_unit(r_dtype),
        'bijection': bijection,
        'mean': mean,
        'std': std,
        'uf': uniform_sampler,
    }
    module = Module(template_def, render_kwds=kwds)

    randoms_per_call = 1 if complex_res else 2
    return Sampler(bijection,
                   module,
                   dtype,
                   deterministic=uniform_sampler.deterministic,
                   randoms_per_call=randoms_per_call)
示例#19
0
def normal_bm(bijection, dtype, mean=0, std=1):
    """
    Generates normally distributed random numbers with the mean ``mean`` and
    the standard deviation ``std`` using Box-Muller transform.
    Supported dtypes: ``float(32/64)``, ``complex(64/128)``.
    Produces two random numbers per call for real types and one number for complex types.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.

    .. note::

        In case of a complex ``dtype``, ``std`` refers to the standard deviation of the
        complex numbers (same as ``numpy.std()`` returns), not real and imaginary components
        (which will be normally distributed with the standard deviation ``std / sqrt(2)``).
        Consequently, while ``mean`` is of type ``dtype``, ``std`` must be real.
    """
    wants_complex = dtypes.is_complex(dtype)

    # Start from the real-valued case and override for complex requests.
    r_dtype = dtype
    if wants_complex:
        r_dtype = dtypes.real_for(dtype)
        c_dtype = dtype
    else:
        c_dtype = dtypes.complex_for(dtype)

    uf = uniform_float(bijection, r_dtype, low=0, high=1)

    template_def = TEMPLATE.get_def("normal_bm")
    kwds = {
        'complex_res': wants_complex,
        'r_dtype': r_dtype,
        'r_ctype': dtypes.ctype(r_dtype),
        'c_dtype': c_dtype,
        'c_ctype': dtypes.ctype(c_dtype),
        'polar_unit': functions.polar_unit(r_dtype),
        'bijection': bijection,
        'mean': mean,
        'std': std,
        'uf': uf,
    }
    module = Module(template_def, render_kwds=kwds)

    # One complex value per call, or two real values per call.
    if wants_complex:
        per_call = 1
    else:
        per_call = 2
    return Sampler(
        bijection, module, dtype,
        deterministic=uf.deterministic, randoms_per_call=per_call)
示例#20
0
def philox(bitness, counter_words, rounds=10):
    """
    A CBRNG based on a low number of slow rounds (multiplications).

    :param bitness: ``32`` or ``64``, corresponds to the size of generated random integers.
    :param counter_words: ``2`` or ``4``, number of integers generated in one go.
    :param rounds: ``1`` to ``12``, the more rounds, the better randomness is achieved.
        Default values are big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """
    # Constants keyed by bitness: [golden ratio, sqrt(3)-1].
    w_table = {
        64: [numpy.uint64(0x9E3779B97F4A7C15),
             numpy.uint64(0xBB67AE8584CAA73B)],
        32: [numpy.uint32(0x9E3779B9),
             numpy.uint32(0xBB67AE85)],
    }

    # Constants keyed by (bitness, counter_words).
    m_table = {
        (64, 2): [numpy.uint64(0xD2B74407B1CE6E93)],
        (64, 4): [numpy.uint64(0xD2E7470EE14C6C93),
                  numpy.uint64(0xCA5A826395121157)],
        (32, 2): [numpy.uint32(0xD256D193)],
        (32, 4): [numpy.uint32(0xD2511F53),
                  numpy.uint32(0xCD9E8D57)],
    }

    assert 1 <= rounds <= 12

    if bitness == 32:
        word_dtype = dtypes.normalize_type(numpy.uint32)
    else:
        word_dtype = dtypes.normalize_type(numpy.uint64)

    # The key has half as many words as the counter.
    key_words = counter_words // 2
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("philox")
    kwds = {
        'word_dtype': word_dtype,
        'word_ctype': dtypes.ctype(word_dtype),
        'key_words': key_words,
        'counter_words': counter_words,
        'key_ctype': key_ctype,
        'counter_ctype': counter_ctype,
        'rounds': rounds,
        'w_constants': w_table[bitness],
        'm_constants': m_table[(bitness, counter_words)],
    }
    module = Module(template_def, render_kwds=kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
示例#21
0
def check_kernel_sampler(thr, sampler, extent=None, mean=None, std=None):
    """
    Run ``sampler`` inside a test kernel and check the produced distribution.

    A static kernel over 10000 virtual threads draws 100 batches per thread.
    The values are stored in an array of shape
    ``(batch, sampler.randoms_per_call, size)``, which is then passed to
    ``check_distribution`` together with ``extent``, ``mean`` and ``std``.
    """
    size = 10000
    batch = 100
    seed = 456

    bijection = sampler.bijection
    keygen = KeyGenerator.create(bijection, seed=seed)

    source = """
        KERNEL void test(GLOBAL_MEM ${ctype} *dest, int ctr_start)
        {
            VIRTUAL_SKIP_THREADS;
            const VSIZE_T idx = virtual_global_id(0);

            ${bijection.module}Key key = ${keygen.module}key_from_int(idx);
            ${bijection.module}Counter ctr = ${bijection.module}make_counter_from_int(ctr_start);
            ${bijection.module}State st = ${bijection.module}make_state(key, ctr);

            ${sampler.module}Result res;

            for(int j = 0; j < ${batch}; j++)
            {
                res = ${sampler.module}sample(&st);

                %for i in range(sampler.randoms_per_call):
                dest[j * ${size * sampler.randoms_per_call} + ${size * i} + idx] = res.v[${i}];
                %endfor
            }

            ${bijection.module}Counter next_ctr = ${bijection.module}get_next_unused_counter(st);
        }
        """
    kwds = {
        'size': size,
        'batch': batch,
        'ctype': dtypes.ctype(sampler.dtype),
        'bijection': bijection,
        'keygen': keygen,
        'sampler': sampler,
    }
    rng_kernel = thr.compile_static(source, 'test', size, render_kwds=kwds)

    output_shape = (batch, sampler.randoms_per_call, size)
    dest_dev = thr.array(output_shape, sampler.dtype)
    rng_kernel(dest_dev, numpy.int32(0))
    samples = dest_dev.get()

    check_distribution(samples, extent=extent, mean=mean, std=std)
示例#22
0
def philox(bitness, counter_words, rounds=10):
    """
    A CBRNG based on a low number of slow rounds (multiplications).

    :param bitness: ``32`` or ``64``, corresponds to the size of generated random integers.
    :param counter_words: ``2`` or ``4``, number of integers generated in one go.
    :param rounds: ``1`` to ``12``, the more rounds, the better randomness is achieved.
        Default values are big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """
    golden_ratio_64 = numpy.uint64(0x9E3779B97F4A7C15)
    sqrt3_minus_1_64 = numpy.uint64(0xBB67AE8584CAA73B)
    golden_ratio_32 = numpy.uint32(0x9E3779B9)
    sqrt3_minus_1_32 = numpy.uint32(0xBB67AE85)
    w_by_bitness = {
        64: [golden_ratio_64, sqrt3_minus_1_64],
        32: [golden_ratio_32, sqrt3_minus_1_32],
    }

    # Constants keyed by (bitness, counter_words).
    m_by_config = {
        (64, 2): [numpy.uint64(0xD2B74407B1CE6E93)],
        (64, 4): [numpy.uint64(0xD2E7470EE14C6C93), numpy.uint64(0xCA5A826395121157)],
        (32, 2): [numpy.uint32(0xD256D193)],
        (32, 4): [numpy.uint32(0xD2511F53), numpy.uint32(0xCD9E8D57)],
    }

    assert 1 <= rounds <= 12

    raw_word_type = numpy.uint32 if bitness == 32 else numpy.uint64
    word_dtype = dtypes.normalize_type(raw_word_type)

    # The key has half as many words as the counter.
    key_words = counter_words // 2
    key_dtype, key_ctype, counter_dtype, counter_ctype = create_struct_types(
        word_dtype, key_words, counter_words)

    template_def = TEMPLATE.get_def("philox")
    kwds = {
        'word_dtype': word_dtype,
        'word_ctype': dtypes.ctype(word_dtype),
        'key_words': key_words,
        'counter_words': counter_words,
        'key_ctype': key_ctype,
        'counter_ctype': counter_ctype,
        'rounds': rounds,
        'w_constants': w_by_bitness[bitness],
        'm_constants': m_by_config[(bitness, counter_words)],
    }

    return Bijection(
        Module(template_def, render_kwds=kwds), word_dtype, key_dtype, counter_dtype)
示例#23
0
def check_kernel_sampler(thr, sampler, extent=None, mean=None, std=None):
    """
    Draw numbers from ``sampler`` in a test kernel and validate their distribution.

    The kernel runs over ``size`` virtual threads; every thread creates its
    own key from its index, samples ``batch`` times and writes the results to
    the destination array. The downloaded array is handed to
    ``check_distribution`` with the given ``extent``, ``mean`` and ``std``.
    """
    size, batch, seed = 10000, 100, 456

    bijection = sampler.bijection
    keygen = KeyGenerator.create(bijection, seed=seed)

    kernel_source = """
        KERNEL void test(GLOBAL_MEM ${ctype} *dest, int ctr_start)
        {
            VIRTUAL_SKIP_THREADS;
            const VSIZE_T idx = virtual_global_id(0);

            ${bijection.module}Key key = ${keygen.module}key_from_int(idx);
            ${bijection.module}Counter ctr = ${bijection.module}make_counter_from_int(ctr_start);
            ${bijection.module}State st = ${bijection.module}make_state(key, ctr);

            ${sampler.module}Result res;

            for(int j = 0; j < ${batch}; j++)
            {
                res = ${sampler.module}sample(&st);

                %for i in range(sampler.randoms_per_call):
                dest[j * ${size * sampler.randoms_per_call} + ${size * i} + idx] = res.v[${i}];
                %endfor
            }

            ${bijection.module}Counter next_ctr = ${bijection.module}get_next_unused_counter(st);
        }
        """
    render_kwds = {
        'size': size,
        'batch': batch,
        'ctype': dtypes.ctype(sampler.dtype),
        'bijection': bijection,
        'keygen': keygen,
        'sampler': sampler,
    }
    rng_kernel = thr.compile_static(
        kernel_source, 'test', size, render_kwds=render_kwds)

    device_buffer = thr.array(
        (batch, sampler.randoms_per_call, size), sampler.dtype)
    # The counter starts from zero.
    rng_kernel(device_buffer, numpy.int32(0))
    host_values = device_buffer.get()

    check_distribution(host_values, extent=extent, mean=mean, std=std)
示例#24
0
def gamma(bijection, dtype, shape=1, scale=1):
    """
    Builds a sampler producing random numbers from the gamma distribution

    .. math::
      P(x) = x^{k-1} \\frac{e^{-x/\\theta}}{\\theta^k \\Gamma(k)},

    with :math:`k` given by ``shape`` and :math:`\\theta` given by ``scale``.
    Supported dtypes: ``float(32/64)``.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """

    c_type_name = dtypes.ctype(dtype)

    # Auxiliary samplers exposed to the template as ``uf`` and ``nbm``:
    # uniform on [0, 1) and normal with zero mean and unit deviation.
    uniform_sampler = uniform_float(bijection, dtype, low=0, high=1)
    normal_sampler = normal_bm(bijection, dtype, mean=0, std=1)

    template_def = TEMPLATE.get_def("gamma")
    template_values = {
        'dtype': dtype,
        'ctype': c_type_name,
        'bijection': bijection,
        'shape': shape,
        # ``scale`` is passed as a C literal of type ``dtype``; ``shape`` is passed as is.
        'scale': dtypes.c_constant(scale, dtype),
        'uf': uniform_sampler,
        'nbm': normal_sampler,
    }
    module = Module(template_def, render_kwds=template_values)

    return Sampler(bijection, module, dtype)
示例#25
0
    def _build_plan(self, plan_factory, device_params, result, phase):
        """
        Builds a plan made of a single element-wise pass from ``phase`` to ``result``.

        Each phase value is cast to ``unsigned int``, offset by half an interval
        and divided by the interval, where the interval is
        ``2**32 // self._mspace_size``.
        ``device_params`` is not used here.
        """
        plan = plan_factory()

        trf_parameters = [
            Parameter('result', Annotation(result, 'o')),
            Parameter('phase', Annotation(phase, 'i')),
        ]
        trf_source = """
            <%
                interv = 2**32 // mspace_size
                half_interv = interv // 2
            %>
            ${phase.ctype} phase = ${phase.load_same};
            ${result.store_same}(((unsigned int)phase + ${half_interv}) / ${interv});
            """
        # ``uint64`` is supplied to the template although the snippet above does not use it.
        trf_values = {
            'mspace_size': self._mspace_size,
            'uint64': dtypes.ctype(numpy.uint64),
        }
        rounding_trf = Transformation(
            trf_parameters, trf_source,
            render_kwds=trf_values,
            connectors=['result', 'phase'])

        elementwise = PureParallel.from_trf(rounding_trf, guiding_array='result')
        plan.computation_call(elementwise, result, phase)

        return plan
示例#26
0
 def __init__(self, module):
     """Store ``module`` along with the C type names and the modulus constant used with it."""
     self.module = module
     # C type names for 32- and 64-bit unsigned integers.
     self.u32 = dtypes.ctype(numpy.uint32)
     self.u64 = dtypes.ctype(numpy.uint64)
     # The modulus 2**64 - 2**32 + 1, rendered as a 64-bit unsigned C constant.
     modulus_value = (1 << 64) - (1 << 32) + 1
     self.modulus = dtypes.c_constant(modulus_value, numpy.uint64)
示例#27
0
def kspaceepanechnikov_filter_CL2(ksp, sigma):
    """Filter k-space data with an Epanechnikov kernel on the GPU and reconstruct.

    The image-domain kernel returned by ``epanechnikov_kernel`` is embedded
    near the centre of a ``ksp``-shaped complex array and passed through
    ``FFT`` and ``FFTShift``; the magnitude of the outcome is multiplied
    element-wise onto ``ksp``.  The filtered data is then passed through
    ``FFT`` and ``FFTShift`` once more.

    :param ksp: k-space array; its first three axes are used.  The multiply
        kernel treats the buffer as ``complex64``
        (NOTE(review): callers presumably pass ``complex64`` data -- confirm).
    :param sigma: three per-axis kernel widths, forwarded to ``epanechnikov_kernel``.
    :returns: the transformed data divided by the number of elements of
        ``ksp`` and reversed along its first three axes.
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    clear_first_arg_caches()

    # Debug output, written as single-argument print calls so that it reads
    # the same under Python 2 and Python 3.
    print(str((np.ceil(sigma[0]) + 2,
               np.ceil(sigma[1]) + 2, np.ceil(sigma[2]) + 2)))
    print(sigma)

    # Kernel extent per axis: ceil(sigma) + 2, bumped up to the next odd number.
    fsiz = (np.ceil(sigma) + 2).astype(int)
    fsiz[fsiz % 2 == 0] += 1

    # Create image-domain Epanechnikov kernel
    Kepa = epanechnikov_kernel(fsiz, sigma)

    # Place kernel at centre of ksp-sized matrix.  The bounds are cast to int
    # because NumPy does not accept floating-point slice indices.
    Kfilter = np.zeros(sz, dtype=dtype)
    szmin = np.floor(
        np.array(sz) / 2.0 - np.floor(np.array(Kepa.shape) / 2.0) - 1).astype(int)
    szmax = szmin + np.array(Kepa.shape)
    print(" ".join(str(part) for part in (
        "Epa filter size ", sz, " image filter ", Kepa.shape,
        " szmin ", szmin, " szmax ", szmax)))
    region = (slice(szmin[0], szmax[0]),
              slice(szmin[1], szmax[1]),
              slice(szmin[2], szmax[2]))
    # The kernel is written to both the real and the imaginary part.
    Kfilter[region] = Kepa
    Kfilter[region].imag = Kepa

    # Create fourier-domain Epanechnikov filter
    api = any_api()
    thr = api.Thread.create()
    data_dev = thr.to_device(Kfilter)
    rfft = FFT(data_dev)
    crfft = rfft.compile(thr)
    fftshift = FFTShift(data_dev)
    cfftshift = fftshift.compile(thr)
    crfft(data_dev, data_dev)
    thr.synchronize()
    cfftshift(data_dev, data_dev)
    Fepanechnikov = np.abs(data_dev.get())
    thr.synchronize()
    del data_dev, rfft, crfft, fftshift, cfftshift

    # Multiply Epanechnikov filter to real and imag ksp data.
    # The element index must be the global id: the local id only counts
    # positions inside one work-group.
    program = thr.compile("""
KERNEL void multiply_them(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *a,
    GLOBAL_MEM ${ftype} *f)
{
  const SIZE_T i = get_global_id(0);
  dest[i].x = a[i].x * f[i];
  dest[i].y = a[i].y * f[i];
}""", render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype)))

    data_dev = thr.to_device(ksp)
    filter_dev = thr.to_device(Fepanechnikov)
    multiply_them = program.multiply_them
    # One work-item per element of the data actually uploaded.
    multiply_them(data_dev, data_dev, filter_dev,
                  global_size=int(np.prod(sz)))
    thr.synchronize()
    del filter_dev, program

    # Recon
    ifft = FFT(data_dev)
    cifft = ifft.compile(thr)
    fftshiftobj = FFTShift(data_dev)
    cfftshift = fftshiftobj.compile(thr)
    cifft(data_dev, data_dev, inverse=0)
    thr.synchronize()
    cfftshift(data_dev, data_dev)
    thr.synchronize()
    result2 = data_dev.get() / np.prod(np.array(ksp.shape))
    result2 = result2[::-1, ::-1, ::-1]
    thr.release()
    return result2
示例#28
0
def gaussian_fourierkernel(siz, sigma_):
    """
    Create Gaussian Fourier filter kernel with GPU

    An array of ones of shape ``siz`` is uploaded and every element is scaled
    by ``exp(-2*pi^2 * ((i*s0)^2 + (j*s1)^2 + (k*s2)^2))``, where ``i``, ``j``
    and ``k`` are the normalised coordinates computed in the kernel from the
    flat element index.

    :param siz: shape of the kernel; the first three entries are used.
    :param sigma_: a single number (applied to all three axes) or a sequence
        of per-axis values.  A two-element sequence is extended with ``0.0``
        for the third axis.
    :returns: ``complex64`` array of shape ``siz`` holding the filter weights.
    """
    # Normalise ``sigma_`` into a private float array so that the caller's
    # object is never modified.
    if np.ndim(sigma_) == 0:
        sigma = np.ones(3) * sigma_
    else:
        sigma = np.array(sigma_, dtype=float)
        if len(sigma) == 2:
            sigma = np.append(sigma, 0.0)

    sz = siz
    ctype = np.complex64
    ftype = np.float32
    #api = cluda.ocl_api()
    api = any_api()
    thr = api.Thread.create()
    base = np.ones(siz, ctype)
    data_dev = thr.to_device(base)
    FACTOR = 1.0
    # The dimensions, sigmas and factor are baked into the source with
    # %-formatting before the ${...} placeholders are rendered.
    program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ulong x = get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;                    
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;            
  const double TWOPISQ = 19.739208802178716; //6.283185307179586;  //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ulong idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
      i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
      if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
      j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  //Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x) , (double) dim3);
  ${ftype} k = (${ftype}) pre_k;
      k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight; 
  
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
        render_kwds=dict(ctype=dtypes.ctype(ctype),
                         ftype=dtypes.ctype(ftype),
                         exp=functions.exp(ftype)), fast_math=True)
    gauss_kernel = program.gauss_kernel
    # One work-item per element; the kernel works on the flat index.
    gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2])
    gfilter = data_dev.get()
    thr.synchronize()
    thr.release()

    return gfilter
示例#29
0
def kspacegaussian_filter_CL2(ksp, sigma):
    """ Gaussian-filter k-space data on the GPU and reconstruct the image

    Sequence of operations:

    1. create a GPU thread
    2. upload the complex k-space matrix
    3. compile the Gaussian weighting kernel
    4. run it in place over every element and synchronise
    5. run the FFT, then the FFT shift
    6. download the result and divide it by the number of elements
    7. reverse the first three axes

    ``sigma`` supplies the three per-axis widths baked into the kernel source.
    """
    sz = ksp.shape
    complex_dtype = np.complex64
    real_dtype = np.float32
    #api = cluda.ocl_api()
    thr = any_api().Thread.create()
    data_dev = thr.to_device(ksp)
    # Constructed here but never compiled or called.
    unused_fft = FFT(data_dev)
    FACTOR = 1.0

    # Dimensions, sigmas and the factor are inserted with %-formatting;
    # the ${...} placeholders are filled in when the source is rendered.
    kernel_src = """
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ulong x = get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;                    
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;            
  const double TWOPISQ = 19.739208802178716; //6.283185307179586;  //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ulong idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
      i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
      if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
      j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  // Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x), (double)dim3);
  ${ftype} k = (${ftype}) pre_k;
      k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  // ${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  // ${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight; 
  
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR)

    # ``exp`` stays in the mapping because the source text still mentions
    # ${exp}, even though only inside a C comment.
    template_values = {
        'ctype': dtypes.ctype(complex_dtype),
        'ftype': dtypes.ctype(real_dtype),
        'exp': functions.exp(real_dtype),
    }
    program = thr.compile(
        kernel_src, render_kwds=template_values, fast_math=True)

    # Filter in place, one work-item per element of the flat buffer.
    n_elements = sz[0] * sz[1] * sz[2]
    program.gauss_kernel(data_dev, data_dev, global_size=n_elements)
    thr.synchronize()

    # Recon
    run_fft = FFT(data_dev).compile(thr)
    run_fftshift = FFTShift(data_dev).compile(thr)
    run_fft(data_dev, data_dev, inverse=0)
    thr.synchronize()
    run_fftshift(data_dev, data_dev)
    thr.synchronize()

    image = data_dev.get() / np.prod(np.array(ksp.shape))
    image = image[::-1, ::-1, ::-1]
    thr.release()
    return image
示例#30
0
      if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
      j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  //Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x) , (double) dim3);
  ${ftype} k = (${ftype}) pre_k;
      k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight;  //(${ftype})k; //
  
}
""" % (N, N, N, sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype),
                                                                        ftype=dtypes.ctype(
                                                                            ftype),
                                                                        exp=functions.exp(ftype)), fast_math=True)

gauss_kernel = program.gauss_kernel

r1 = np.ones((N, N, N)).astype(ftype)  # /N
r2 = np.ones((N, N, N)).astype(ftype)  # /N
a = r1 + 1j * r2
b = r1 - 1j * r2
a_dev = thr.to_device(a)
#b_dev = thr.to_device(b)
#c_dev= thr.to_device(b.ravel())
#sigma_dev = thr.to_device(sigma)
dest_dev = thr.empty_like(a_dev)
示例#31
0
def threefry(bitness, counter_words, rounds=20):
    """
    Builds the Threefry CBRNG, which relies on a large number of cheap rounds (bit rotations).

    :param bitness: ``32`` or ``64``, the size of the generated random integers.
    :param counter_words: ``2`` or ``4``, how many integers are produced in one go.
    :param rounds: ``1`` to ``72``; more rounds give better randomness.
        The default is large enough to qualify as a PRNG.
    :returns: a :py:class:`Bijection` object.
    """

    # Rotation tables, keyed by ``(bitness, counter_words)``.
    rotation_tables = {
        # These are the R_256 constants from the Threefish reference sources
        # with names changed to R_64x4...
        (64, 4): numpy.array([
            [14, 52, 23, 5, 25, 46, 58, 32],
            [16, 57, 40, 37, 33, 12, 22, 32],
        ]).T,

        # Output from skein_rot_search: (srs64_B64-X1000)
        # Random seed = 1. BlockSize = 128 bits. sampleCnt =  1024. rounds =  8, minHW_or=57
        # Start: Tue Mar  1 10:07:48 2011
        # rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
        (64, 2): numpy.array([
            [16, 42, 12, 31, 16, 32, 24, 21],
        ]).T,
        # 4 rounds: minHW =  4  [  4  4  4  4 ]
        # 5 rounds: minHW =  8  [  8  8  8  8 ]
        # 6 rounds: minHW = 16  [ 16 16 16 16 ]
        # 7 rounds: minHW = 32  [ 32 32 32 32 ]
        # 8 rounds: minHW = 64  [ 64 64 64 64 ]
        # 9 rounds: minHW = 64  [ 64 64 64 64 ]
        # 10 rounds: minHW = 64  [ 64 64 64 64 ]
        # 11 rounds: minHW = 64  [ 64 64 64 64 ]

        # Output from skein_rot_search: (srs-B128-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
        # Start: Mon Aug 24 22:41:36 2009
        # ...
        # rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
        (32, 4): numpy.array([
            [10, 11, 13, 23, 6, 17, 25, 18],
            [26, 21, 27, 5, 20, 11, 10, 20],
        ]).T,
        # 4 rounds: minHW =  3  [  3  3  3  3 ]
        # 5 rounds: minHW =  7  [  7  7  7  7 ]
        # 6 rounds: minHW = 12  [ 13 12 13 12 ]
        # 7 rounds: minHW = 22  [ 22 23 22 23 ]
        # 8 rounds: minHW = 31  [ 31 31 31 31 ]
        # 9 rounds: minHW = 32  [ 32 32 32 32 ]
        # 10 rounds: minHW = 32  [ 32 32 32 32 ]
        # 11 rounds: minHW = 32  [ 32 32 32 32 ]

        # Output from skein_rot_search (srs32x2-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
        # Start: Tue Jul 12 11:11:33 2011
        # rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize=  64].format
        (32, 2): numpy.array([
            [13, 15, 26, 6, 17, 29, 16, 24],
        ]).T,
        # 4 rounds: minHW =  4  [  4  4  4  4 ]
        # 5 rounds: minHW =  6  [  6  8  6  8 ]
        # 6 rounds: minHW =  9  [  9 12  9 12 ]
        # 7 rounds: minHW = 16  [ 16 24 16 24 ]
        # 8 rounds: minHW = 32  [ 32 32 32 32 ]
        # 9 rounds: minHW = 32  [ 32 32 32 32 ]
        # 10 rounds: minHW = 32  [ 32 32 32 32 ]
        # 11 rounds: minHW = 32  [ 32 32 32 32 ]
    }

    # Taken from Skein
    parity_by_bitness = {
        64: numpy.uint64(0x1BD11BDAA9FC1A22),
        32: numpy.uint32(0x1BD11BDA),
    }

    assert 1 <= rounds <= 72

    # Any bitness other than 32 selects the 64-bit word type.
    if bitness == 32:
        raw_word_type = numpy.uint32
    else:
        raw_word_type = numpy.uint64
    word_dtype = dtypes.normalize_type(raw_word_type)

    # The key has as many words as the counter.
    key_words = counter_words
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("threefry")
    template_values = {
        'word_dtype': word_dtype,
        'word_ctype': dtypes.ctype(word_dtype),
        'key_words': key_words,
        'counter_words': counter_words,
        'key_ctype': key_ctype,
        'counter_ctype': counter_ctype,
        'rounds': rounds,
        'rotation_constants': rotation_tables[(bitness, counter_words)],
        'parity_constant': parity_by_bitness[bitness],
    }
    module = Module(template_def, render_kwds=template_values)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
def kspaceepanechnikov_filter(ksp, sigma):
    """ Kspace Epanechnikov filter using GPU OpenCL

    1. GPU initialisation
    2. push KSP complex matrix to GPU
    3. declare Complex Epan GPU filter program
    4. Execute Epan GPU program in place
    5. GPU sync
    6. Retrieve filtered k-space from GPU and release the GPU thread

    No FFT reconstruction or FFT shift is performed here.

    :param ksp: k-space array; its first three dimensions are baked into the
        kernel, which treats the buffer as ``complex64``
        (NOTE(review): callers presumably pass ``complex64`` data -- confirm).
    :param sigma: three per-axis widths baked into the kernel source.
    :returns: the filtered k-space data as a host array.
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    #api = cluda.ocl_api()
    api = any_api()
    thr = api.Thread.create()
    data_dev = thr.to_device(ksp)
    FACTOR = 1.0
    # NOTE(review): at the element where i, j and k are all zero both the
    # numerator and ``omega3`` are zero, so ``weight`` looks like 0/0 there --
    # confirm whether that element needs special handling.
    program = thr.compile("""
KERNEL void epan_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ulong x = get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;                    
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;            
  const double TWOPISQ = 19.739208802178716; //6.283185307179586;  //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ulong idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
      i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
      if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
      j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  // Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x), (double)dim3);
  ${ftype} k = (${ftype}) pre_k;
      k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);
  ${ftype} omega = (i*sigma[0]+j*sigma[1]+k*sigma[2]);
  ${ftype} omega3 = ((i*sigma[0])*(i*sigma[0])*(i*sigma[0])+(j*sigma[1])*(j*sigma[1])*(j*sigma[1])+(k*sigma[2])*(k*sigma[2])*(k*sigma[2]));        
  ${ftype} weight = 0.423142 * fabs((4 * sin(omega) - 4 * omega * cos(omega)) / omega3);
  dest[idx].x = src[idx].x * weight; 
  dest[idx].y = src[idx].y * weight; 
  
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
                          render_kwds=dict(ctype=dtypes.ctype(dtype),
                                           ftype=dtypes.ctype(ftype)),
                          fast_math=True)
    epan_kernel = program.epan_kernel
    # One work-item per element; the kernel works on the flat index.
    epan_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2])
    thr.synchronize()
    # The device array is not callable; copy it to the host with get()
    # and free the thread before returning.
    filtered = data_dev.get()
    thr.release()
    return filtered
示例#33
0
def test_tempalloc(cluda_api, tempalloc_cls, pack):
    """
    Checks that temporary arrays declared as dependent never hold the same data.

    Every array is filled with its own marker value; afterwards none of the
    arrays it depends on may contain that marker in any element.
    """

    shape = (10000,)
    dtype = numpy.int32
    thr = cluda_api.Thread.create(
        temp_alloc={'cls': tempalloc_cls, 'pack_on_alloc': False})

    # Dependency graph for the test
    dependencies = {
        '_temp0': [],
        '_temp1': ['_temp9', '_temp8', '_temp3', '_temp5', '_temp4', '_temp7', '_temp6', 'input'],
        '_temp10': ['output', '_temp7'],
        '_temp11': ['_temp7'],
        '_temp2': ['input'],
        '_temp3': ['_temp1', 'input'],
        '_temp4': ['_temp9', '_temp8', '_temp1', '_temp7', '_temp6'],
        '_temp5': ['_temp1'],
        '_temp6': ['_temp1', '_temp4'],
        '_temp7': ['_temp9', '_temp1', '_temp4', 'output', '_temp11', '_temp10'],
        '_temp8': ['_temp1', '_temp4'],
        '_temp9': ['_temp1', '_temp4', '_temp7'],
        'input': ['_temp1', '_temp3', '_temp2'],
        'output': ['_temp10', '_temp7'],
    }
    names = sorted(dependencies)

    program = thr.compile(
    """
    KERNEL void fill(GLOBAL_MEM ${ctype} *dest, ${ctype} val)
    {
      const SIZE_T i = get_global_id(0);
      dest[i] = val;
    }

    KERNEL void transfer(GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src)
    {
      const SIZE_T i = get_global_id(0);
      dest[i] = src[i];
    }
    """, render_kwds={'ctype': dtypes.ctype(dtype)})
    fill = program.fill
    transfer = program.transfer

    arrays = {}
    transfer_dest = thr.array(shape, dtype)

    def allocate_zeroed():
        # Request a temporary array for every name, in sorted order, listing
        # the dependencies that already have an array, and zero it.
        for name in names:
            known_deps = [
                arrays[dep] for dep in dependencies[name] if dep in arrays]
            arrays[name] = thr.temp_array(
                shape, dtype, dependencies=known_deps)
            fill(arrays[name], dtype(0), global_size=shape)

    # Allocate temporary arrays with dependencies
    allocate_zeroed()

    if pack:
        thr.temp_alloc.pack()

    # Second allocation round; every name already has an array now, so each
    # request lists all of its dependencies.
    allocate_zeroed()

    for marker, name in enumerate(names, 1):
        val = dtype(marker)
        fill(arrays[name], val, global_size=shape)
        for dep in dependencies[name]:
            # CUDA does not support get() for GPUArray with custom buffers,
            # So we need to transfer the data to a normal array first.
            transfer(transfer_dest, arrays[dep], global_size=shape)
            assert (transfer_dest.get() != val).all()
示例#34
0
def test_tempalloc(cluda_api, tempalloc_cls, pack):
    """
    Verifies that dependent temporary arrays do not share their contents.

    After an array has been filled with a value of its own, none of its
    dependencies may contain that value anywhere.
    """

    shape = (10000, )
    dtype = numpy.int32
    alloc_settings = dict(cls=tempalloc_cls, pack_on_alloc=False)
    thr = cluda_api.Thread.create(temp_alloc=alloc_settings)

    # Dependency graph for the test
    dependencies = {
        '_temp0': [],
        '_temp1': [
            '_temp9', '_temp8', '_temp3', '_temp5', '_temp4', '_temp7',
            '_temp6', 'input',
        ],
        '_temp10': ['output', '_temp7'],
        '_temp11': ['_temp7'],
        '_temp2': ['input'],
        '_temp3': ['_temp1', 'input'],
        '_temp4': ['_temp9', '_temp8', '_temp1', '_temp7', '_temp6'],
        '_temp5': ['_temp1'],
        '_temp6': ['_temp1', '_temp4'],
        '_temp7': ['_temp9', '_temp1', '_temp4', 'output', '_temp11', '_temp10'],
        '_temp8': ['_temp1', '_temp4'],
        '_temp9': ['_temp1', '_temp4', '_temp7'],
        'input': ['_temp1', '_temp3', '_temp2'],
        'output': ['_temp10', '_temp7'],
    }
    ordered_names = sorted(dependencies)

    kernel_src = """
    KERNEL void fill(GLOBAL_MEM ${ctype} *dest, ${ctype} val)
    {
      const SIZE_T i = get_global_id(0);
      dest[i] = val;
    }

    KERNEL void transfer(GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src)
    {
      const SIZE_T i = get_global_id(0);
      dest[i] = src[i];
    }
    """
    program = thr.compile(
        kernel_src, render_kwds=dict(ctype=dtypes.ctype(dtype)))
    fill = program.fill
    transfer = program.transfer

    arrays = {}
    transfer_dest = thr.array(shape, dtype)

    # Two identical allocate-and-zero rounds, with the optional packing of
    # the allocator between them.
    for round_no in (0, 1):
        if round_no == 1 and pack:
            thr.temp_alloc.pack()

        for name in ordered_names:
            # Only dependencies that already have an array can be listed.
            present = [
                arrays[dep] for dep in dependencies[name] if dep in arrays]
            arrays[name] = thr.temp_array(shape, dtype, dependencies=present)
            fill(arrays[name], dtype(0), global_size=shape)

    for position, name in enumerate(ordered_names):
        val = dtype(position + 1)
        fill(arrays[name], val, global_size=shape)
        for dep in dependencies[name]:
            # CUDA does not support get() for GPUArray with custom buffers,
            # So we need to transfer the data to a normal array first.
            transfer(transfer_dest, arrays[dep], global_size=shape)
            host_copy = transfer_dest.get()
            assert (host_copy != val).all()
def kspacegaussian_filter_CL2(ksp, sigma):
    """Gaussian-filter k-space data on a CUDA device and reconstruct the image.

    The data is uploaded, weighted in place by the Gaussian kernel below,
    passed through ``FFT`` and ``FFTShift``, downloaded, divided by the
    number of elements and reversed along its first three axes.

    :param ksp: k-space array; its first three dimensions are baked into the
        kernel, which treats the buffer as ``complex64``
        (NOTE(review): callers presumably pass ``complex64`` data -- confirm).
    :param sigma: three per-axis widths baked into the kernel source.
    :returns: the reconstructed array on the host.
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    #api = cluda.ocl_api()
    api = cuda_api()
    thr = api.Thread.create()
    data_dev = thr.to_device(ksp)
    FACTOR = 1.0
    # The dimensions, sigmas and factor are baked into the source with
    # %-formatting before the ${...} placeholders are rendered.
    program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ulong x = get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;                    
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;            
  const double TWOPISQ = 19.739208802178716; //6.283185307179586;  //2*3.141592;
//  const ${ftype} SQRT2PI = 2.5066282746;
//  const double CUBEDSQRT2PI = 15.749609945722419;
  const ulong idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
      i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
      if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
      j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  //Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x) , (double) dim3);
  ${ftype} k = (${ftype}) pre_k;
      k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight * factor;
  dest[idx].y = src[idx].y * weight * factor; 
  
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
        render_kwds=dict(ctype=dtypes.ctype(dtype),
                         ftype=dtypes.ctype(ftype),
                         exp=functions.exp(ftype)), fast_math=True)
    gauss_kernel = program.gauss_kernel
    # The kernel derives all three coordinates from the flat index
    # get_global_id(0), so it has to be launched over a one-dimensional
    # range covering every element rather than over a 3D grid.
    gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2])

    thr.synchronize()
    # Recon
    ifft = FFT(data_dev)
    cifft = ifft.compile(thr)
    fftshift = FFTShift(data_dev)
    cfftshift = fftshift.compile(thr)
    cifft(data_dev, data_dev, inverse=0)
    thr.synchronize()
    cfftshift(data_dev, data_dev)
    thr.synchronize()
    result2 = data_dev.get() / np.prod(np.array(ksp.shape))
    result2 = result2[::-1, ::-1, ::-1]
    thr.release()
    return result2
示例#36
0
def threefry(bitness, counter_words, rounds=20):
    """
    Creates the Threefry CBRNG, built from a large number of fast rounds (bit rotations).

    :param bitness: ``32`` or ``64``, the size of the generated random integers.
    :param counter_words: ``2`` or ``4``, the number of integers generated in one go.
    :param rounds: ``1`` to ``72``; the more rounds, the better the randomness.
        The default is big enough to qualify as a PRNG.
    :returns: a :py:class:`Bijection` object.
    """

    # Rotation tables, keyed by ``(bitness, counter_words)``.
    rotations = {
        # These are the R_256 constants from the Threefish reference sources
        # with names changed to R_64x4...
        (64, 4): numpy.transpose(numpy.array(
            [[14, 52, 23, 5, 25, 46, 58, 32],
             [16, 57, 40, 37, 33, 12, 22, 32]])),

        # Output from skein_rot_search: (srs64_B64-X1000)
        # Random seed = 1. BlockSize = 128 bits. sampleCnt =  1024. rounds =  8, minHW_or=57
        # Start: Tue Mar  1 10:07:48 2011
        # rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
        (64, 2): numpy.transpose(numpy.array(
            [[16, 42, 12, 31, 16, 32, 24, 21]])),
        # 4 rounds: minHW =  4  [  4  4  4  4 ]
        # 5 rounds: minHW =  8  [  8  8  8  8 ]
        # 6 rounds: minHW = 16  [ 16 16 16 16 ]
        # 7 rounds: minHW = 32  [ 32 32 32 32 ]
        # 8 rounds: minHW = 64  [ 64 64 64 64 ]
        # 9 rounds: minHW = 64  [ 64 64 64 64 ]
        # 10 rounds: minHW = 64  [ 64 64 64 64 ]
        # 11 rounds: minHW = 64  [ 64 64 64 64 ]

        # Output from skein_rot_search: (srs-B128-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
        # Start: Mon Aug 24 22:41:36 2009
        # ...
        # rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
        (32, 4): numpy.transpose(numpy.array(
            [[10, 11, 13, 23, 6, 17, 25, 18],
             [26, 21, 27, 5, 20, 11, 10, 20]])),
        # 4 rounds: minHW =  3  [  3  3  3  3 ]
        # 5 rounds: minHW =  7  [  7  7  7  7 ]
        # 6 rounds: minHW = 12  [ 13 12 13 12 ]
        # 7 rounds: minHW = 22  [ 22 23 22 23 ]
        # 8 rounds: minHW = 31  [ 31 31 31 31 ]
        # 9 rounds: minHW = 32  [ 32 32 32 32 ]
        # 10 rounds: minHW = 32  [ 32 32 32 32 ]
        # 11 rounds: minHW = 32  [ 32 32 32 32 ]

        # Output from skein_rot_search (srs32x2-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
        # Start: Tue Jul 12 11:11:33 2011
        # rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize=  64].format
        (32, 2): numpy.transpose(numpy.array(
            [[13, 15, 26, 6, 17, 29, 16, 24]])),
        # 4 rounds: minHW =  4  [  4  4  4  4 ]
        # 5 rounds: minHW =  6  [  6  8  6  8 ]
        # 6 rounds: minHW =  9  [  9 12  9 12 ]
        # 7 rounds: minHW = 16  [ 16 24 16 24 ]
        # 8 rounds: minHW = 32  [ 32 32 32 32 ]
        # 9 rounds: minHW = 32  [ 32 32 32 32 ]
        # 10 rounds: minHW = 32  [ 32 32 32 32 ]
        # 11 rounds: minHW = 32  [ 32 32 32 32 ]
    }

    # Taken from Skein
    parities = {
        64: numpy.uint64(0x1BD11BDAA9FC1A22),
        32: numpy.uint32(0x1BD11BDA),
    }

    assert 1 <= rounds <= 72

    # 32 selects the 32-bit word type; every other value selects the 64-bit one.
    is_32bit = (bitness == 32)
    word_dtype = dtypes.normalize_type(
        numpy.uint32 if is_32bit else numpy.uint64)

    # The key is as long as the counter.
    key_words = counter_words
    (key_dtype, key_ctype,
     counter_dtype, counter_ctype) = create_struct_types(
         word_dtype, key_words, counter_words)

    template_def = TEMPLATE.get_def("threefry")
    render_kwds = {
        'word_dtype': word_dtype,
        'word_ctype': dtypes.ctype(word_dtype),
        'key_words': key_words,
        'counter_words': counter_words,
        'key_ctype': key_ctype,
        'counter_ctype': counter_ctype,
        'rounds': rounds,
        'rotation_constants': rotations[(bitness, counter_words)],
        'parity_constant': parities[bitness],
    }

    return Bijection(
        Module(template_def, render_kwds=render_kwds),
        word_dtype, key_dtype, counter_dtype)
示例#37
0
def get_ff_elem():
    """Render the ``ff_elem_def`` template into a module and wrap it in a ``FiniteFieldElement``."""
    template_def = TEMPLATE.get_def('ff_elem_def')
    # The template receives the C name of the 64-bit unsigned type as ``u64``.
    template_values = {'u64': dtypes.ctype(numpy.uint64)}
    ff_module = Module(template_def, render_kwds=template_values)
    return FiniteFieldElement(ff_module)