Python mul示例，reikna.cluda.functions.mul Python示例

示例#1

0

显示文件

文件： helpers.py 项目： fjarri/reikna-integrator

def get_kprop_trf(state_arr, ksquared_arr, coeffs, exp=False):
    compound_dtype = dtypes.result_type(coeffs.dtype, ksquared_arr.dtype)
    return Transformation(
        [
            Parameter('output', Annotation(state_arr, 'o')),
            Parameter('input', Annotation(state_arr, 'i')),
            Parameter('ksquared', Annotation(ksquared_arr, 'i')),
            Parameter('dt', Annotation(ksquared_arr.dtype))],
        """
        %if max(coeffs.values) > 0:
        ${ksquared.ctype} ksquared = ${ksquared.load_idx}(${', '.join(idxs[2:])});
        %endif

        ${dtypes.ctype(compound_dtype)} compound_coeff = ${dtypes.c_constant(0, compound_dtype)};

        %for pwr, values in coeffs.values.items():
        {
            ${dtypes.ctype(coeffs.dtype)} value;

            %for comp in range(output.shape[1]):
            ${'if' if comp == 0 else 'else if'} (${idxs[1]} == ${comp})
            {
                value = ${dtypes.c_constant(values[comp], coeffs.dtype)};
            }
            %endfor

            compound_coeff =
                compound_coeff
                + ${mul_kc}(
                    %if pwr == 0:
                    ${dt}
                    %elif pwr == 2:
                    -ksquared * ${dt}
                    %else:
                    pow(-ksquared, ${pwr // 2}) * ${dt}
                    %endif
                    ,
                    value
                    );
        }
        %endfor

        ${output.store_same}(${mul_ic}(
            ${input.load_same},
            %if exp is not None:
            ${exp}(compound_coeff)
            %else:
            compound_coeff
            %endif
            ));
        """,
        render_kwds=dict(
            coeffs=coeffs,
            compound_dtype=compound_dtype,
            mul_ic=functions.mul(state_arr.dtype, compound_dtype, out_dtype=state_arr.dtype),
            mul_kc=functions.mul(ksquared_arr.dtype, coeffs.dtype, out_dtype=compound_dtype),
            exp=functions.exp(compound_dtype) if exp else None))

示例#2

0

显示文件

文件： rk4ip_stepper.py 项目： fjarri/reikna-integrator

def get_nonlinear_wrapper(state_dtype, grid_dims, drift, diffusion=None):

    real_dtype = dtypes.real_for(state_dtype)
    if diffusion is not None:
        noise_dtype = diffusion.dtype
    else:
        noise_dtype = real_dtype

    return Module.create(
        """
        <%
            components = drift.components
            idx_args = ["idx_" + str(dim) for dim in range(grid_dims)]
            psi_args = ["psi_" + str(comp) for comp in range(components)]
            if diffusion is not None:
                dW_args = ["dW_" + str(ncomp) for ncomp in range(diffusion.noise_sources)]
        %>
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${s_ctype} ${prefix}${comp}(
            %for idx in idx_args:
            const int ${idx},
            %endfor
            %for psi in psi_args:
            const ${s_ctype} ${psi},
            %endfor
            %if diffusion is not None:
            %for dW in dW_args:
            const ${n_ctype} ${dW},
            %endfor
            %endif
            const ${r_ctype} t,
            const ${r_ctype} dt)
        {
            return
                ${mul_sr}(${drift.module}${comp}(
                    ${", ".join(idx_args)}, ${", ".join(psi_args)}, t), dt)
                %if diffusion is not None:
                %for ncomp in range(diffusion.noise_sources):
                + ${mul_sn}(${diffusion.module}${comp}_${ncomp}(
                    ${", ".join(idx_args)}, ${", ".join(psi_args)}, t), ${dW_args[ncomp]})
                %endfor
                %endif
                ;
        }
        %endfor
        """,
        render_kwds=dict(
            grid_dims=grid_dims,
            s_ctype=dtypes.ctype(state_dtype),
            r_ctype=dtypes.ctype(real_dtype),
            n_ctype=dtypes.ctype(noise_dtype),
            mul_sr=functions.mul(state_dtype, real_dtype),
            mul_sn=functions.mul(state_dtype, noise_dtype),
            drift=drift,
            diffusion=diffusion))

示例#3

0

显示文件

def hanning_window(arr, NFFT):
    """
    Applies the von Hann window to the rows of a 2D array.
    To account for zero padding (which we do not want to window), NFFT is provided separately.
    """
    if dtypes.is_complex(arr.dtype):
        coeff_dtype = dtypes.real_for(arr.dtype)
    else:
        coeff_dtype = arr.dtype
    return Transformation([
        Parameter('output', Annotation(arr, 'o')),
        Parameter('input', Annotation(arr, 'i')),
    ],
                          """
        ${dtypes.ctype(coeff_dtype)} coeff;
        %if NFFT != output.shape[0]:
        if (${idxs[1]} >= ${NFFT})
        {
            coeff = 1;
        }
        else
        %endif
        {
            coeff = 0.5 * (1 - cos(2 * ${numpy.pi} * ${idxs[-1]} / (${NFFT} - 1)));
        }
        ${output.store_same}(${mul}(${input.load_same}, coeff));
        """,
                          render_kwds=dict(coeff_dtype=coeff_dtype,
                                           NFFT=NFFT,
                                           mul=functions.mul(
                                               arr.dtype, coeff_dtype)))

示例#4

0

显示文件

文件： generate_gpu.py 项目： fjarri/squeezed-sim

    def _build_plan(self, plan_factory, device_params, alpha, beta, seed):
        plan = plan_factory()

        bijection = philox(64, 2)

        # Keeping the kernel the same so it can be cached.
        # The seed will be passed as the computation parameter instead.
        keygen = KeyGenerator.create(bijection, seed=numpy.int32(0))

        sampler = normal_bm(bijection, numpy.float64)

        squeezing = plan.persistent_array(self._system.squeezing)
        decoherence = plan.persistent_array(self._system.decoherence)

        plan.kernel_call(TEMPLATE.get_def("generate_input_state"),
                         [alpha, beta, squeezing, decoherence, seed],
                         kernel_name="generate",
                         global_size=alpha.shape,
                         render_kwds=dict(
                             system=self._system,
                             representation=self._representation,
                             Representation=Representation,
                             bijection=bijection,
                             keygen=keygen,
                             sampler=sampler,
                             ordering=ordering,
                             exp=functions.exp(numpy.float64),
                             mul_cr=functions.mul(numpy.complex128,
                                                  numpy.float64),
                             add_cc=functions.add(numpy.complex128,
                                                  numpy.complex128),
                         ))

        return plan

示例#5

0

显示文件

def test_dtype_support(thr, dtype):
    # Test passes if either thread correctly reports that it does not support given dtype,
    # or it successfully compiles kernel that operates with this dtype.

    N = 256

    if not thr.device_params.supports_dtype(dtype):
        pytest.skip()

    mul = functions.mul(dtype, dtype)
    div = functions.div(dtype, dtype)
    program = thr.compile(
    """
    KERNEL void test(
        GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b)
    {
      const SIZE_T i = get_global_id(0);
      ${ctype} temp = ${mul}(a[i], b[i]);
      dest[i] = ${div}(temp, b[i]);
    }
    """, render_kwds=dict(ctype=dtypes.ctype(dtype), dtype=dtype, mul=mul, div=div))

    test = program.test

    # we need results to fit even in unsigned char
    a = get_test_array(N, dtype, high=8)
    b = get_test_array(N, dtype, no_zeros=True, high=8)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    dest_dev = thr.empty_like(a_dev)
    test(dest_dev, a_dev, b_dev, global_size=N)
    assert diff_is_negligible(thr.from_device(dest_dev), a)

示例#6

0

显示文件

文件： test_basics.py 项目： mgolub2/reikna

def test_dtype_support(thr, dtype):
    # Test passes if either thread correctly reports that it does not support given dtype,
    # or it successfully compiles kernel that operates with this dtype.

    N = 256

    if not thr.device_params.supports_dtype(dtype):
        pytest.skip()

    mul = functions.mul(dtype, dtype)
    div = functions.div(dtype, dtype)
    program = thr.compile(
    """
    KERNEL void test(
        GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b)
    {
      const SIZE_T i = get_global_id(0);
      ${ctype} temp = ${mul}(a[i], b[i]);
      dest[i] = ${div}(temp, b[i]);
    }
    """, render_kwds=dict(ctype=dtypes.ctype(dtype), dtype=dtype, mul=mul, div=div))

    test = program.test

    # we need results to fit even in unsigned char
    a = get_test_array(N, dtype, high=8)
    b = get_test_array(N, dtype, no_zeros=True, high=8)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    dest_dev = thr.empty_like(a_dev)
    test(dest_dev, a_dev, b_dev, global_size=N)
    assert diff_is_negligible(thr.from_device(dest_dev), a)

示例#7

0

显示文件

文件： lwe_gpu.py 项目： DucaturFw/nufhe

    def _build_plan(self, plan_factory, device_params, output, matrix, vector):
        plan = plan_factory()

        summation = Reduce(matrix,
                           predicate_sum(matrix.dtype),
                           axes=(len(matrix.shape) - 1, ))

        mul_vec = Transformation(
            [
                Parameter('output', Annotation(matrix, 'o')),
                Parameter('matrix', Annotation(matrix, 'i')),
                Parameter('vector', Annotation(vector, 'i'))
            ],
            """
            ${output.store_same}(${mul}(${matrix.load_same}, ${vector.load_idx}(${idxs[-1]})));
            """,
            render_kwds=dict(mul=functions.mul(matrix.dtype, vector.dtype)),
            connectors=['output', 'matrix'])

        summation.parameter.input.connect(mul_vec,
                                          mul_vec.output,
                                          matrix=mul_vec.matrix,
                                          vector=mul_vec.vector)

        plan.computation_call(summation, output, matrix, vector)

        return plan

示例#8

0

显示文件

    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, alpha.shape)

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                ${alpha.ctype} alpha = ${alpha.load_same};
                ${beta.ctype} beta = ${beta.load_same};
                ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));
                ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);
                ${output.store_same}(cp.x);
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan

示例#9

0

显示文件

文件： demo_specgram.py 项目： fjarri/reikna

def hanning_window(arr, NFFT):
    """
    Applies the von Hann window to the rows of a 2D array.
    To account for zero padding (which we do not want to window), NFFT is provided separately.
    """
    if dtypes.is_complex(arr.dtype):
        coeff_dtype = dtypes.real_for(arr.dtype)
    else:
        coeff_dtype = arr.dtype
    return Transformation(
        [
            Parameter('output', Annotation(arr, 'o')),
            Parameter('input', Annotation(arr, 'i')),
        ],
        """
        ${dtypes.ctype(coeff_dtype)} coeff;
        %if NFFT != output.shape[0]:
        if (${idxs[1]} >= ${NFFT})
        {
            coeff = 1;
        }
        else
        %endif
        {
            coeff = 0.5 * (1 - cos(2 * ${numpy.pi} * ${idxs[-1]} / (${NFFT} - 1)));
        }
        ${output.store_same}(${mul}(${input.load_same}, coeff));
        """,
        render_kwds=dict(
            coeff_dtype=coeff_dtype, NFFT=NFFT,
            mul=functions.mul(arr.dtype, coeff_dtype)))

示例#10

0

显示文件

文件： soliton.py 项目： fjarri/reikna-integrator

def get_drift(state_dtype, U, gamma, dx, wigner=False):
    return Drift(
        Module.create(
            """
            <%
                r_dtype = dtypes.real_for(s_dtype)
                s_ctype = dtypes.ctype(s_dtype)
                r_ctype = dtypes.ctype(r_dtype)
            %>
            INLINE WITHIN_KERNEL ${s_ctype} ${prefix}0(
                const int idx_x,
                const ${s_ctype} psi,
                ${r_ctype} t)
            {
                return ${mul_cc}(
                    COMPLEX_CTR(${s_ctype})(
                        -${gamma},
                        -(${U} * (${norm}(psi) - ${correction}))),
                    psi
                );
            }
            """,
            render_kwds=dict(
                s_dtype=state_dtype,
                U=U,
                gamma=gamma,
                mul_cc=functions.mul(state_dtype, state_dtype),
                norm=functions.norm(state_dtype),
                correction=1. / dx if wigner else 0
                )),
        state_dtype, components=1)

示例#11

0

显示文件

文件： wigner.py 项目： fjarri-attic/early-universe

def nonlinear_no_potential(dtype, U, nu):
    c_dtype = dtype
    c_ctype = dtypes.ctype(c_dtype)
    s_dtype = dtypes.real_for(dtype)
    s_ctype = dtypes.ctype(s_dtype)

    return Module.create(
        """
        %for comp in (0, 1):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            ${c_ctype} psi0, ${c_ctype} psi1, ${s_ctype} t)
        {
            return (
                ${mul}(psi${comp}, (
                    ${dtypes.c_constant(U[comp, 0])} * ${norm}(psi0) +
                    ${dtypes.c_constant(U[comp, 1])} * ${norm}(psi1)
                    ))
                - ${mul}(psi${1 - comp}, ${nu})
                );
        }
        %endfor
        """,
        render_kwds=dict(
            mul=functions.mul(c_dtype, s_dtype),
            norm=functions.norm(c_dtype),
            U=U,
            nu=dtypes.c_constant(nu, s_dtype),
            c_ctype=c_ctype,
            s_ctype=s_ctype))

示例#12

0

显示文件

    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, alpha.shape)

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                ${alpha.ctype} alpha = ${alpha.load_same};
                ${beta.ctype} beta = ${beta.load_same};
                ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                ${output.store_same}(t.x - ${ordering});
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                ordering=ordering(self._representation),
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan

示例#13

0

显示文件

文件： integrator.py 项目： fjarri-attic/vienna_simulation

def get_nonlinear_wrapper(components, c_dtype, nonlinear_module, dt):
    s_dtype = dtypes.real_for(c_dtype)
    return Module.create(
        """
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            %for pcomp in range(components):
            ${c_ctype} psi${pcomp},
            %endfor
            ${s_ctype} V, ${s_ctype} t)
        {
            ${c_ctype} nonlinear = ${nonlinear}${comp}(
                %for pcomp in range(components):
                psi${pcomp},
                %endfor
                V, t);
            return ${mul}(
                COMPLEX_CTR(${c_ctype})(0, -${dt}),
                nonlinear);
        }
        %endfor
        """,
        render_kwds=dict(
            components=components,
            c_ctype=dtypes.ctype(c_dtype),
            s_ctype=dtypes.ctype(s_dtype),
            mul=functions.mul(c_dtype, c_dtype),
            dt=dtypes.c_constant(dt, s_dtype),
            nonlinear=nonlinear_module))

示例#14

0

显示文件

def tr_scale(arr, coeff_t):
    return Transformation(
        [
            Parameter('o1', Annotation(arr, 'o')),
            Parameter('i1', Annotation(arr, 'i')),
            Parameter('s1', Annotation(coeff_t))
        ],
        "${o1.store_same}(${mul}(${i1.load_same}, ${s1}));",
        render_kwds=dict(
            mul=functions.mul(arr.dtype, coeff_t, out_dtype=arr.dtype)))

示例#15

0

显示文件

文件： dummy.py 项目： ringw/reikna

def tr_scale(arr, coeff_t):
    return Transformation(
        [
            Parameter("o1", Annotation(arr, "o")),
            Parameter("i1", Annotation(arr, "i")),
            Parameter("s1", Annotation(coeff_t)),
        ],
        "${o1.store_same}(${mul}(${i1.load_same}, ${s1}));",
        render_kwds=dict(mul=functions.mul(arr.dtype, coeff_t, out_dtype=arr.dtype)),
    )

示例#16

0

显示文件

文件： fft.py 项目： xbee/nufhe

def fft512(use_constant_memory=False):
    module = Module(TEMPLATE.get_def('fft512'),
                    render_kwds=dict(
                        elem_ctype=dtypes.ctype(numpy.complex128),
                        temp_ctype=dtypes.ctype(numpy.float64),
                        cdata_ctype=dtypes.ctype(numpy.complex128),
                        polar_unit=functions.polar_unit(numpy.float64),
                        mul=functions.mul(numpy.complex128, numpy.complex128),
                        use_constant_memory=use_constant_memory,
                    ))
    return FFT512(module, use_constant_memory)

示例#17

0

显示文件

文件： transformations.py 项目： fjarri/reikna

def mul_param(arr_t, param_dtype):
    """
    Returns a scaling transformation with a dynamic parameter (1 output, 1 input, 1 scalar):
    ``output = input * param``.
    """
    return Transformation(
        [Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i')),
        Parameter('param', Annotation(param_dtype))],
        "${output.store_same}(${mul}(${input.load_same}, ${param}));",
        render_kwds=dict(mul=functions.mul(arr_t.dtype, param_dtype, out_dtype=arr_t.dtype)))

示例#18

0

显示文件

文件： fft.py 项目： mgolub2/reikna

def get_common_kwds(dtype, device_params):
    return dict(
        dtype=dtype,
        min_mem_coalesce_width=device_params.min_mem_coalesce_width[dtype.itemsize],
        local_mem_banks=device_params.local_mem_banks,
        get_padding=get_padding,
        wrap_const=lambda x: dtypes.c_constant(x, dtypes.real_for(dtype)),
        min_blocks=helpers.min_blocks,
        mul=functions.mul(dtype, dtype),
        polar_unit=functions.polar_unit(dtypes.real_for(dtype)),
        cdivs=functions.div(dtype, numpy.uint32, out_dtype=dtype))

示例#19

0

显示文件

文件： fft.py 项目： jakirkham/reikna

def get_common_kwds(dtype, device_params):
    return dict(
        dtype=dtype,
        min_mem_coalesce_width=device_params.min_mem_coalesce_width[dtype.itemsize],
        local_mem_banks=device_params.local_mem_banks,
        get_padding=get_padding,
        wrap_const=lambda x: dtypes.c_constant(x, dtypes.real_for(dtype)),
        min_blocks=helpers.min_blocks,
        mul=functions.mul(dtype, dtype),
        polar_unit=functions.polar_unit(dtypes.real_for(dtype)),
        cdivs=functions.div(dtype, numpy.uint32, out_dtype=dtype))

示例#20

0

显示文件

文件： transformations.py 项目： xexo7C8/reikna

def mul_param(arr_t, param_dtype):
    """
    Returns a scaling transformation with a dynamic parameter (1 output, 1 input, 1 scalar):
    ``output = input * param``.
    """
    return Transformation(
        [Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i')),
        Parameter('param', Annotation(param_dtype))],
        "${output.store_same}(${mul}(${input.load_same}, ${param}));",
        render_kwds=dict(mul=functions.mul(arr_t.dtype, param_dtype, out_dtype=arr_t.dtype)))

示例#21

0

显示文件

def get_multiply_trf(arr):
    return Transformation(
        [
            Parameter('output', Annotation(arr, 'o')),
            Parameter('input1', Annotation(arr, 'i')),
            Parameter('input2', Annotation(arr, 'i'))
        ],
        "${output.store_same}(${mul}(${input1.load_same}, ${input2.load_same}));",
        connectors=['output', 'input1'],
        render_kwds=dict(
            mul=functions.mul(arr.dtype, arr.dtype, out_dtype=arr.dtype)))

示例#22

0

显示文件

文件： matrixmul.py 项目： xexo7C8/reikna

    def _build_plan(self, plan_factory, device_params, output, matrix_a,
                    matrix_b):
        bwo = self._block_width_override

        if bwo is not None:
            block_widths = [bwo]
        else:
            nbanks = device_params.local_mem_banks
            block_widths = [2**n for n in range(helpers.log2(nbanks), -1, -1)]

        a_batch = helpers.product(matrix_a.shape[:-2])
        b_batch = helpers.product(matrix_b.shape[:-2])
        batch = max(a_batch, b_batch)

        for block_width in block_widths:

            plan = plan_factory()

            if block_width**2 > device_params.max_work_group_size:
                continue

            num_steps = helpers.min_blocks(self._convolution_size, block_width)
            a_blocks = helpers.min_blocks(self._a_outer_size, block_width)
            b_blocks = helpers.min_blocks(self._b_outer_size, block_width)

            render_kwds = dict(batched_a=(a_batch != 1),
                               batched_b=(b_batch != 1),
                               transposed_a=self._transposed_a,
                               transposed_b=self._transposed_b,
                               num_steps=num_steps,
                               a_slices=(len(matrix_a.shape) - 2, 1, 1),
                               b_slices=(len(matrix_b.shape) - 2, 1, 1),
                               output_slices=(len(output.shape) - 2, 1, 1),
                               block_width=block_width,
                               mul=functions.mul(matrix_a.dtype,
                                                 matrix_b.dtype,
                                                 out_dtype=output.dtype))

            try:
                plan.kernel_call(TEMPLATE.get_def('matrixmul'),
                                 [output, matrix_a, matrix_b],
                                 kernel_name="kernel_matrixmul",
                                 global_size=(batch, a_blocks * block_width,
                                              b_blocks * block_width),
                                 local_size=(1, block_width, block_width),
                                 render_kwds=render_kwds)
            except OutOfResourcesError:
                continue

            return plan

        raise ValueError(
            "Could not find suitable call parameters for the kernel")

示例#23

0

显示文件

文件： matrixmul.py 项目： fjarri/reikna

    def _build_plan(self, plan_factory, device_params, output, matrix_a, matrix_b):
        bwo = self._block_width_override

        if bwo is not None:
            block_widths = [bwo]
        else:
            nbanks = device_params.local_mem_banks
            block_widths = [2 ** n for n in range(helpers.log2(nbanks), -1, -1)]

        a_batch = helpers.product(matrix_a.shape[:-2])
        b_batch = helpers.product(matrix_b.shape[:-2])
        batch = max(a_batch, b_batch)

        for block_width in block_widths:

            plan = plan_factory()

            if block_width ** 2 > device_params.max_work_group_size:
                continue

            num_steps = helpers.min_blocks(self._convolution_size, block_width)
            a_blocks = helpers.min_blocks(self._a_outer_size, block_width)
            b_blocks = helpers.min_blocks(self._b_outer_size, block_width)

            render_kwds = dict(
                batched_a=(a_batch != 1),
                batched_b=(b_batch != 1),
                transposed_a=self._transposed_a,
                transposed_b=self._transposed_b,
                num_steps=num_steps,
                a_slices=(len(matrix_a.shape) - 2, 1, 1),
                b_slices=(len(matrix_b.shape) - 2, 1, 1),
                output_slices=(len(output.shape) - 2, 1, 1),
                block_width=block_width,
                mul=functions.mul(matrix_a.dtype, matrix_b.dtype, out_dtype=output.dtype))

            try:
                plan.kernel_call(
                    TEMPLATE.get_def('matrixmul'),
                    [output, matrix_a, matrix_b],
                    kernel_name="kernel_matrixmul",
                    global_size=(
                        batch,
                        a_blocks * block_width,
                        b_blocks * block_width),
                    local_size=(1, block_width, block_width),
                    render_kwds=render_kwds)
            except OutOfResourcesError:
                continue

            return plan

        raise ValueError("Could not find suitable call parameters for the kernel")

示例#24

0

显示文件

文件： test_transformation.py 项目： ringw/reikna

def tr_1_to_2(arr):
    return Transformation(
        [Parameter('o1', Annotation(arr, 'o')),
        Parameter('o2', Annotation(arr, 'o')),
        Parameter('i1', Annotation(arr, 'i'))],
        """
        ${o1.ctype} t = ${mul}(${i1.load_same}, 0.5);
        ${o1.store_same}(t);
        ${o2.store_same}(t);
        """,
        render_kwds=dict(
            mul=functions.mul(arr.dtype, numpy.float32)))

示例#25

0

显示文件

    def __init__(self,
                 state_arr,
                 dt,
                 box=None,
                 kinetic_coeff=1,
                 nonlinear_module=None):
        scalar_dtype = dtypes.real_for(state_arr.dtype)
        Computation.__init__(self, [
            Parameter('output', Annotation(state_arr, 'o')),
            Parameter('input', Annotation(state_arr, 'i')),
            Parameter('t', Annotation(scalar_dtype))
        ])

        self._box = box
        self._kinetic_coeff = kinetic_coeff
        self._nonlinear_module = nonlinear_module
        self._components = state_arr.shape[0]
        self._ensembles = state_arr.shape[1]
        self._grid_shape = state_arr.shape[2:]

        ksquared = get_ksquared(self._grid_shape, self._box)
        self._kprop = numpy.exp(
            ksquared * (-1j * kinetic_coeff * dt / 2)).astype(state_arr.dtype)
        self._kprop_trf = Transformation(
            [
                Parameter('output', Annotation(state_arr, 'o')),
                Parameter('input', Annotation(state_arr, 'i')),
                Parameter('kprop', Annotation(self._kprop, 'i'))
            ],
            """
            ${kprop.ctype} kprop_coeff = ${kprop.load_idx}(${', '.join(idxs[2:])});
            ${output.store_same}(${mul}(${input.load_same}, kprop_coeff));
            """,
            render_kwds=dict(
                mul=functions.mul(state_arr.dtype, self._kprop.dtype)))

        self._fft = FFT(state_arr, axes=range(2, len(state_arr.shape)))
        self._fft_with_kprop = FFT(state_arr,
                                   axes=range(2, len(state_arr.shape)))
        self._fft_with_kprop.parameter.output.connect(
            self._kprop_trf,
            self._kprop_trf.input,
            output_prime=self._kprop_trf.output,
            kprop=self._kprop_trf.kprop)

        nonlinear_wrapper = get_nonlinear_wrapper(state_arr.dtype,
                                                  nonlinear_module, dt)
        self._N1 = get_nonlinear1(state_arr, scalar_dtype, nonlinear_wrapper)
        self._N2 = get_nonlinear2(state_arr, scalar_dtype, nonlinear_wrapper,
                                  dt)
        self._N3 = get_nonlinear3(state_arr, scalar_dtype, nonlinear_wrapper,
                                  dt)

示例#26

0

显示文件

文件： test_soliton.py 项目： fjarri-attic/vienna_simulation

def get_nonlinear(dtype, interaction, tunneling):
    r"""
    Nonlinear module

    .. math::

        N(\psi_1, ... \psi_C)
        = \sum_{n=1}^{C} U_{jn} |\psi_n|^2 \psi_j
          - \nu_j psi_{m_j}

    ``interaction``: a symmetrical ``components x components`` array with interaction strengths.
    ``tunneling``: a list of (other_comp, coeff) pairs of tunnelling strengths.
    """

    c_dtype = dtype
    c_ctype = dtypes.ctype(c_dtype)
    s_dtype = dtypes.real_for(dtype)
    s_ctype = dtypes.ctype(s_dtype)

    return Module.create(
        """
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            %for pcomp in range(components):
            ${c_ctype} psi${pcomp},
            %endfor
            ${s_ctype} V, ${s_ctype} t)
        {
            return (
                ${mul}(psi${comp}, (
                    %for other_comp in range(components):
                    + ${dtypes.c_constant(interaction[comp, other_comp], s_dtype)} *
                        ${norm}(psi${other_comp})
                    %endfor
                    + V
                    ))
                - ${mul}(
                    psi${tunneling[comp][0]},
                    ${dtypes.c_constant(tunneling[comp][1], s_dtype)})
                );
        }
        %endfor
        """,
        render_kwds=dict(
            components=interaction.shape[0],
            mul=functions.mul(c_dtype, s_dtype),
            norm=functions.norm(c_dtype),
            interaction=interaction,
            tunneling=tunneling,
            s_dtype=s_dtype,
            c_ctype=c_ctype,
            s_ctype=s_ctype))

示例#27

0

显示文件

文件： demo_specialized_fft.py 项目： fjarri/reikna

def get_multiply(output):
    return Transformation(
        [
            Parameter('output', Annotation(output, 'o')),
            Parameter('a', Annotation(output, 'i')),
            Parameter('b', Annotation(Type(output.dtype, (output.shape[-1],)), 'i'))
        ],
        """
        ${output.store_same}(${mul}(${a.load_same}, ${b.load_idx}(${idxs[-1]})));
        """,
        connectors=['output', 'a'],
        render_kwds=dict(mul=functions.mul(output.dtype, output.dtype))
        )

示例#28

0

显示文件

文件： transformations.py 项目： xexo7C8/reikna

def mul_const(arr_t, param):
    """
    Returns a scaling transformation with a fixed parameter (1 output, 1 input):
    ``output = input * param``.
    """
    param_dtype = dtypes.detect_type(param)
    return Transformation(
        [Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i'))],
        "${output.store_same}(${mul}(${input.load_same}, ${param}));",
        render_kwds=dict(
            mul=functions.mul(arr_t.dtype, param_dtype, out_dtype=arr_t.dtype),
            param=dtypes.c_constant(param, dtype=param_dtype)))

示例#29

0

显示文件

文件： transformations.py 项目： fjarri/reikna

def mul_const(arr_t, param):
    """
    Returns a scaling transformation with a fixed parameter (1 output, 1 input):
    ``output = input * param``.
    """
    param_dtype = dtypes.detect_type(param)
    return Transformation(
        [Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i'))],
        "${output.store_same}(${mul}(${input.load_same}, ${param}));",
        render_kwds=dict(
            mul=functions.mul(arr_t.dtype, param_dtype, out_dtype=arr_t.dtype),
            param=dtypes.c_constant(param, dtype=param_dtype)))

示例#30

0

显示文件

文件： test_transformation.py 项目： ringw/reikna

def tr_2_to_1(arr, scalar):
    return Transformation(
        [Parameter('o1', Annotation(arr, 'o')),
        Parameter('i1', Annotation(arr, 'i')),
        Parameter('i2', Annotation(arr, 'i')),
        Parameter('s1', Annotation(scalar))],
        """
        ${o1.ctype} t = ${mul}(${cast}(${s1}), ${i1.load_same});
        ${o1.store_same}(t + ${i2.load_same});
        """,
        render_kwds= dict(
            mul=functions.mul(arr.dtype, arr.dtype),
            cast=functions.cast(arr.dtype, scalar.dtype)))

示例#31

0

显示文件

文件： demo_specialized_fft.py 项目： xexo7C8/reikna

def get_multiply(output):
    return Transformation([
        Parameter('output', Annotation(output, 'o')),
        Parameter('a', Annotation(output, 'i')),
        Parameter('b', Annotation(Type(output.dtype,
                                       (output.shape[-1], )), 'i'))
    ],
                          """
        ${output.store_same}(${mul}(${a.load_same}, ${b.load_idx}(${idxs[-1]})));
        """,
                          connectors=['output', 'a'],
                          render_kwds=dict(
                              mul=functions.mul(output.dtype, output.dtype)))

示例#32

0

显示文件

文件： integrator.py 项目： fjarri-attic/vienna_simulation

    def __init__(self, state_arr, dt, box=None, kinetic_coeff=1, nonlinear_module=None):
        scalar_dtype = dtypes.real_for(state_arr.dtype)
        potential_arr = Type(scalar_dtype, shape=state_arr.shape[2:])

        Computation.__init__(self, [
            Parameter('output', Annotation(state_arr, 'o')),
            Parameter('input', Annotation(state_arr, 'i')),
            Parameter('potential1', Annotation(potential_arr, 'i')),
            Parameter('potential2', Annotation(potential_arr, 'i')),
            Parameter('t_potential1', Annotation(scalar_dtype)),
            Parameter('t_potential2', Annotation(scalar_dtype)),
            Parameter('t', Annotation(scalar_dtype))])

        self._box = box
        self._kinetic_coeff = kinetic_coeff
        self._nonlinear_module = nonlinear_module
        self._components = state_arr.shape[0]
        self._ensembles = state_arr.shape[1]
        self._grid_shape = state_arr.shape[2:]

        ksquared = get_ksquared(self._grid_shape, self._box)
        self._kprop = numpy.exp(ksquared * (-1j * kinetic_coeff * dt / 2)).astype(state_arr.dtype)
        self._kprop_trf = Transformation(
            [
                Parameter('output', Annotation(state_arr, 'o')),
                Parameter('input', Annotation(state_arr, 'i')),
                Parameter('kprop', Annotation(self._kprop, 'i'))],
            """
            ${kprop.ctype} kprop_coeff = ${kprop.load_idx}(${', '.join(idxs[2:])});
            ${output.store_same}(${mul}(${input.load_same}, kprop_coeff));
            """,
            render_kwds=dict(mul=functions.mul(state_arr.dtype, self._kprop.dtype)))

        self._fft = FFT(state_arr, axes=range(2, len(state_arr.shape)))
        self._fft_with_kprop = FFT(state_arr, axes=range(2, len(state_arr.shape)))
        self._fft_with_kprop.parameter.output.connect(
            self._kprop_trf, self._kprop_trf.input,
            output_prime=self._kprop_trf.output,
            kprop=self._kprop_trf.kprop)

        nonlinear_wrapper = get_nonlinear_wrapper(
            state_arr.shape[0], state_arr.dtype, nonlinear_module, dt)
        self._N1 = get_nonlinear1(state_arr, potential_arr, scalar_dtype, nonlinear_wrapper)
        self._N2 = get_nonlinear2(state_arr, potential_arr, scalar_dtype, nonlinear_wrapper, dt)
        self._N3 = get_nonlinear3(state_arr, potential_arr, scalar_dtype, nonlinear_wrapper, dt)
        self._potential_interpolator = get_potential_interpolator(potential_arr, dt)

示例#33

0

显示文件

    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, (alpha.shape[0], self._max_click_order))

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                VSIZE_T sample_idx = ${idxs[0]};
                VSIZE_T order = ${idxs[1]} + 1;

                ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
                for (VSIZE_T i = 0; i < ${modes}; i++) {
                    ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
                    ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
                    ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                    ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));

                    if (i >= order) {
                        result = ${mul_cc}(result, np);
                    }
                    else {
                        ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);
                        result = ${mul_cc}(result, cp);
                    }
                }

                ${output.store_same}(result.x);
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                modes=self._system.modes,
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan

示例#34

0

显示文件

def get_nonlinear_wrapper(c_dtype, nonlinear_module, dt):
    s_dtype = dtypes.real_for(c_dtype)
    return Module.create("""
        %for comp in (0, 1):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            ${c_ctype} psi0, ${c_ctype} psi1, ${s_ctype} t)
        {
            ${c_ctype} nonlinear = ${nonlinear}${comp}(psi0, psi1, t);
            return ${mul}(
                COMPLEX_CTR(${c_ctype})(0, -${dt}),
                nonlinear);
        }
        %endfor
        """,
                         render_kwds=dict(c_ctype=dtypes.ctype(c_dtype),
                                          s_ctype=dtypes.ctype(s_dtype),
                                          mul=functions.mul(c_dtype, c_dtype),
                                          dt=dtypes.c_constant(dt, s_dtype),
                                          nonlinear=nonlinear_module))

示例#35

0

显示文件

文件： test_functions.py 项目： SyamGadde/reikna

def test_multiarg_mul(thr, out_code, in_codes):
    """
    Checks multi-argument mul() with a variety of data types.
    """

    out_dtype, in_dtypes = generate_dtypes(out_code, in_codes)

    def reference_mul(*args):
        res = product(args)
        if not dtypes.is_complex(out_dtype) and dtypes.is_complex(res.dtype):
            res = res.real
        return res.astype(out_dtype)

    # Temporarily catching imaginary part truncation warnings
    with catch_warnings():
        filterwarnings("ignore", "", numpy.ComplexWarning)
        mul = functions.mul(*in_dtypes, out_dtype=out_dtype)

    check_func(thr, mul, reference_mul, out_dtype, in_dtypes)

示例#36

0

显示文件

文件： test_functions.py 项目： ringw/reikna

def test_multiarg_mul(thr, out_code, in_codes):
    """
    Checks multi-argument mul() with a variety of data types.
    """

    out_dtype, in_dtypes = generate_dtypes(out_code, in_codes)

    def reference_mul(*args):
        res = product(args)
        if not dtypes.is_complex(out_dtype) and dtypes.is_complex(res.dtype):
            res = res.real
        return res.astype(out_dtype)

    # Temporarily catching imaginary part truncation warnings
    with catch_warnings():
        filterwarnings("ignore", "", numpy.ComplexWarning)
        mul = functions.mul(*in_dtypes, out_dtype=out_dtype)

    check_func(thr, mul, reference_mul, out_dtype, in_dtypes)

示例#37

0

显示文件

文件： soliton.py 项目： fjarri/reikna-integrator

def get_diffusion(state_dtype, gamma):
    return Diffusion(
        Module.create(
            """
            <%
                r_dtype = dtypes.real_for(s_dtype)
                s_ctype = dtypes.ctype(s_dtype)
                r_ctype = dtypes.ctype(r_dtype)
            %>
            INLINE WITHIN_KERNEL ${s_ctype} ${prefix}0_0(
                const int idx_x,
                const ${s_ctype} psi,
                ${r_ctype} t)
            {
                return COMPLEX_CTR(${s_ctype})(${numpy.sqrt(gamma)}, 0);
            }
            """,
            render_kwds=dict(
                mul_cr=functions.mul(state_dtype, dtypes.real_for(state_dtype)),
                s_dtype=state_dtype,
                gamma=gamma)),
        state_dtype, components=1, noise_sources=1)

示例#38

0

显示文件

文件： dummy.py 项目： ringw/reikna

    def _build_plan(self, plan_factory, device_params, C, D, coeff1, coeff2):
        plan = plan_factory()
        nested = Dummy(C, D, coeff1, same_A_B=True)

        C_temp = plan.temp_array_like(C)
        D_temp = plan.temp_array_like(D)

        # Testing a computation call which uses the same argument for two parameters.
        plan.computation_call(nested, C_temp, D, C, C, coeff1)

        arr_dtype = C.dtype
        coeff_dtype = coeff2.dtype

        mul = functions.mul(arr_dtype, coeff_dtype)
        div = functions.div(arr_dtype, coeff_dtype)

        template = template_from(
            """
        <%def name="dummy(kernel_declaration, CC, C, D, coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${CC.store_idx}(idx0, idx1,
                ${C.load_idx}(idx0, idx1) +
                ${mul}(${D.load_idx}(idx0, idx1), ${coeff}));
        }
        </%def>
        """
        )

        # Testing a kernel call which uses the same argument for two parameters.
        plan.kernel_call(
            template.get_def("dummy"), [C, C_temp, C_temp, coeff2], global_size=C.shape, render_kwds=dict(mul=mul)
        )

        return plan

示例#39

0

显示文件

    def _build_plan(self, plan_factory, device_params, C, D, coeff1, coeff2):
        plan = plan_factory()
        nested = Dummy(C, D, coeff1, same_A_B=True)

        C_temp = plan.temp_array_like(C)
        D_temp = plan.temp_array_like(D)

        # Testing a computation call which uses the same argument for two parameters.
        plan.computation_call(nested, C_temp, D, C, C, coeff1)

        arr_dtype = C.dtype
        coeff_dtype = coeff2.dtype

        mul = functions.mul(arr_dtype, coeff_dtype)
        div = functions.div(arr_dtype, coeff_dtype)

        template = template_from("""
        <%def name="dummy(kernel_declaration, CC, C, D, coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${CC.store_idx}(idx0, idx1,
                ${C.load_idx}(idx0, idx1) +
                ${mul}(${D.load_idx}(idx0, idx1), ${coeff}));
        }
        </%def>
        """)

        # Testing a kernel call which uses the same argument for two parameters.
        plan.kernel_call(template.get_def('dummy'),
                         [C, C_temp, C_temp, coeff2],
                         global_size=C.shape,
                         render_kwds=dict(mul=mul))

        return plan

示例#40

0

显示文件

文件： demo_specialized_fft.py 项目： xexo7C8/reikna

    def _build_plan(self, plan_factory, device_params, output, input_):

        plan = plan_factory()

        N = (input_.shape[-1] - 1) * 2

        WNmk = numpy.exp(-2j * numpy.pi * numpy.arange(N // 2) / N)
        A = 0.5 * (1 - 1j * WNmk)
        B = 0.5 * (1 + 1j * WNmk)

        A_arr = plan.persistent_array(A.conj())
        B_arr = plan.persistent_array(B.conj())

        cfft_arr = Type(input_.dtype, input_.shape[:-1] + (N // 2, ))
        cfft = FFT(cfft_arr, axes=(len(input_.shape) - 1, ))

        prepare_output = prepare_irfft_output(cfft.parameter.output)

        cfft.parameter.output.connect(prepare_output,
                                      prepare_output.input,
                                      real_output=prepare_output.output)

        temp = plan.temp_array_like(cfft.parameter.input)

        batch_size = helpers.product(output.shape[:-1])

        plan.kernel_call(TEMPLATE.get_def('prepare_irfft_input'),
                         [temp, input_, A_arr, B_arr],
                         global_size=(batch_size, N // 2),
                         render_kwds=dict(slices=(len(input_.shape) - 1, 1),
                                          N=N,
                                          mul=functions.mul(
                                              input_.dtype, input_.dtype),
                                          conj=functions.conj(input_.dtype)))

        plan.computation_call(cfft, output, temp, inverse=True)

        return plan

示例#41

0

显示文件

文件： demo_specialized_fft.py 项目： fjarri/reikna

    def _build_plan(self, plan_factory, device_params, output, input_):

        plan = plan_factory()

        N = (input_.shape[-1] - 1) * 2

        WNmk = numpy.exp(-2j * numpy.pi * numpy.arange(N//2) / N)
        A = 0.5 * (1 - 1j * WNmk)
        B = 0.5 * (1 + 1j * WNmk)

        A_arr = plan.persistent_array(A.conj())
        B_arr = plan.persistent_array(B.conj())

        cfft_arr = Type(input_.dtype, input_.shape[:-1] + (N // 2,))
        cfft = FFT(cfft_arr, axes=(len(input_.shape) - 1,))

        prepare_output = prepare_irfft_output(cfft.parameter.output)

        cfft.parameter.output.connect(
            prepare_output, prepare_output.input, real_output=prepare_output.output)

        temp = plan.temp_array_like(cfft.parameter.input)

        batch_size = helpers.product(output.shape[:-1])

        plan.kernel_call(
            TEMPLATE.get_def('prepare_irfft_input'),
                [temp, input_, A_arr, B_arr],
                global_size=(batch_size, N // 2),
                render_kwds=dict(
                    slices=(len(input_.shape) - 1, 1),
                    N=N,
                    mul=functions.mul(input_.dtype, input_.dtype),
                    conj=functions.conj(input_.dtype)))

        plan.computation_call(cfft, output, temp, inverse=True)

        return plan

示例#42

0

显示文件

    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        for_reduction = Type(numpy.float64, (alpha.shape[0], self._max_moment))

        meter_trf = Transformation([
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
            ],
            """
                VSIZE_T sample_idx = ${idxs[0]};
                VSIZE_T order = ${idxs[1]};

                ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
                for (VSIZE_T i = 0; i <= order; i++) {
                    ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
                    ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
                    ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                    t.x -= ${ordering};
                    result = ${mul_cc}(result, t);
                }
                ${output.store_same}(result.x);
                """,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                ordering=ordering(self._representation),
                ))

        reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf, meter_trf.output, alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)

        return plan

示例#43

0

显示文件

    def _build_plan(self, plan_factory, device_params, C, D, A, B, coeff):
        plan = plan_factory()

        arr_dtype = C.dtype
        coeff_dtype = coeff.dtype

        mul = functions.mul(arr_dtype, coeff_dtype)
        div = functions.div(arr_dtype, coeff_dtype)

        template = template_from("""
        <%def name="dummy(kernel_declaration, C, D, A, B, coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${A.ctype} a = ${A.load_idx}(idx0, idx1);
            ${C.ctype} c = ${mul}(a, ${coeff});
            ${C.store_idx}(idx1, idx0, c);

            %if same_A_B:
                ${B.ctype} b = ${B.load_idx}(idx0, idx1);
                ${D.ctype} d = ${div}(b, ${coeff});
                ${D.store_idx}(idx0, idx1, d);
            %else:
            if (idx1 == 0)
            {
                ${B.ctype} b = ${B.load_idx}(idx0);
                ${D.ctype} d = ${div}(b, ${coeff});
                ${D.store_idx}(idx0, d);
            }
            %endif
        }
        </%def>

        <%def name="dummy2(kernel_declaration, CC, DD, C, D, pers_arr, const_coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${CC.store_idx}(idx0, idx1, ${C.load_idx}(idx0, idx1));

            %if same_A_B:
                ${DD.store_idx}(
                    idx0, idx1,
                    ${mul}(${D.load_idx}(idx0, idx1), ${const_coeff}) +
                        ${pers_arr.load_idx}(idx0, idx1));
            %else:
            if (idx1 == 0)
            {
                ${DD.store_idx}(
                    idx0,
                    ${mul}(${D.load_idx}(idx0), ${const_coeff}) +
                        ${pers_arr.load_idx}(idx0));
            }
            %endif
        }
        </%def>
        """)

        block_size = 8

        C_temp = plan.temp_array_like(C)
        D_temp = plan.temp_array_like(D)
        arr = plan.persistent_array(self._persistent_array)

        plan.kernel_call(template.get_def('dummy'),
                         [C_temp, D_temp, A, B, coeff],
                         global_size=A.shape,
                         local_size=(block_size, block_size),
                         render_kwds=dict(mul=mul,
                                          div=div,
                                          same_A_B=self._same_A_B))

        plan.kernel_call(template.get_def('dummy2'), [
            C, D, C_temp, D_temp,
            (self._persistent_array if self._test_kernel_adhoc_array else arr),
            (10 if self._test_untyped_scalar else numpy.float32(10))
        ],
                         global_size=A.shape,
                         local_size=(block_size, block_size),
                         render_kwds=dict(mul=mul, same_A_B=self._same_A_B))

        return plan

示例#44

0

显示文件

文件： dummy.py 项目： ringw/reikna

    def _build_plan(self, plan_factory, device_params, C, D, A, B, coeff):
        plan = plan_factory()

        arr_dtype = C.dtype
        coeff_dtype = coeff.dtype

        mul = functions.mul(arr_dtype, coeff_dtype)
        div = functions.div(arr_dtype, coeff_dtype)

        template = template_from(
            """
        <%def name="dummy(kernel_declaration, C, D, A, B, coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${A.ctype} a = ${A.load_idx}(idx0, idx1);
            ${C.ctype} c = ${mul}(a, ${coeff});
            ${C.store_idx}(idx1, idx0, c);

            %if same_A_B:
                ${B.ctype} b = ${B.load_idx}(idx0, idx1);
                ${D.ctype} d = ${div}(b, ${coeff});
                ${D.store_idx}(idx0, idx1, d);
            %else:
            if (idx1 == 0)
            {
                ${B.ctype} b = ${B.load_idx}(idx0);
                ${D.ctype} d = ${div}(b, ${coeff});
                ${D.store_idx}(idx0, d);
            }
            %endif
        }
        </%def>

        <%def name="dummy2(kernel_declaration, CC, DD, C, D, pers_arr, const_coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${CC.store_idx}(idx0, idx1, ${C.load_idx}(idx0, idx1));

            %if same_A_B:
                ${DD.store_idx}(
                    idx0, idx1,
                    ${mul}(${D.load_idx}(idx0, idx1), ${const_coeff}) +
                        ${pers_arr.load_idx}(idx0, idx1));
            %else:
            if (idx1 == 0)
            {
                ${DD.store_idx}(
                    idx0,
                    ${mul}(${D.load_idx}(idx0), ${const_coeff}) +
                        ${pers_arr.load_idx}(idx0));
            }
            %endif
        }
        </%def>
        """
        )

        block_size = 8

        C_temp = plan.temp_array_like(C)
        D_temp = plan.temp_array_like(D)
        arr = plan.persistent_array(self._persistent_array)

        plan.kernel_call(
            template.get_def("dummy"),
            [C_temp, D_temp, A, B, coeff],
            global_size=A.shape,
            local_size=(block_size, block_size),
            render_kwds=dict(mul=mul, div=div, same_A_B=self._same_A_B),
        )

        plan.kernel_call(
            template.get_def("dummy2"),
            [
                C,
                D,
                C_temp,
                D_temp,
                (self._persistent_array if self._test_kernel_adhoc_array else arr),
                (10 if self._test_untyped_scalar else numpy.float32(10)),
            ],
            global_size=A.shape,
            local_size=(block_size, block_size),
            render_kwds=dict(mul=mul, same_A_B=self._same_A_B),
        )

        return plan

示例#45

0

显示文件

文件： isaacudaimpl.py 项目： xialulee/WaveSyn

def get_procs(thr, N):
    fft = FFTFactory.create(thr, (N,), compile_=False)
    unimod_trans = Transformation(
        [Parameter('output', Annotation(Type(np.complex128, N), 'o')),
        Parameter('input', Annotation(Type(np.complex128, N), 'i'))],
        """
VSIZE_T idx = ${idxs[0]};
${input.ctype} val = ${input.load_same};
if (idx>${N}/2){
    val.x = 0.0;
    val.y = 0.0;
    ${output.store_same}(val);
}else
    ${output.store_same}(${polar_unit}(atan2(val.y, val.x)));
        """,
        render_kwds=dict(polar_unit=functions.polar_unit(dtype=np.float64), N=N)
    )
    fft.parameter.output.connect(unimod_trans, unimod_trans.input, uni=unimod_trans.output)
    fft_unimod = fft.compile(thr)
    
    mag_square = PureParallel(
        [Parameter('output', Annotation(Type(np.complex128, N), 'o')),
         Parameter('input', Annotation(Type(np.complex128, N), 'i'))],
        '''
VSIZE_T idx = ${idxs[0]};
${input.ctype} val = ${input.load_idx}(idx);  
val.x = val.x*val.x + val.y*val.y;
val.y = 0;
${output.store_idx}(idx, val);
        '''
    )
    mag_square = mag_square.compile(thr)
    
    apply_mask = PureParallel(
        [Parameter('output', Annotation(Type(np.complex128, N), 'o')),
         Parameter('origin', Annotation(Type(np.complex128, N), 'i')),
         Parameter('mask', Annotation(Type(np.double, N), 'i'))],
        '''
VSIZE_T idx = ${idxs[0]};
${output.store_idx}(idx, ${mul}(${origin.load_idx}(idx), ${mask.load_idx}(idx)));        
        ''',
        render_kwds=dict(mul=functions.mul(np.complex128, np.double))
    )
    apply_mask = apply_mask.compile(thr)
    
    combine_mag_phi = PureParallel(
        [Parameter('output', Annotation(Type(np.complex128, N), 'o')),
         Parameter('mag_square', Annotation(Type(np.complex128, N), 'i')),
         Parameter('phase', Annotation(Type(np.complex128, N), 'i'))],
        '''
VSIZE_T idx = ${idxs[0]};
double r = ${mag_square.load_idx}(idx).x;  
r = r<0.0 ? 0.0 : ${pow}(r, 0.5);
double2 v = ${phase.load_idx}(idx);
double angle = atan2(v.y, v.x);
${output.store_idx}(idx, ${polar}(r, angle));
        ''',
        render_kwds=dict(pow=functions.pow(np.double), polar=functions.polar(np.double))
    )
    combine_mag_phi = combine_mag_phi.compile(thr)
   
    return fft_unimod, mag_square, apply_mask, combine_mag_phi

示例#46

0

显示文件

    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        samples, modes = alpha.shape

        for_reduction = Type(alpha.dtype, (samples, self._max_total_clicks + 1))

        prepared_state = plan.temp_array_like(alpha)

        plan.kernel_call(
            TEMPLATE.get_def("compound_click_probability_prepare"),
            [prepared_state, alpha, beta],
            kernel_name="compound_click_probability_prepare",
            global_size=alpha.shape,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                ))

        # Block size is limited by the amount of available local memory.
        # In some OpenCL implementations the number reported cannot actually be fully used
        # (because it's used by kernel arguments), so we're padding it a little.
        local_mem_size = device_params.local_mem_size
        max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
        block_size = 2**helpers.log2(max_elems)

        # No reason to have block size larger than the number of modes
        block_size = min(block_size, helpers.bounding_power_of_2(modes))

        products_gsize = (samples, helpers.min_blocks(self._max_total_clicks + 1, block_size) * block_size)
        products = plan.temp_array_like(for_reduction)

        read_size = min(block_size, device_params.max_work_group_size)

        while read_size > 1:

            full_steps = modes // block_size
            remainder_size = modes % block_size

            try:
                plan.kernel_call(
                    TEMPLATE.get_def("compound_click_probability_aggregate"),
                    [products, prepared_state],
                    kernel_name="compound_click_probability_aggregate",
                    global_size=products_gsize,
                    local_size=(1, read_size,),
                    render_kwds=dict(
                        block_size=block_size,
                        read_size=read_size,
                        full_steps=full_steps,
                        remainder_size=remainder_size,
                        output_size=self._max_total_clicks + 1,
                        mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                        add_cc=functions.add(alpha.dtype, alpha.dtype),
                        polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                        modes=self._system.modes,
                        max_total_clicks=self._max_total_clicks,
                        ))

            except OutOfResourcesError:
                read_size //= 2

            break

        reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

        temp = plan.temp_array_like(reduction.parameter.output)

        plan.computation_call(reduction, temp, products)

        fft = FFT(temp)
        real_trf = Transformation([
            Parameter('output', Annotation(output, 'o')),
            Parameter('input', Annotation(temp, 'i')),
            ],
            """
                ${input.ctype} val = ${input.load_same};
                ${output.store_same}(val.x);
                """)
        fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

        plan.computation_call(fft, output, temp, True)

        return plan

示例#47

0

显示文件

文件： generate_gpu.py 项目： fjarri/squeezed-sim

    def _build_plan(self, plan_factory, device_params, alpha, beta, alpha_i,
                    beta_i, seed):
        plan = plan_factory()

        system = self._system
        representation = self._representation

        unitary = plan.persistent_array(self._system.unitary)

        needs_noise_matrix = representation != Representation.POSITIVE_P and system.needs_noise_matrix(
        )

        mmul = MatrixMul(alpha, unitary, transposed_b=True)

        if not needs_noise_matrix:

            # TODO: this could be sped up for repr != POSITIVE_P,
            # since in that case alpha == conj(beta), and we don't need to do two multuplications.

            mmul_beta = MatrixMul(beta, unitary, transposed_b=True)
            trf_conj = self._make_trf_conj()
            mmul_beta.parameter.matrix_b.connect(trf_conj,
                                                 trf_conj.output,
                                                 matrix_b_p=trf_conj.input)

            plan.computation_call(mmul, alpha, alpha_i, unitary)
            plan.computation_call(mmul_beta, beta, beta_i, unitary)

        else:

            noise_matrix = system.noise_matrix()
            noise_matrix_dev = plan.persistent_array(noise_matrix)

            # If we're here, it's not positive-P, and alpha == conj(beta).
            # This means we can just calculate alpha, and then build beta from it.

            w = plan.temp_array_like(alpha)
            temp_alpha = plan.temp_array_like(alpha)

            plan.computation_call(mmul, temp_alpha, alpha_i, unitary)

            bijection = philox(64, 2)

            # Keeping the kernel the same so it can be cached.
            # The seed will be passed as the computation parameter instead.
            keygen = KeyGenerator.create(bijection, seed=numpy.int32(0))

            sampler = normal_bm(bijection, numpy.float64)

            plan.kernel_call(TEMPLATE.get_def("generate_apply_matrix_noise"),
                             [w, seed],
                             kernel_name="generate_apply_matrix_noise",
                             global_size=alpha.shape,
                             render_kwds=dict(
                                 bijection=bijection,
                                 keygen=keygen,
                                 sampler=sampler,
                                 mul_cr=functions.mul(numpy.complex128,
                                                      numpy.float64),
                                 add_cc=functions.add(numpy.complex128,
                                                      numpy.complex128),
                             ))

            noise = plan.temp_array_like(alpha)
            plan.computation_call(mmul, noise, w, noise_matrix_dev)

            plan.kernel_call(TEMPLATE.get_def("add_noise"),
                             [alpha, beta, temp_alpha, noise],
                             kernel_name="add_noise",
                             global_size=alpha.shape,
                             render_kwds=dict(
                                 add=functions.add(numpy.complex128,
                                                   numpy.complex128),
                                 conj=functions.conj(numpy.complex128)))

        return plan

示例#48

0

显示文件

文件： test_modules.py 项目： xexo7C8/reikna

def multiplier(dtype, num=1):
    mul = functions.mul(dtype, dtype, out_dtype=dtype)
    return Module(TEMPLATE.get_def('multiplier'),
                  render_kwds=dict(dtype=dtype, num=num, mul=mul))

示例#49

0

显示文件

文件： polynomial_transform_fft.py 项目： stjordanis/nufhe

def transformed_mul(perf_params):
    return functions.mul(transformed_dtype(), transformed_dtype())

示例#50

0

显示文件

文件： test_modules.py 项目： SyamGadde/reikna

def multiplier(dtype, num=1):
    mul = functions.mul(dtype, dtype, out_dtype=dtype)
    return Module(
        TEMPLATE.get_def('multiplier'),
        render_kwds=dict(dtype=dtype, num=num, mul=mul))

示例#51

0

显示文件

文件： isaacudaimpl.py 项目： kaizhongkaizhong/WaveSyn

def get_procs(thr, N):
    fft = FFTFactory.create(thr, (N, ), compile_=False)
    unimod_trans = Transformation(
        [
            Parameter('output', Annotation(Type(np.complex128, N), 'o')),
            Parameter('input', Annotation(Type(np.complex128, N), 'i'))
        ],
        """
VSIZE_T idx = ${idxs[0]};
${input.ctype} val = ${input.load_same};
if (idx>${N}/2){
    val.x = 0.0;
    val.y = 0.0;
    ${output.store_same}(val);
}else
    ${output.store_same}(${polar_unit}(atan2(val.y, val.x)));
        """,
        render_kwds=dict(polar_unit=functions.polar_unit(dtype=np.float64),
                         N=N))
    fft.parameter.output.connect(unimod_trans,
                                 unimod_trans.input,
                                 uni=unimod_trans.output)
    fft_unimod = fft.compile(thr)

    mag_square = PureParallel([
        Parameter('output', Annotation(Type(np.complex128, N), 'o')),
        Parameter('input', Annotation(Type(np.complex128, N), 'i'))
    ], '''
VSIZE_T idx = ${idxs[0]};
${input.ctype} val = ${input.load_idx}(idx);  
val.x = val.x*val.x + val.y*val.y;
val.y = 0;
${output.store_idx}(idx, val);
        ''')
    mag_square = mag_square.compile(thr)

    apply_mask = PureParallel(
        [
            Parameter('output', Annotation(Type(np.complex128, N), 'o')),
            Parameter('origin', Annotation(Type(np.complex128, N), 'i')),
            Parameter('mask', Annotation(Type(np.double, N), 'i'))
        ],
        '''
VSIZE_T idx = ${idxs[0]};
${output.store_idx}(idx, ${mul}(${origin.load_idx}(idx), ${mask.load_idx}(idx)));        
        ''',
        render_kwds=dict(mul=functions.mul(np.complex128, np.double)))
    apply_mask = apply_mask.compile(thr)

    combine_mag_phi = PureParallel([
        Parameter('output', Annotation(Type(np.complex128, N), 'o')),
        Parameter('mag_square', Annotation(Type(np.complex128, N), 'i')),
        Parameter('phase', Annotation(Type(np.complex128, N), 'i'))
    ],
                                   '''
VSIZE_T idx = ${idxs[0]};
double r = ${mag_square.load_idx}(idx).x;  
r = r<0.0 ? 0.0 : ${pow}(r, 0.5);
double2 v = ${phase.load_idx}(idx);
double angle = atan2(v.y, v.x);
${output.store_idx}(idx, ${polar}(r, angle));
        ''',
                                   render_kwds=dict(
                                       pow=functions.pow(np.double),
                                       polar=functions.polar(np.double)))
    combine_mag_phi = combine_mag_phi.compile(thr)

    return fft_unimod, mag_square, apply_mask, combine_mag_phi

示例#52

0

显示文件

文件： rk46nl_stepper.py 项目： fjarri/reikna-integrator

def get_xpropagate(state_type, drift, diffusion=None, noise_type=None):

    real_dtype = dtypes.real_for(state_type.dtype)
    if diffusion is not None:
        noise_dtype = noise_type.dtype
    else:
        noise_dtype = real_dtype

    return PureParallel(
        [
            Parameter('output', Annotation(state_type, 'o')),
            Parameter('omega', Annotation(state_type, 'io')),
            Parameter('input', Annotation(state_type, 'i')),
            Parameter('kinput', Annotation(state_type, 'i'))]
            + ([Parameter('dW', Annotation(noise_type, 'i'))] if diffusion is not None else []) +
            [Parameter('ai', Annotation(real_dtype)),
            Parameter('bi', Annotation(real_dtype)),
            Parameter('ci', Annotation(real_dtype)),
            Parameter('t', Annotation(real_dtype)),
            Parameter('dt', Annotation(real_dtype)),
            Parameter('stage', Annotation(numpy.int32))],
        """
        <%
            coords = ", ".join(idxs[1:])
            trajectory = idxs[0]
            components = drift.components
            if diffusion is not None:
                noise_sources = diffusion.noise_sources
            psi_args = ", ".join("psi_" + str(c) for c in range(components))

            if diffusion is None:
                dW = None
        %>

        %for comp in range(components):
        ${output.ctype} omega_${comp};
        if (${stage} == 0)
        {
            omega_${comp} = ${dtypes.c_constant(0, output.dtype)};
        }
        else
        {
            omega_${comp} = ${omega.load_idx}(${trajectory}, ${comp}, ${coords});
        }
        ${output.ctype} psi_${comp} = ${input.load_idx}(${trajectory}, ${comp}, ${coords});
        ${output.ctype} kpsi_${comp} = ${kinput.load_idx}(${trajectory}, ${comp}, ${coords});
        ${output.ctype} dpsi_${comp};
        %endfor

        %if diffusion is not None:
        %for ncomp in range(noise_sources):
        ${dW.ctype} dW_${ncomp} = ${dW.load_idx}(${trajectory}, ${ncomp}, ${coords});
        %endfor
        %endif

        %for comp in range(components):
        dpsi_${comp} =
            kpsi_${comp}
            + ${mul_cr}(
                + ${drift.module}${comp}(${coords}, ${psi_args}, ${t} + ${dt} * ${ci}),
                ${dt})
            %if diffusion is not None:
            %for ncomp in range(noise_sources):
            + ${mul_cn}(${diffusion.module}${comp}_${ncomp}(
                ${coords}, ${psi_args}, ${t} + ${dt} * ${ci}), dW_${ncomp})
            %endfor
            %endif
            ;
        %endfor

        ${output.ctype} new_omega, new_u;
        %for comp in range(components):
        new_omega = ${mul_cr}(omega_${comp}, ${ai}) + dpsi_${comp};
        new_u = psi_${comp} + ${mul_cr}(new_omega, ${bi});
        if (${stage} < 5)
        {
            ${omega.store_idx}(${trajectory}, ${comp}, ${coords}, new_omega);
        }
        ${output.store_idx}(${trajectory}, ${comp}, ${coords}, new_u);
        %endfor
        """,
        guiding_array=(state_type.shape[0],) + state_type.shape[2:],
        render_kwds=dict(
            drift=drift,
            diffusion=diffusion,
            mul_cr=functions.mul(state_type.dtype, real_dtype),
            mul_cn=functions.mul(state_type.dtype, noise_dtype)))

示例#53

0

显示文件

文件： cdip_stepper.py 项目： fjarri/reikna-integrator

def get_prop_iter(state_type, drift, iterations, diffusion=None, noise_type=None):

    if dtypes.is_complex(state_type.dtype):
        real_dtype = dtypes.real_for(state_type.dtype)
    else:
        real_dtype = state_type.dtype

    if diffusion is not None:
        noise_dtype = noise_type.dtype
    else:
        noise_dtype = real_dtype

    return PureParallel(
        [
            Parameter('output', Annotation(state_type, 'o')),
            Parameter('input', Annotation(state_type, 'i'))]
            + ([Parameter('dW', Annotation(noise_type, 'i'))] if diffusion is not None else []) +
            [Parameter('t', Annotation(real_dtype)),
            Parameter('dt', Annotation(real_dtype))],
        """
        <%
            coords = ", ".join(idxs[1:])
            trajectory = idxs[0]
            components = drift.components
            if diffusion is not None:
                noise_sources = diffusion.noise_sources
            psi_args = ", ".join("psi_" + str(c) + "_tmp" for c in range(components))

            if diffusion is None:
                dW = None
        %>

        %for comp in range(components):
        ${output.ctype} psi_${comp} = ${input.load_idx}(${trajectory}, ${comp}, ${coords});
        ${output.ctype} psi_${comp}_tmp = psi_${comp};
        ${output.ctype} dpsi_${comp};
        %endfor

        %if diffusion is not None:
        %for ncomp in range(noise_sources):
        ${dW.ctype} dW_${ncomp} = ${dW.load_idx}(${trajectory}, ${ncomp}, ${coords});
        %endfor
        %endif

        %for i in range(iterations):

        %for comp in range(components):
        dpsi_${comp} =
            ${mul_cr}(
                ${mul_cr}(${drift.module}${comp}(
                    ${coords}, ${psi_args}, ${t} + ${dt} / 2), ${dt})
                %if diffusion is not None:
                %for ncomp in range(noise_sources):
                + ${mul_cn}(${diffusion.module}${comp}_${ncomp}(
                    ${coords}, ${psi_args}, ${t} + ${dt} / 2), dW_${ncomp})
                %endfor
                %endif
                , 0.5);
        %endfor

        %for comp in range(components):
        psi_${comp}_tmp = psi_${comp} + dpsi_${comp};
        %endfor

        %endfor

        %for comp in range(components):
        ${output.store_idx}(${trajectory}, ${comp}, ${coords}, psi_${comp}_tmp + dpsi_${comp});
        %endfor
        """,
        guiding_array=(state_type.shape[0],) + state_type.shape[2:],
        render_kwds=dict(
            drift=drift,
            diffusion=diffusion,
            iterations=iterations,
            mul_cr=functions.mul(state_type.dtype, real_dtype),
            mul_cn=functions.mul(state_type.dtype, noise_dtype)))