Example #1
    def __init__(self, name, reduce_func, expr, in_param, out_param, axis):
        """Reduction operation.
        """
        _fusion_thread_local.check_not_runtime()
        assert isinstance(name, str)
        assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
        assert isinstance(in_param, _TraceArray)
        assert isinstance(out_param, _TraceArray)
        assert isinstance(axis, tuple)
        assert all(0 <= x < in_param.ndim for x in axis)

        self.name = name
        self.preamble = reduce_func.preamble
        self.in_params = _VariableSet(in_param)
        self.out_params = _VariableSet(out_param)
        self.block_stride_name = 'block_stride_' + name
        self.axis = axis

        if reduce_func.identity is None:
            self.identity = ''
        else:
            self.identity = str(reduce_func.identity)

        _, self.expr, self.postmap_cast_code, self.reduce_ctype = expr
        if self.reduce_ctype is None:
            out_param, = self.out_params
            self.reduce_ctype = get_typename(out_param.dtype)

        self.premap_op = None
        self.postmap_op = None
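
As a rough illustration of the validate-and-unpack pattern above, here is a minimal standalone sketch (not CuPy's internal classes; `_SketchReductionOp` and its typename table are made up for this example):

import numpy

class _SketchReductionOp:
    # Hypothetical stand-in mirroring the 4-tuple unpack of ``expr`` and the
    # dtype-based fallback for ``reduce_ctype`` shown above.
    def __init__(self, name, expr, out_dtype, axis, ndim):
        assert isinstance(name, str)
        assert isinstance(axis, tuple)
        assert all(0 <= x < ndim for x in axis)
        _, self.expr, self.postmap_cast_code, self.reduce_ctype = expr
        if self.reduce_ctype is None:
            # Fall back to the C typename of the output dtype
            # (simplified mapping; the real code calls get_typename).
            self.reduce_ctype = {'float32': 'float',
                                 'float64': 'double'}[numpy.dtype(out_dtype).name]

op = _SketchReductionOp('sum_0', (None, 'a + b', 'out0 = a', None),
                        numpy.float64, (0,), ndim=2)
print(op.expr, op.reduce_ctype)  # -> a + b double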
Example #2
    def _arg_minor_reduce(self, ufunc, axis):
        """Reduce nonzeros with a ufunc over the minor axis when non-empty

        Can be applied to a function of self.data by supplying the data
        parameter.
        Warning: this does not call sum_duplicates()

        Args:
            ufunc (object): Function handle giving the operation to be
                conducted.
            axis (int): Matrix axis over which the reduction should be
                conducted.

        Returns:
            (cupy.ndarray): Reduction result for the nonzeros in each
                major_index

        """

        # Call to the appropriate kernel function
        # Create the vector to hold output
        # Note: it's important to set "int" here, following what SciPy
        # does, as the outcome dtype is platform dependent
        out_shape = self.shape[1 - axis]
        out = cupy.zeros(out_shape, dtype=int)

        # Perform the calculation
        ker_name = '_arg_reduction<{}, {}>'.format(
            _scalar.get_typename(self.data.dtype),
            _scalar.get_typename(out.dtype))

        if ufunc == cupy.argmax:
            ker = self._max_arg_reduction_mod.get_function('max' + ker_name)
        elif ufunc == cupy.argmin:
            ker = self._min_arg_reduction_mod.get_function('min' + ker_name)
        else:
            raise ValueError('only cupy.argmax and cupy.argmin are supported')

        ker((out_shape,), (1,),
            (self.data, self.indices,
             self.indptr[:len(self.indptr) - 1],
             self.indptr[1:], cupy.int64(self.shape[axis]),
             out))

        return out
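
For context, this method backs minor-axis argmax/argmin on compressed sparse matrices. A small usage sketch follows, assuming a CUDA device is available (the exact internal dispatch path is an assumption here):

import cupy
import cupyx.scipy.sparse as sparse

m = sparse.csr_matrix(cupy.array([[0., 3., 0.],
                                  [2., 0., 5.]]))
# For a CSR matrix, axis=1 is the minor axis, so this reduction ends up in a
# minor-axis argument reduction like the method above.
print(m.argmax(axis=1))  # column index of the maximum in each row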
Example #3
    def emit_code(self):
        """Returns a CUDA device function code.

        Returns a string like:
        ```
        __device__ void cupy_add_0(int &in0_, float &in1_, double &out0_) {
            typedef double in0_type;
            typedef double in1_type;
            typedef double out0_type;
            double in0 = (double) in0_;
            double in1 = (double) in1_;
            double out0 = (double) out0_;
            out0 = in0 + in1;
            out0_ = out0;
        }
        ```
        """
        nin = len(self.in_params)
        dtypes = self.compute_dtypes
        assert len(self.in_params) == len(self.compute_dtypes[:nin])
        in_params = [
            (get_typename(p.dtype), get_typename(t), 'in{}'.format(i))
            for i, (p, t) in enumerate(zip(self.in_params, dtypes[:nin]))
        ]
        out_params = [
            (get_typename(p.dtype), get_typename(t), 'out{}'.format(i))
            for i, (p, t) in enumerate(zip(self.out_params, dtypes[nin:]))
        ]
        params = in_params + out_params

        params_code = ', '.join(['{} &{}_'.format(t, s) for t, _, s in params])
        typedef = ['typedef {} {}_type;'.format(t, s) for _, t, s in params]
        read = ['{} {} = ({}) {}_;'.format(t, s, t, s) for _, t, s in params]
        write = ['{}_ = {};'.format(s, s) for _, _, s in out_params]

        return _codeblock.CodeBlock(
            '__device__ void {}({})'.format(self.name, params_code),
            typedef + read + [self.routine_code + ';'] + write)
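
The string assembly above can be reproduced in isolation. Below is a minimal sketch with hard-coded typenames in place of get_typename and _codeblock.CodeBlock (all values here are illustrative only):

# Parameter triples: (raw C type, compute C type, variable name).
params = [('int', 'double', 'in0'), ('float', 'double', 'in1'),
          ('double', 'double', 'out0')]
params_code = ', '.join('{} &{}_'.format(t, s) for t, _, s in params)
typedef = ['typedef {} {}_type;'.format(t, s) for _, t, s in params]
read = ['{} {} = ({}) {}_;'.format(t, s, t, s) for _, t, s in params]
write = ['{}_ = {};'.format(s, s) for _, _, s in params if s.startswith('out')]
body = typedef + read + ['out0 = in0 + in1;'] + write
print('__device__ void cupy_add_0({}) {{'.format(params_code))
print('\n'.join('    ' + line for line in body))
print('}')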
Example #4
def _set_dtype_to_astype_dict():
    """Set a dict with dtypes and astype ufuncs to `_dtype_to_astype_dict`.

    Creates a ufunc for type cast operations, and sets a dict whose keys are
    the dtypes of the output arrays and whose values are the astype ufuncs.
    This function is called at most once.
    """
    global _dtype_to_astype_dict
    _dtype_to_astype_dict = {}

    dtype_list = [numpy.dtype(type_char) for type_char in '?bhilqBHILQefdFD']

    for t in dtype_list:
        name = 'astype_{}'.format(t)
        rules = tuple(['{}->{}'.format(s.char, t.char) for s in dtype_list])
        command = 'out0 = static_cast< {} >(in0)'.format(get_typename(t))
        _dtype_to_astype_dict[t] = core.create_ufunc(name, rules, command)
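
The same dict-of-casters idea can be sketched with plain NumPy astype calls in place of cupy.core.create_ufunc (a simplification, not the real elementwise kernels):

import numpy

_astype = {}
for t in [numpy.dtype(c) for c in '?bhilqBHILQefdFD']:
    # Bind t per iteration; each value casts its input to that dtype.
    _astype[t] = (lambda t: (lambda a: a.astype(t)))(t)

x = numpy.arange(3, dtype=numpy.int32)
print(_astype[numpy.dtype('f')](x).dtype)  # -> float32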
Example #5
    def __str__(self):
        ctype = get_typename(self.dtype)
        c_contiguous = get_cuda_code_from_constant(self._c_contiguous, bool_)
        index_32_bits = get_cuda_code_from_constant(self._index_32_bits, bool_)
        return f'CArray<{ctype}, {self.ndim}, {c_contiguous}, {index_32_bits}>'
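For instance, a C-contiguous 2-D float32 array with 32-bit indexing would render as follows (typenames are hard-coded here; the real code goes through get_typename and get_cuda_code_from_constant):

ctype, ndim = 'float', 2
c_contiguous, index_32_bits = 'true', 'true'
print(f'CArray<{ctype}, {ndim}, {c_contiguous}, {index_32_bits}>')
# -> CArray<float, 2, true, true>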
Example #6
    def __str__(self):
        dtype = self.dtype
        if dtype == numpy.float16:
            # Compute half-precision values as float32 for performance.
            dtype = numpy.dtype('float32')
        return get_typename(dtype)
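A quick illustration of the float16 promotion above (the typename mapping is simplified here, not CuPy's get_typename):

import numpy

dtype = numpy.dtype(numpy.float16)
if dtype == numpy.float16:
    # Half precision is computed as float32, so the emitted typename is
    # 'float' rather than a half type.
    dtype = numpy.dtype('float32')
print({numpy.dtype('float32'): 'float'}[dtype])  # -> float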
Example #7
    def emit_submodule_codes(self):
        """Returns a CUDA device function code.

        The emitted code assumes that ``block_stride`` and `blockDim.x` is a
        power of 2.
        """

        in_param, = self.in_params
        out_param, = self.out_params
        op_name = '{}_op'.format(self.name)
        postmap_name = '{}_postmap'.format(self.name)

        template = string.Template('''
#define ${op_name}(a, b) (${reduce_expr})
#define ${postmap_name}(a, out0) (${postmap_cast})

template <typename InType, typename OutType, typename InIndexerType, typename OutIndexerType>
__device__ void ${name}(
        InType in_arr, OutType out_arr,
        InIndexerType in_ind, OutIndexerType out_ind, int block_stride) {
    typedef ${in_type} type_in0_raw;
    typedef ${out_type} type_out0_raw;
    typedef ${reduce_ctype} _type_reduce;
    extern __shared__ char _sdata_raw[];
    _type_reduce *sdata = reinterpret_cast<_type_reduce*>(_sdata_raw);
    unsigned int tid = threadIdx.x;
    int _J = tid >> __popc(block_stride - 1);
    ptrdiff_t _j = (ptrdiff_t)_J * out_ind.size();
    int J_stride = blockDim.x >> __popc(block_stride - 1);
    ptrdiff_t j_stride = (ptrdiff_t)J_stride * out_ind.size();

    for (ptrdiff_t _i = (ptrdiff_t)blockIdx.x * block_stride; _i < out_ind.size(); _i += (ptrdiff_t)gridDim.x * block_stride) {
        _type_reduce s = _type_reduce(${identity});
        ptrdiff_t i = _i + (tid & (block_stride - 1));
        for (ptrdiff_t j = i + _j; j < in_ind.size(); j += j_stride) {
            in_ind.set(j);
            s = ${op_name}(s, static_cast<_type_reduce>(in_arr[in_ind.get()]));
        }
        sdata[tid] = s;
        __syncthreads();
        for (unsigned int block = blockDim.x / 2; block >= block_stride; block >>= 1) {
            if (tid < block) {
                sdata[tid] = ${op_name}(sdata[tid], sdata[tid + block]);
            }
            __syncthreads();
        }
        if (tid < block_stride) {
            s = sdata[tid];
        }
        if (tid < block_stride && i < out_ind.size()) {
            out_ind.set(i);
            ${postmap_name}(s, out_arr[out_ind.get()]);
        }
        __syncthreads();
    }
}''')  # NOQA
        code = template.substitute(name=self.name,
                                   op_name=op_name,
                                   postmap_name=postmap_name,
                                   in_type=get_typename(in_param.dtype),
                                   out_type=get_typename(out_param.dtype),
                                   reduce_ctype=self.reduce_ctype,
                                   reduce_expr=self.expr,
                                   identity=self.identity,
                                   postmap_cast=self.postmap_cast_code)

        return [code]
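
The emission itself is a plain string.Template substitution. Here is a minimal standalone sketch of the same pattern (template heavily shortened; the names are illustrative only):

import string

template = string.Template('''
#define ${op_name}(a, b) (${reduce_expr})
__device__ void ${name}(const float *in, float *out) { /* ... */ }
''')
print(template.substitute(name='sum_reduce_0',
                          op_name='sum_reduce_0_op',
                          reduce_expr='a + b'))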