def __init__(self, name, reduce_func, expr, in_param, out_param, axis):
    """Reduction operation."""
    _fusion_thread_local.check_not_runtime()

    assert isinstance(name, str)
    assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
    assert isinstance(in_param, _TraceArray)
    assert isinstance(out_param, _TraceArray)
    assert isinstance(axis, tuple)
    assert all(0 <= x < in_param.ndim for x in axis)

    self.name = name
    self.preamble = reduce_func.preamble
    self.in_params = _VariableSet(in_param)
    self.out_params = _VariableSet(out_param)
    self.block_stride_name = 'block_stride_' + name
    self.axis = axis

    if reduce_func.identity is None:
        self.identity = ''
    else:
        self.identity = str(reduce_func.identity)

    _, self.expr, self.postmap_cast_code, self.reduce_ctype = expr
    if self.reduce_ctype is None:
        out_param, = self.out_params
        self.reduce_ctype = get_typename(out_param.dtype)

    self.premap_op = None
    self.postmap_op = None
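
# Illustrative usage sketch (not part of the module): reduction ops like
# the one above are constructed when ``cupy.fuse`` traces a fused kernel
# that ends in a reduction. A minimal, hedged example of the public API
# that exercises this path, assuming a recent CuPy:

import cupy

@cupy.fuse()
def sum_of_squares(x):
    # The elementwise square is fused with the trailing sum reduction.
    return cupy.sum(x * x)

a = cupy.arange(6, dtype=cupy.float32)
print(sum_of_squares(a))  # 55.0
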
def _arg_minor_reduce(self, ufunc, axis):
    """Reduce nonzeros with a ufunc over the minor axis when non-empty.

    Can be applied to a function of self.data by supplying data parameter.

    Warning: this does not call sum_duplicates()

    Args:
        ufunc (object): Function handle giving the operation to be
            conducted.
        axis (int): Matrix axis over which the reduction should be
            conducted.

    Returns:
        (cupy.ndarray): Reduce result for nonzeros in each major_index.

    """
    # Create the vector to hold output
    # Note: it's important to set "int" here, following what SciPy
    # does, as the outcome dtype is platform dependent
    out_shape = self.shape[1 - axis]
    out = cupy.zeros(out_shape, dtype=int)

    # Perform the calculation
    ker_name = '_arg_reduction<{}, {}>'.format(
        _scalar.get_typename(self.data.dtype),
        _scalar.get_typename(out.dtype))

    # Call to the appropriate kernel function
    if ufunc == cupy.argmax:
        ker = self._max_arg_reduction_mod.get_function('max' + ker_name)
    elif ufunc == cupy.argmin:
        ker = self._min_arg_reduction_mod.get_function('min' + ker_name)

    ker((out_shape,), (1,),
        (self.data, self.indices,
         self.indptr[:len(self.indptr) - 1],
         self.indptr[1:],
         cupy.int64(self.shape[axis]),
         out))

    return out
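
# Hedged usage sketch: this helper backs the public ``argmax``/``argmin``
# of compressed sparse matrices. Assuming ``cupyx.scipy.sparse`` exposes
# ``argmax`` on CSR matrices (mirroring SciPy):

import cupy
import cupyx.scipy.sparse as sparse

m = sparse.csr_matrix(cupy.array([[0., 2., 0.],
                                  [1., 0., 3.]], dtype=cupy.float32))
print(m.argmax(axis=1))  # indices of the per-row maxima
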
def emit_code(self):
    """Returns a CUDA device function code.

    Returns a string like:
    ```
    __device__ void cupy_add_0(int &in0_, float &in1_, double &out0_) {
        typedef double in0_type;
        typedef double in1_type;
        typedef double out0_type;
        double in0 = (double) in0_;
        double in1 = (double) in1_;
        double out0 = (double) out0_;
        out0 = in0 + in1;
        out0_ = out0;
    }
    ```
    """
    nin = len(self.in_params)
    dtypes = self.compute_dtypes
    assert len(self.in_params) == len(self.compute_dtypes[:nin])
    in_params = [
        (get_typename(p.dtype), get_typename(t), 'in{}'.format(i))
        for i, (p, t) in enumerate(zip(self.in_params, dtypes[:nin]))
    ]
    out_params = [
        (get_typename(p.dtype), get_typename(t), 'out{}'.format(i))
        for i, (p, t) in enumerate(zip(self.out_params, dtypes[nin:]))
    ]
    params = in_params + out_params
    params_code = ', '.join(['{} &{}_'.format(t, s) for t, _, s in params])
    typedef = ['typedef {} {}_type;'.format(t, s) for _, t, s in params]
    read = ['{} {} = ({}) {}_;'.format(t, s, t, s) for _, t, s in params]
    write = ['{}_ = {};'.format(s, s) for _, _, s in out_params]
    return _codeblock.CodeBlock(
        '__device__ void {}({})'.format(self.name, params_code),
        typedef + read + [self.routine_code + ';'] + write)
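
# Standalone sketch of the string assembly above, using hard-coded
# (hypothetical) parameter triples of (raw ctype, compute ctype, name),
# to show how the signature in the docstring comes about:

params = [('int', 'double', 'in0'),
          ('float', 'double', 'in1'),
          ('double', 'double', 'out0')]
params_code = ', '.join('{} &{}_'.format(t, s) for t, _, s in params)
print('__device__ void cupy_add_0({})'.format(params_code))
# __device__ void cupy_add_0(int &in0_, float &in1_, double &out0_)
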
def _set_dtype_to_astype_dict():
    """Set a dict with dtypes and astype ufuncs to `_dtype_to_astype_dict`.

    Creates a ufunc for type cast operations, and sets a dict whose keys
    are the dtypes of the output arrays and whose values are the
    corresponding astype ufuncs. This function is called at most once.
    """
    global _dtype_to_astype_dict
    _dtype_to_astype_dict = {}

    dtype_list = [numpy.dtype(type_char) for type_char in '?bhilqBHILQefdFD']

    for t in dtype_list:
        name = 'astype_{}'.format(t)
        rules = tuple(['{}->{}'.format(s.char, t.char) for s in dtype_list])
        command = 'out0 = static_cast< {} >(in0)'.format(get_typename(t))
        _dtype_to_astype_dict[t] = core.create_ufunc(name, rules, command)
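
# Illustration of the cast rules generated for a single target dtype
# (float32 here); this mirrors the ``rules`` construction above with
# plain NumPy, without creating the actual ufunc:

import numpy

dtype_list = [numpy.dtype(type_char) for type_char in '?bhilqBHILQefdFD']
t = numpy.dtype('f')
rules = tuple('{}->{}'.format(s.char, t.char) for s in dtype_list)
print(rules[:4])  # ('?->f', 'b->f', 'h->f', 'i->f')
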
def __str__(self):
    ctype = get_typename(self.dtype)
    c_contiguous = get_cuda_code_from_constant(self._c_contiguous, bool_)
    index_32_bits = get_cuda_code_from_constant(self._index_32_bits, bool_)
    return f'CArray<{ctype}, {self.ndim}, {c_contiguous}, {index_32_bits}>'
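
# Standalone sketch of the template instantiation string this produces,
# with hypothetical values (assuming booleans render as ``true``/``false``):

ctype, ndim, c_contiguous, index_32_bits = 'float', 2, 'true', 'true'
print(f'CArray<{ctype}, {ndim}, {c_contiguous}, {index_32_bits}>')
# CArray<float, 2, true, true>
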
def __str__(self):
    dtype = self.dtype
    if dtype == numpy.float16:
        # Compute in float32 for performance.
        dtype = numpy.dtype('float32')
    return get_typename(dtype)
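
# Standalone check of the promotion above (illustrative, not CuPy's
# get_typename itself): half-precision compute types map to float32.

import numpy

dtype = numpy.dtype('float16')
if dtype == numpy.float16:
    dtype = numpy.dtype('float32')
print(dtype)  # float32
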
def emit_submodule_codes(self):
    """Returns a CUDA device function code.

    The emitted code assumes that ``block_stride`` and ``blockDim.x``
    are powers of 2.
    """
    in_param, = self.in_params
    out_param, = self.out_params
    op_name = '{}_op'.format(self.name)
    postmap_name = '{}_postmap'.format(self.name)
    template = string.Template('''
#define ${op_name}(a, b) (${reduce_expr})
#define ${postmap_name}(a, out0) (${postmap_cast})

template <typename InType, typename OutType,
          typename InIndexerType, typename OutIndexerType>
__device__ void ${name}(
        InType in_arr, OutType out_arr,
        InIndexerType in_ind, OutIndexerType out_ind, int block_stride) {
    typedef ${in_type} type_in0_raw;
    typedef ${out_type} type_out0_raw;
    typedef ${reduce_ctype} _type_reduce;
    extern __shared__ char _sdata_raw[];
    _type_reduce *sdata = reinterpret_cast<_type_reduce*>(_sdata_raw);
    unsigned int tid = threadIdx.x;
    int _J = tid >> __popc(block_stride - 1);
    ptrdiff_t _j = (ptrdiff_t)_J * out_ind.size();
    int J_stride = blockDim.x >> __popc(block_stride - 1);
    ptrdiff_t j_stride = (ptrdiff_t)J_stride * out_ind.size();

    for (ptrdiff_t _i = (ptrdiff_t)blockIdx.x * block_stride;
            _i < out_ind.size();
            _i += (ptrdiff_t)gridDim.x * block_stride) {
        _type_reduce s = _type_reduce(${identity});
        ptrdiff_t i = _i + (tid & (block_stride - 1));
        for (ptrdiff_t j = i + _j; j < in_ind.size(); j += j_stride) {
            in_ind.set(j);
            s = ${op_name}(s,
                static_cast<_type_reduce>(in_arr[in_ind.get()]));
        }
        sdata[tid] = s;
        __syncthreads();
        for (unsigned int block = blockDim.x / 2;
                block >= block_stride; block >>= 1) {
            if (tid < block) {
                sdata[tid] = ${op_name}(sdata[tid], sdata[tid + block]);
            }
            __syncthreads();
        }
        if (tid < block_stride) {
            s = sdata[tid];
        }
        if (tid < block_stride && i < out_ind.size()) {
            out_ind.set(i);
            ${postmap_name}(s, out_arr[out_ind.get()]);
        }
        __syncthreads();
    }
}''')  # NOQA
    code = template.substitute(
        name=self.name,
        op_name=op_name,
        postmap_name=postmap_name,
        in_type=get_typename(in_param.dtype),
        out_type=get_typename(out_param.dtype),
        reduce_ctype=self.reduce_ctype,
        reduce_expr=self.expr,
        identity=self.identity,
        postmap_cast=self.postmap_cast_code)
    return [code]
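
# The kernel's index math relies on the power-of-two assumption stated in
# the docstring: when ``block_stride`` is a power of two,
# ``tid >> __popc(block_stride - 1)`` equals ``tid / block_stride`` and
# ``tid & (block_stride - 1)`` equals ``tid % block_stride``. A quick
# host-side check (``bin(...).count('1')`` plays the role of __popc):

block_stride = 8
popc = bin(block_stride - 1).count('1')
for tid in range(64):
    assert tid >> popc == tid // block_stride
    assert tid & (block_stride - 1) == tid % block_stride
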