def __call__(self, a, axis=None, dtype=None, out=None, keepdims=False):
    """Compile (if needed) and launch the reduction kernel on ``a``.

    Args:
        a (cupy.ndarray): Input array to reduce.
        axis (int or tuple of ints or None): Axis or axes along which the
            reduction is performed. ``None`` reduces over all axes.
        dtype: Requested dtype used when guessing the kernel routine.
        out (cupy.ndarray or None): Optional preallocated output array.
        keepdims (bool): If ``True``, reduced axes are kept with size one.

    Returns:
        cupy.ndarray or tuple of cupy.ndarray: The reduction result
        (a single array when there is exactly one output).

    Raises:
        TypeError: If ``a`` is not a :class:`cupy.ndarray`.
        ValueError: If ``a`` is empty and the reduction has no identity.
    """
    if not isinstance(a, cupy.ndarray):
        raise TypeError('Input type must be cupy.ndarray')
    # A reduction without an identity element has no defined result for an
    # empty input.  Raise explicitly instead of ``assert`` so the check
    # survives ``python -O`` and gives a useful message.
    if self.identity is None and a.size == 0:
        raise ValueError('zero-size array to reduction operation %s '
                         'which has no identity' % self.name)
    in_args = [a]
    out_args = [] if out is None else [out]
    internal.check_args_device(in_args + out_args)

    # Pick the concrete routine (map/reduce/post exprs) for these dtypes.
    in_types, out_types, routine = self._guess_routine(in_args, dtype)

    axis = _get_axis(axis, a.ndim)
    out_shape = _get_out_shape(a.shape, axis, keepdims)
    out_args = elementwise._get_out_args(
        in_args, out_args, out_types, out_shape)
    # Transpose so the reduced axes are contiguous for the kernel.
    in_args, in_shape = _get_trans_args(
        in_args, axis, in_args[0].shape)

    in_indexer = cindexer.Indexer(in_shape)
    out_indexer = cindexer.Indexer(out_shape)
    # Smallest power of two >= number of output elements.
    out_clp2_size = 2 ** int.bit_length(int(out_indexer.size - 1))

    inout_args, is_ndarray = _get_inout_args(
        in_args, out_args, in_indexer, out_indexer, out_clp2_size,
        self._params, True)
    param_types = elementwise._get_kernel_param_types(inout_args)
    params = elementwise._get_kernel_params(
        self._params, is_ndarray, param_types)
    block_size = 512
    reduce_type = routine[3]
    if reduce_type is None:
        # Fall back to the output dtype's C type name.
        reduce_type = elementwise._get_typename(out_types[0])
    type_preamble = (
        'typedef {} type_in0_raw; typedef {} type_out0_raw;'.format(
            elementwise._get_typename(in_args[0].dtype),
            elementwise._get_typename(out_args[0].dtype)))
    kern = _make_reduction_function_kernel(
        self.name, block_size, reduce_type, params, self.identity,
        routine[0], routine[1], routine[2], type_preamble,
        self._input_expr, self._output_expr, self._preamble)
    # Presumably 32 bytes of shared memory per thread; disabled for large
    # outputs.  TODO(okuta) set actual size
    shared_mem = 32 * block_size
    if out_clp2_size > 256:
        shared_mem = 0
    kern.linear_launch(max(out_indexer.size, block_size), inout_args,
                       shared_mem=shared_mem, block_max_size=block_size)
    if len(out_args) == 1:
        return out_args[0]
    return tuple(out_args)
def __call__(self, *args, **kwargs):
    """Compiles and invokes the reduction kernel.

    The compilation runs only if the kernel is not cached. Note that the
    kernels with different argument dtypes, ndims, or axis are not
    compatible. It means that single ReductionKernel object may be compiled
    into multiple kernel binaries.

    Args:
        args: Arguments of the kernel.

    Returns:
        Arrays are returned according to the ``out_params`` argument of the
        ``__init__`` method.

    Raises:
        TypeError: If the number of arguments is wrong or an argument is
            ``None``.
        ValueError: If ``out`` is given both positionally and as a keyword,
            or if the input is empty and the reduction has no identity.
    """
    out = kwargs.pop('out', None)
    axis = kwargs.get('axis', None)
    keepdims = kwargs.get('keepdims', False)

    # Accept either just the inputs, or inputs followed by outputs.
    if not (len(args) == self.nin or len(args) == self.nin + self.nout):
        raise TypeError('Wrong number of arguments for %s' % self.name)
    # Explicit check instead of ``assert``: survives ``python -O`` and
    # reports a clear error for a ``None`` argument.
    if any(i is None for i in args):
        raise TypeError('Argument must not be None')

    out_args = list(args[self.nin:])
    if out is not None:
        if self.nout != 1:
            raise NotImplementedError('')
        if len(out_args) != 0:
            raise ValueError("cannot specify 'out' as both "
                             "a positional and keyword argument")
        out_args = [out]

    brod, in_args = elementwise._broadcast(args, self.in_params)
    internal.check_args_device(in_args + out_args)

    # A reduction without an identity element has no defined result for an
    # empty input; raise instead of asserting.
    if self.identity is None and brod.size == 0:
        raise ValueError('zero-size array to reduction operation %s '
                         'which has no identity' % self.name)

    in_types, out_types, types = elementwise._decide_params_type(
        self.in_params, self.out_params,
        elementwise._get_ndarray_dtype(in_args),
        elementwise._get_ndarray_dtype(out_args))

    axis = _get_axis(axis, brod.nd)
    out_shape = _get_out_shape(brod.shape, axis, keepdims)
    # Promote scalar arguments to the decided parameter types.
    in_args = [x if isinstance(x, cupy.ndarray) else t.type(x)
               for x, t in six.moves.zip(in_args, in_types)]
    # Transpose so the reduced axes are contiguous for the kernel.
    in_args, in_shape = _get_trans_args(
        in_args, axis, brod.shape, self.in_params)
    out_args = elementwise._get_out_args(
        in_args, out_args, out_types, out_shape, self.out_params)

    in_indexer = cindexer.Indexer(in_shape)
    out_indexer = cindexer.Indexer(out_shape)
    # Smallest power of two >= number of output elements.
    out_clp2_size = 2 ** int.bit_length(int(out_indexer.size - 1))

    inout_args, is_ndarray = _get_inout_args(
        in_args, out_args, in_indexer, out_indexer, out_clp2_size,
        self.params, self.reduce_dims)
    param_types = elementwise._get_kernel_param_types(inout_args)
    exprs = _get_reduction_kernel(
        self.params, is_ndarray, param_types, types)
    block_size = 512
    kern = _make_reduction_function_kernel(
        self.name, block_size, self.reduce_type, exprs[0], self.identity,
        self.map_expr, self.reduce_expr, self.post_map_expr,
        exprs[1], exprs[2], exprs[3], self.preamble)
    # Presumably 32 bytes of shared memory per thread; disabled for large
    # outputs.  TODO(okuta) set actual size
    shared_mem = 32 * block_size
    if out_clp2_size > 256:
        shared_mem = 0
    kern.linear_launch(max(out_indexer.size, block_size), inout_args,
                       shared_mem=shared_mem, block_max_size=block_size)
    return out_args[0]
def __call__(self, a, axis=None, dtype=None, out=None, keepdims=False):
    """Run this reduction over ``a`` and return the resulting array(s).

    The kernel routine is selected from the input dtype (and ``dtype``
    hint), the input is transposed so the reduced axes are contiguous,
    and a cached CUDA kernel is launched over the result indexers.
    """
    if not isinstance(a, cupy.ndarray):
        raise TypeError('Input type must be cupy.ndarray')
    if self.identity is None:
        # No identity element: an empty reduction is undefined.
        assert a.size != 0

    arrays_in = [a]
    arrays_out = [out] if out is not None else []
    internal.check_args_device(arrays_in + arrays_out)

    # Choose the concrete routine tuple for these dtypes.
    dtypes_in, dtypes_out, routine = self._guess_routine(arrays_in, dtype)

    axis = _get_axis(axis, a.ndim)
    shape_out = _get_out_shape(a.shape, axis, keepdims)
    arrays_out = elementwise._get_out_args(
        arrays_in, arrays_out, dtypes_out, shape_out)
    arrays_in, shape_in = _get_trans_args(
        arrays_in, axis, arrays_in[0].shape)

    idx_in = cindexer.Indexer(shape_in)
    idx_out = cindexer.Indexer(shape_out)
    # Round the output element count up to a power of two.
    clp2 = 2 ** int.bit_length(int(idx_out.size - 1))

    kernel_args, ndarray_flags = _get_inout_args(
        arrays_in, arrays_out, idx_in, idx_out, clp2, self._params, True)
    arg_types = elementwise._get_kernel_param_types(kernel_args)
    param_decl = elementwise._get_kernel_params(
        self._params, ndarray_flags, arg_types)

    block_size = 512
    reduce_type = routine[3]
    if reduce_type is None:
        reduce_type = elementwise._get_typename(dtypes_out[0])

    type_preamble = (
        'typedef {} type_in0_raw; typedef {} type_out0_raw;'.format(
            elementwise._get_typename(arrays_in[0].dtype),
            elementwise._get_typename(arrays_out[0].dtype)))

    kern = _make_reduction_function_kernel(
        self.name, block_size, reduce_type, param_decl, self.identity,
        routine[0], routine[1], routine[2], type_preamble,
        self._input_expr, self._output_expr, self._preamble)

    # Disable shared memory for large outputs.
    # TODO(okuta) set actual size
    shared_mem = 0 if clp2 > 256 else 32 * block_size
    kern.linear_launch(max(idx_out.size, block_size), kernel_args,
                       shared_mem=shared_mem, block_max_size=block_size)

    return arrays_out[0] if len(arrays_out) == 1 else tuple(arrays_out)