예제 #1
0
    def __call__(self, a, axis=None, dtype=None, out=None, keepdims=False):
        """Reduces an array along the given axis.

        Args:
            a (cupy.ndarray): Array to reduce.
            axis: Axis specifier; normalized by ``_get_axis`` against
                ``a.ndim``.
            dtype: Data type specifier used to select the routine.
            out (cupy.ndarray): Output array. When omitted, the output is
                prepared by ``elementwise._get_out_args``.
            keepdims (bool): Passed to ``_get_out_shape`` when computing the
                output shape.

        Returns:
            The output array, or a tuple of arrays if more than one output
            was prepared.

        Raises:
            TypeError: If ``a`` is not a ``cupy.ndarray``.
            ValueError: If ``a`` is empty and the reduction has no identity.

        """
        if not isinstance(a, cupy.ndarray):
            raise TypeError('Input type must be cupy.ndarray')

        # Explicit check instead of ``assert`` so that it still fires when
        # Python runs with -O; the message mirrors NumPy's.
        if self.identity is None and a.size == 0:
            raise ValueError(
                'zero-size array to reduction operation %s '
                'which has no identity' % self.name)

        in_args = [a]
        out_args = [] if out is None else [out]
        internal.check_args_device(in_args + out_args)

        in_types, out_types, routine = self._guess_routine(in_args, dtype)

        axis = _get_axis(axis, a.ndim)
        out_shape = _get_out_shape(a.shape, axis, keepdims)
        out_args = elementwise._get_out_args(in_args, out_args, out_types,
                                             out_shape)
        in_args, in_shape = _get_trans_args(in_args, axis, in_args[0].shape)

        in_indexer = cindexer.Indexer(in_shape)
        out_indexer = cindexer.Indexer(out_shape)
        # For size >= 1 this is the smallest power of two that is >= size.
        out_clp2_size = 2**int.bit_length(int(out_indexer.size - 1))

        inout_args, is_ndarray = _get_inout_args(in_args, out_args, in_indexer,
                                                 out_indexer, out_clp2_size,
                                                 self._params, True)
        param_types = elementwise._get_kernel_param_types(inout_args)
        params = elementwise._get_kernel_params(self._params, is_ndarray,
                                                param_types)

        block_size = 512
        # routine[3] optionally overrides the type used for the reduction;
        # fall back to the type name of the first output otherwise.
        reduce_type = routine[3]
        if reduce_type is None:
            reduce_type = elementwise._get_typename(out_types[0])

        type_preamble = (
            'typedef {} type_in0_raw; typedef {} type_out0_raw;'.format(
                elementwise._get_typename(in_args[0].dtype),
                elementwise._get_typename(out_args[0].dtype)))

        kern = _make_reduction_function_kernel(
            self.name, block_size, reduce_type, params, self.identity,
            routine[0], routine[1], routine[2], type_preamble,
            self._input_expr, self._output_expr, self._preamble)
        # NOTE(review): shared memory is requested only when the rounded
        # output size is at most 256 -- presumably the kernel takes another
        # path for larger outputs; confirm against the kernel template.
        shared_mem = 32 * block_size
        if out_clp2_size > 256:
            shared_mem = 0
        # TODO(okuta) set actual size
        kern.linear_launch(max(out_indexer.size, block_size),
                           inout_args,
                           shared_mem=shared_mem,
                           block_max_size=block_size)

        if len(out_args) == 1:
            return out_args[0]
        return tuple(out_args)
예제 #2
0
    def __call__(self, *args, **kwargs):
        """Applies the universal function to arguments elementwise.

        Args:
            args: Input arguments. Each of them can be a cupy.ndarray object or
                a scalar. The output arguments can be omitted, given
                positionally after the inputs, or specified by the ``out``
                argument.
            out (cupy.ndarray): Output array. New arrays are prepared by
                default.
            dtype: Data type specifier.

        Returns:
            Output array or a tuple of output arrays.

        Raises:
            TypeError: If the number of positional arguments is neither
                ``nin`` nor ``nargs``.
            ValueError: If the output is given both positionally and via
                ``out``.

        """
        out = kwargs.get('out', None)
        dtype = kwargs.get('dtype', None)

        if not (len(args) == self.nin or len(args) == self.nargs):
            raise TypeError('Wrong number of arguments for %s' % self.name)
        # Debug-time sanity check only; it is stripped under -O.
        assert all(i is not None for i in args)

        brod = cupy.broadcast(*args)
        in_args = brod.values[:self.nin]
        out_args = list(args[self.nin:])
        if out is not None:
            # Raise a real exception (same as NumPy) rather than asserting,
            # so that the conflict is reported even under -O.
            if out_args:
                raise ValueError("cannot specify 'out' as both "
                                 "a positional and keyword argument")
            internal.check_args_device((out, ))
            out_args = [out]
        internal.check_args_device(in_args + out_args)

        in_types, out_types, routine = self._guess_routine(in_args, dtype)

        # Non-array inputs are converted to scalars of the selected type.
        in_args = [
            x if isinstance(x, cupy.ndarray) else t.type(x)
            for x, t in six.moves.zip(in_args, in_types)
        ]
        out_args = _get_out_args(in_args, out_args, out_types, brod.shape)

        if len(out_args) == 1:
            ret = out_args[0]
        else:
            ret = tuple(out_args)

        # Nothing to compute for an empty broadcast shape; skip the launch.
        if 0 in brod.shape:
            return ret

        indexer = cindexer.Indexer(brod.shape)
        inout_args, is_ndarray = _get_inout_args(in_args + out_args, indexer,
                                                 self._params, True)
        param_types = _get_kernel_param_types(inout_args)
        out_raw_types = tuple(x.dtype for x in out_args)
        kern = _get_ufunc_kernel(in_types, out_types, out_raw_types,
                                 is_ndarray, param_types, self._params,
                                 routine, self.name, self._preamble)

        kern.linear_launch(indexer.size, inout_args)
        return ret
예제 #3
0
    def __call__(self, *args, **kwargs):
        """Applies the universal function to arguments elementwise.

        Args:
            args: Input arguments. Each of them can be a cupy.ndarray object or
                a scalar. Output arguments may follow the inputs
                positionally, be passed through ``out``, or be omitted.
            out (cupy.ndarray): Output array. If omitted, new arrays are
                prepared for the result.
            dtype: Data type specifier.

        Returns:
            Output array or a tuple of output arrays.

        Raises:
            TypeError: If the positional argument count matches neither
                ``nin`` nor ``nargs``.
            ValueError: If an output is passed positionally and ``out`` is
                given as well.

        """
        out = kwargs.get('out', None)
        dtype = kwargs.get('dtype', None)

        if not (len(args) == self.nin or len(args) == self.nargs):
            raise TypeError('Wrong number of arguments for %s' % self.name)
        # Sanity check for development builds; not evaluated under -O.
        assert all(i is not None for i in args)

        brod = cupy.broadcast(*args)
        in_args = brod.values[:self.nin]
        out_args = list(args[self.nin:])
        if out is not None:
            # User error, so report it with an exception that survives -O
            # (NumPy raises ValueError with the same message).
            if len(out_args) != 0:
                raise ValueError("cannot specify 'out' as both "
                                 "a positional and keyword argument")
            internal.check_args_device((out,))
            out_args = [out]
        internal.check_args_device(in_args + out_args)

        in_types, out_types, routine = self._guess_routine(in_args, dtype)

        # Wrap every non-ndarray input in the scalar type chosen for it.
        in_args = [x if isinstance(x, cupy.ndarray) else t.type(x)
                   for x, t in six.moves.zip(in_args, in_types)]
        out_args = _get_out_args(in_args, out_args, out_types, brod.shape)

        if len(out_args) == 1:
            ret = out_args[0]
        else:
            ret = tuple(out_args)

        # A zero-length dimension means there are no elements to process.
        if 0 in brod.shape:
            return ret

        indexer = cindexer.Indexer(brod.shape)
        inout_args, is_ndarray = _get_inout_args(
            in_args + out_args, indexer, self._params, True)
        param_types = _get_kernel_param_types(inout_args)
        out_raw_types = tuple(x.dtype for x in out_args)
        kern = _get_ufunc_kernel(
            in_types, out_types, out_raw_types,
            is_ndarray, param_types, self._params,
            routine, self.name, self._preamble)

        kern.linear_launch(indexer.size, inout_args)
        return ret
예제 #4
0
    def __call__(self, *args, **kwargs):
        """Compiles and invokes the reduction kernel.

        The compilation runs only if the kernel is not cached. Note that the
        kernels with different argument dtypes, ndims, or axis are not
        compatible. It means that single ReductionKernel object may be compiled
        into multiple kernel binaries.

        Args:
            args: Arguments of the kernel: the ``nin`` inputs, optionally
                followed by the ``nout`` outputs.
            out (cupy.ndarray): Output array, as a keyword alternative to the
                positional output. Only supported when the kernel has exactly
                one output.
            axis: Axis specifier; normalized by ``_get_axis``.
            keepdims (bool): Passed to ``_get_out_shape`` when computing the
                output shape.

        Returns:
            The first output array.

        Raises:
            TypeError: If the number of positional arguments is wrong.
            NotImplementedError: If ``out`` is given for a kernel whose
                number of outputs is not one.
            ValueError: If the output is given both positionally and via
                ``out``, or if the broadcast input is empty and the kernel
                has no identity.

        """

        out = kwargs.pop('out', None)
        axis = kwargs.get('axis', None)
        keepdims = kwargs.get('keepdims', False)

        if not (len(args) == self.nin or
                len(args) == self.nin + self.nout):
            raise TypeError('Wrong number of arguments for %s' % self.name)
        # Debug-time sanity check only; it is stripped under -O.
        assert all(i is not None for i in args)

        out_args = list(args[self.nin:])
        if out is not None:
            if self.nout != 1:
                raise NotImplementedError(
                    "'out' keyword is only supported for kernels "
                    "with a single output")
            if len(out_args) != 0:
                raise ValueError("cannot specify 'out' as both "
                                 "a positional and keyword argument")
            out_args = [out]

        brod, in_args = elementwise._broadcast(args, self.in_params)

        internal.check_args_device(in_args + out_args)

        # Explicit check instead of ``assert`` so that an empty reduction
        # without identity is rejected even when Python runs with -O.
        if self.identity is None and brod.size == 0:
            raise ValueError(
                'zero-size array to reduction kernel %s '
                'which has no identity' % self.name)
        in_types, out_types, types = elementwise._decide_params_type(
            self.in_params, self.out_params,
            elementwise._get_ndarray_dtype(in_args),
            elementwise._get_ndarray_dtype(out_args))

        axis = _get_axis(axis, brod.nd)
        out_shape = _get_out_shape(brod.shape, axis, keepdims)
        # Non-array inputs are converted to scalars of the decided type.
        in_args = [x if isinstance(x, cupy.ndarray) else t.type(x)
                   for x, t in six.moves.zip(in_args, in_types)]
        in_args, in_shape = _get_trans_args(
            in_args, axis, brod.shape, self.in_params)
        out_args = elementwise._get_out_args(
            in_args, out_args, out_types, out_shape, self.out_params)

        in_indexer = cindexer.Indexer(in_shape)
        out_indexer = cindexer.Indexer(out_shape)
        # For size >= 1 this is the smallest power of two that is >= size.
        out_clp2_size = 2 ** int.bit_length(int(out_indexer.size - 1))

        inout_args, is_ndarray = _get_inout_args(
            in_args, out_args, in_indexer, out_indexer, out_clp2_size,
            self.params, self.reduce_dims)
        param_types = elementwise._get_kernel_param_types(inout_args)

        exprs = _get_reduction_kernel(
            self.params, is_ndarray, param_types, types)
        block_size = 512
        kern = _make_reduction_function_kernel(
            self.name, block_size, self.reduce_type, exprs[0], self.identity,
            self.map_expr, self.reduce_expr, self.post_map_expr,
            exprs[1], exprs[2], exprs[3], self.preamble)
        # NOTE(review): shared memory is requested only when the rounded
        # output size is at most 256 -- presumably the kernel takes another
        # path for larger outputs; confirm against the kernel template.
        shared_mem = 32 * block_size
        if out_clp2_size > 256:
            shared_mem = 0
        # TODO(okuta) set actual size
        kern.linear_launch(max(out_indexer.size, block_size), inout_args,
                           shared_mem=shared_mem,
                           block_max_size=block_size)
        return out_args[0]
예제 #5
0
    def __call__(self, a, axis=None, dtype=None, out=None, keepdims=False):
        """Applies the reduction to an array.

        Args:
            a (cupy.ndarray): Input array.
            axis: Axis specifier. It is normalized by ``_get_axis`` using
                the number of dimensions of ``a``.
            dtype: Data type specifier given to the routine selection.
            out (cupy.ndarray): Output array. If it is not given, the output
                is prepared by ``elementwise._get_out_args``.
            keepdims (bool): Given to ``_get_out_shape`` to compute the shape
                of the output.

        Returns:
            Output array, or a tuple of output arrays when there is more than
            one.

        Raises:
            TypeError: If ``a`` is not a ``cupy.ndarray``.
            ValueError: If ``a`` has no elements and the reduction has no
                identity.

        """
        if not isinstance(a, cupy.ndarray):
            raise TypeError('Input type must be cupy.ndarray')

        # Raise instead of asserting: asserts vanish under -O, and NumPy
        # reports this case as a ValueError with the same wording.
        if self.identity is None and a.size == 0:
            raise ValueError(
                'zero-size array to reduction operation %s '
                'which has no identity' % self.name)

        in_args = [a]
        if out is None:
            out_args = []
        else:
            out_args = [out]
        internal.check_args_device(in_args + out_args)

        in_types, out_types, routine = self._guess_routine(in_args, dtype)

        axis = _get_axis(axis, a.ndim)
        out_shape = _get_out_shape(a.shape, axis, keepdims)
        out_args = elementwise._get_out_args(
            in_args, out_args, out_types, out_shape)
        in_args, in_shape = _get_trans_args(
            in_args, axis, in_args[0].shape)

        in_indexer = cindexer.Indexer(in_shape)
        out_indexer = cindexer.Indexer(out_shape)
        # Output size rounded up to a power of two (for size >= 1).
        out_clp2_size = 2 ** int.bit_length(int(out_indexer.size - 1))

        inout_args, is_ndarray = _get_inout_args(
            in_args, out_args, in_indexer, out_indexer, out_clp2_size,
            self._params, True)
        param_types = elementwise._get_kernel_param_types(inout_args)
        params = elementwise._get_kernel_params(
            self._params, is_ndarray, param_types)

        block_size = 512
        # The fourth routine entry, when set, names the reduction type;
        # otherwise the type name of the first output is used.
        reduce_type = routine[3]
        if reduce_type is None:
            reduce_type = elementwise._get_typename(out_types[0])

        type_preamble = (
            'typedef {} type_in0_raw; typedef {} type_out0_raw;'.format(
                elementwise._get_typename(in_args[0].dtype),
                elementwise._get_typename(out_args[0].dtype)))

        kern = _make_reduction_function_kernel(
            self.name,
            block_size,
            reduce_type,
            params,
            self.identity,
            routine[0], routine[1], routine[2],
            type_preamble, self._input_expr, self._output_expr,
            self._preamble)
        # NOTE(review): no shared memory is requested once the rounded
        # output size exceeds 256 -- the reason is not visible here; confirm
        # against the kernel template.
        shared_mem = 32 * block_size
        if out_clp2_size > 256:
            shared_mem = 0
        # TODO(okuta) set actual size
        kern.linear_launch(max(out_indexer.size, block_size), inout_args,
                           shared_mem=shared_mem,
                           block_max_size=block_size)

        if len(out_args) == 1:
            return out_args[0]
        return tuple(out_args)
예제 #6
0
    def __call__(self, *args, **kwargs):
        """Compiles and invokes the elementwise kernel.

        The kernel is compiled only when no cached binary is available. A
        single ElementwiseKernel object may therefore be compiled into
        several binaries, because kernels with different argument dtypes or
        ndims are not compatible with each other.

        Args:
            args: Arguments of the kernel.
            size (int): Range size of the indices. If specified, the variable
                ``n`` is set to this value. Otherwise, the result of
                broadcasting is used to determine the value of ``n``.

        Returns:
            A single array if the kernel has one output, otherwise a tuple
            of the output arrays.

        """
        size = kwargs.pop('size', None)

        argc = len(args)
        if argc != self.nin and argc != self.nin + self.nout:
            raise TypeError('Wrong number of arguments for %s' % self.name)
        for arg in args:
            if isinstance(arg, numpy.ndarray):
                raise TypeError('Unsupported type %s' % type(arg))
        assert all(arg is not None for arg in args)
        internal.check_args_device(args)

        brod, values = _broadcast(args, self.params, size is None)
        inputs = values[:self.nin]
        outputs = values[self.nin:]
        in_types, out_types, types = _decide_params_type(
            self.in_params, self.out_params,
            _get_ndarray_dtype(inputs), _get_ndarray_dtype(outputs))

        # Non-array inputs become scalars of the decided parameter type.
        converted = []
        for value, in_type in six.moves.zip(inputs, in_types):
            if not isinstance(value, cupy.ndarray):
                value = in_type.type(value)
            converted.append(value)
        outputs = _get_out_args(
            converted, outputs, out_types, brod.shape, self.out_params)

        ret = outputs[0] if len(outputs) == 1 else tuple(outputs)

        # An explicit size overrides the broadcast shape as index range.
        indexer = cindexer.Indexer(brod.shape if size is None else (size,))

        if brod.size == 0:
            return ret

        inout_args, is_ndarray = _get_inout_args(
            converted + outputs, indexer, self.params, self.reduce_dims)
        kern = _get_elementwise_kernel(
            self.params, is_ndarray, _get_kernel_param_types(inout_args),
            types, self.operation, self.name, self.options, **self.kwargs)
        kern.linear_launch(indexer.size, inout_args)
        return ret
예제 #7
0
    def __call__(self, *args, **kwargs):
        """Runs the elementwise kernel, compiling it first if necessary.

        Compilation is skipped when the kernel is already cached. Kernels
        built for different argument dtypes or ndims are not interchangeable,
        so one ElementwiseKernel object may produce multiple kernel binaries.

        Args:
            args: Arguments of the kernel.
            size (int): Range size of the indices. If specified, the variable
                ``n`` is set to this value. Otherwise, the result of
                broadcasting is used to determine the value of ``n``.

        Returns:
            The output array when there is exactly one, otherwise a tuple of
            all output arrays.

        """
        n = kwargs.pop('size', None)
        size_from_broadcast = n is None

        num_args = len(args)
        if num_args != self.nin:
            if num_args != self.nin + self.nout:
                raise TypeError(
                    'Wrong number of arguments for %s' % self.name)
        for item in args:
            if isinstance(item, numpy.ndarray):
                raise TypeError('Unsupported type %s' % type(item))
        assert all(item is not None for item in args)
        internal.check_args_device(args)

        brod, value = _broadcast(args, self.params, size_from_broadcast)
        in_args, out_args = value[:self.nin], value[self.nin:]
        in_dtypes = _get_ndarray_dtype(in_args)
        out_dtypes = _get_ndarray_dtype(out_args)
        in_types, out_types, types = _decide_params_type(
            self.in_params, self.out_params, in_dtypes, out_dtypes)

        # Keep ndarrays as they are; cast everything else to its scalar type.
        in_args = [
            t.type(x) if not isinstance(x, cupy.ndarray) else x
            for x, t in six.moves.zip(in_args, in_types)
        ]
        out_args = _get_out_args(in_args, out_args, out_types, brod.shape,
                                 self.out_params)

        if len(out_args) != 1:
            ret = tuple(out_args)
        else:
            ret = out_args[0]

        # The index range comes from ``size`` when given, else from the
        # broadcast shape.
        if size_from_broadcast:
            index_shape = brod.shape
        else:
            index_shape = (n, )
        indexer = cindexer.Indexer(index_shape)

        if brod.size == 0:
            return ret

        all_args = in_args + out_args
        inout_args, is_ndarray = _get_inout_args(all_args, indexer,
                                                 self.params, self.reduce_dims)
        param_types = _get_kernel_param_types(inout_args)
        kern = _get_elementwise_kernel(self.params, is_ndarray, param_types,
                                       types, self.operation, self.name,
                                       self.options, **self.kwargs)
        kern.linear_launch(indexer.size, inout_args)
        return ret