Example #1
File: reduction.py Project: thecobb/PyCUDA
    def __call__(self, *args, **kwargs):
        MAX_BLOCK_COUNT = 1024
        SMALL_SEQ_COUNT = 4

        s1_func = self.stage1_func
        s2_func = self.stage2_func

        kernel_wrapper = kwargs.get("kernel_wrapper")
        if kernel_wrapper is not None:
            s1_func = kernel_wrapper(s1_func)
            s2_func = kernel_wrapper(s2_func)

        stream = kwargs.get("stream")

        from gpuarray import empty

        f = s1_func
        arg_types = self.stage1_arg_types

        # Stage 1 reduces the input to block_count partial results; the loop
        # then reruns with the stage-2 kernel until a single result remains.
        while True:
            invocation_args = []
            vectors = []

            for arg, arg_tp in zip(args, arg_types):
                # "P" marks a GPUArray argument passed by device pointer.
                if arg_tp == "P":
                    vectors.append(arg)
                    invocation_args.append(arg.gpudata)
                else:
                    invocation_args.append(arg)

            repr_vec = vectors[0]
            sz = repr_vec.size

            if sz <= self.block_size * SMALL_SEQ_COUNT * MAX_BLOCK_COUNT:
                total_block_size = SMALL_SEQ_COUNT * self.block_size
                block_count = (sz + total_block_size - 1) // total_block_size
                seq_count = SMALL_SEQ_COUNT
            else:
                block_count = MAX_BLOCK_COUNT
                macroblock_size = block_count * self.block_size
                seq_count = (sz + macroblock_size - 1) // macroblock_size

            if block_count == 1:
                result = empty((), self.dtype_out, repr_vec.allocator)
            else:
                result = empty((block_count, ), self.dtype_out,
                               repr_vec.allocator)

            #print block_count, seq_count, self.block_size
            f((block_count, 1), stream,
              *([result.gpudata] + invocation_args + [seq_count, sz]))

            if block_count == 1:
                return result
            else:
                f = s2_func
                arg_types = self.stage2_arg_types
                args = [result]
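The __call__ above is PyCUDA's two-stage reduction driver. A minimal usage sketch of the ReductionKernel front end that invokes it, following the API documented in current PyCUDA (the dot-product expressions are illustrative and may differ slightly from this older fork):

import numpy as np
import pycuda.autoinit          # creates a CUDA context on the default device
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel

# Stage 1 computes per-block partial sums of x[i]*y[i]; stage 2 keeps
# reducing those partial results until block_count == 1.
dot = ReductionKernel(np.float32, neutral="0",
                      reduce_expr="a+b", map_expr="x[i]*y[i]",
                      arguments="float *x, float *y")

x = gpuarray.arange(400, dtype=np.float32)
y = gpuarray.arange(400, dtype=np.float32)
print(dot(x, y).get())          # scalar result copied back to the host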
Example #2
File: reduction.py Project: minrk/PyCUDA
    def __call__(self, *args, **kwargs):
        MAX_BLOCK_COUNT = 1024
        SMALL_SEQ_COUNT = 4

        s1_func = self.stage1_func
        s2_func = self.stage2_func

        kernel_wrapper = kwargs.get("kernel_wrapper")
        if kernel_wrapper is not None:
            s1_func = kernel_wrapper(s1_func)
            s2_func = kernel_wrapper(s2_func)

        stream = kwargs.get("stream")

        from gpuarray import empty

        f = s1_func
        arg_types = self.stage1_arg_types

        while True:
            invocation_args = []
            vectors = []

            for arg, arg_tp in zip(args, arg_types):
                if arg_tp == "P":
                    vectors.append(arg)
                    invocation_args.append(arg.gpudata)
                else:
                    invocation_args.append(arg)

            repr_vec = vectors[0]
            sz = repr_vec.size

            if sz <= self.block_size*SMALL_SEQ_COUNT*MAX_BLOCK_COUNT:
                total_block_size = SMALL_SEQ_COUNT*self.block_size
                block_count = (sz + total_block_size - 1) // total_block_size
                seq_count = SMALL_SEQ_COUNT
            else:
                block_count = MAX_BLOCK_COUNT
                macroblock_size = block_count*self.block_size
                seq_count = (sz + macroblock_size - 1) // macroblock_size

            if block_count == 1:
                result = empty((), self.dtype_out, repr_vec.allocator)
            else:
                result = empty((block_count,), self.dtype_out, repr_vec.allocator)

            #print block_count, seq_count, self.block_size
            f((block_count, 1), stream,
                    *([result.gpudata]+invocation_args+[seq_count, sz]))

            if block_count == 1:
                return result
            else:
                f = s2_func
                arg_types = self.stage2_arg_types
                args = [result]
Example #3
def elemwise2(a,
              op,
              b,
              ary,
              odtype=None,
              oper=None,
              op_tmpl="res[i] = (%(out_t)s)%(a)s %(op)s (%(out_t)s)%(b)s",
              broadcast=False):
    ndim_extend = True
    if not isinstance(a, gpuarray.GpuArray):
        a = numpy.asarray(a)
        ndim_extend = False
    if not isinstance(b, gpuarray.GpuArray):
        b = numpy.asarray(b)
        ndim_extend = False
    if odtype is None:
        odtype = get_common_dtype(a, b, True)

    a_arg = as_argument(a, 'a')
    b_arg = as_argument(b, 'b')

    args = [ArrayArg(odtype, 'res'), a_arg, b_arg]

    if ndim_extend:
        if a.ndim != b.ndim:
            nd = max(a.ndim, b.ndim)
            if a.ndim < nd:
                a = a.reshape(((1, ) * (nd - a.ndim)) + a.shape)
            if b.ndim < nd:
                b = b.reshape(((1, ) * (nd - b.ndim)) + b.shape)
        out_shape = tuple(max(sa, sb) for sa, sb in zip(a.shape, b.shape))
        res = gpuarray.empty(out_shape,
                             dtype=odtype,
                             context=ary.context,
                             cls=ary.__class__)
    else:
        res = ary._empty_like_me(dtype=odtype)

    if oper is None:
        oper = op_tmpl % {
            'a': a_arg.expr(),
            'op': op,
            'b': b_arg.expr(),
            'out_t': dtype_to_ctype(odtype)
        }

    k = ElemwiseKernel(ary.context, args, oper)
    k(res, a, b, broadcast=broadcast)
    return res
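elemwise2 generates and launches an elementwise kernel computing res[i] = a[i] <op> b[i], with both operands cast to the output dtype. A minimal invocation sketch, assuming a working pygpu/libgpuarray install; the import path for elemwise2 and the 'cuda0' device string are assumptions:

import numpy
import pygpu
from pygpu.elemwise import elemwise2   # assumed module path

ctx = pygpu.init('cuda0')              # assumed device name
x = pygpu.gpuarray.array(numpy.arange(8, dtype='float32'), context=ctx)
y = pygpu.gpuarray.array(numpy.full(8, 2, dtype='float32'), context=ctx)

# Builds and runs a kernel equivalent to: res[i] = (float)x[i] * (float)y[i]
res = elemwise2(x, '*', y, x)
print(numpy.asarray(res))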
Example #4
    def __call__(self, *args, **kwargs):
        _, nd, dims, strs, offsets, contig = check_args(args,
                                                        collapse=False,
                                                        broadcast=False)
        out = kwargs.pop('out', None)
        if len(kwargs) != 0:
            raise TypeError('Unexpected keyword argument: %s' %
                            kwargs.keys()[0])
        n = prod(dims)
        out_shape = tuple(d for i, d in enumerate(dims) if not self.redux[i])
        gs = prod(out_shape)
        if gs == 0:
            gs = 1
        n /= gs
        if gs > self.context.maxgsize:
            raise ValueError("Array too big to be reduced along the "
                             "selected axes")

        if out is None:
            out = gpuarray.empty(out_shape,
                                 context=self.context,
                                 dtype=self.dtype_out)
        else:
            if out.shape != out_shape or out.dtype != self.dtype_out:
                raise TypeError(
                    "Out array is not of expected type "
                    "(expected %s %s, got %s %s)" %
                    (out_shape, self.dtype_out, out.shape, out.dtype))
        # Cap the size at init_local_size so large inputs reuse the cached
        # kernel instead of compiling a new specialization for every n.
        if self.init_local_size < n:
            k, _, _, ls = self._get_basic_kernel(self.init_local_size, nd)
        else:
            k, _, _, ls = self._get_basic_kernel(n, nd)

        kargs = [n, out]
        kargs.extend(dims)
        for i, arg in enumerate(args):
            kargs.append(arg)
            if isinstance(arg, gpuarray.GpuArray):
                kargs.append(offsets[i])
                kargs.extend(strs[i])

        k(*kargs, ls=ls, gs=gs)

        return out
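The grid sizing above assigns one work-group per output element: gs is the product of the non-reduced dimensions and n the number of input elements folded into each output. A standalone sketch of that bookkeeping (the names mirror the code but are illustrative, not part of pygpu's API):

from functools import reduce
import operator

def prod(seq):
    return reduce(operator.mul, seq, 1)

dims = (6, 4, 5)
redux = (False, True, False)   # reduce over axis 1 only

out_shape = tuple(d for i, d in enumerate(dims) if not redux[i])  # (6, 5)
gs = max(prod(out_shape), 1)   # one work-group per output element
n = prod(dims) // gs           # elements folded into each output
print(out_shape, gs, n)        # (6, 5) 30 4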
Example #5
    def __call__(self, *args, **kwargs):
        _, nd, dims, strs, offsets, contig = check_args(args, collapse=False,
                                                        broadcast=False)
        out = kwargs.pop('out', None)
        if len(kwargs) != 0:
            raise TypeError('Unexpected keyword argument: %s' %
                            kwargs.keys()[0])
        n = prod(dims)
        out_shape = tuple(d for i, d in enumerate(dims) if not self.redux[i])
        gs = prod(out_shape)
        if gs == 0:
            gs = 1
        n /= gs
        if gs > self.context.maxgsize:
            raise ValueError("Array too big to be reduced along the "
                             "selected axes")


        if out is None:
            out = gpuarray.empty(out_shape, context=self.context,
                                 dtype=self.dtype_out)
        else:
            if out.shape != out_shape or out.dtype != self.dtype_out:
                raise TypeError("Out array is not of expected type "
                                "(expected %s %s, got %s %s)" % (
                        out_shape, self.dtype_out, out.shape, out.dtype))
        # Cap the size at init_local_size so large inputs reuse the cached
        # kernel instead of compiling a new specialization for every n.
        if self.init_local_size < n:
            k, _, _, ls = self._get_basic_kernel(self.init_local_size, nd)
        else:
            k, _, _, ls = self._get_basic_kernel(n, nd)

        kargs = [n, out]
        kargs.extend(dims)
        for i, arg in enumerate(args):
            kargs.append(arg)
            if isinstance(arg, gpuarray.GpuArray):
                kargs.append(offsets[i])
                kargs.extend(strs[i])

        k(*kargs, ls=ls, gs=gs)

        return out
Example #6
def elemwise2(a, op, b, ary, odtype=None, oper=None,
              op_tmpl="res[i] = (%(out_t)s)%(a)s %(op)s (%(out_t)s)%(b)s",
              broadcast=False):
    ndim_extend = True
    if not isinstance(a, gpuarray.GpuArray):
        a = numpy.asarray(a)
        ndim_extend = False
    if not isinstance(b, gpuarray.GpuArray):
        b = numpy.asarray(b)
        ndim_extend = False
    if odtype is None:
        odtype = get_common_dtype(a, b, True)

    a_arg = as_argument(a, 'a')
    b_arg = as_argument(b, 'b')

    args = [ArrayArg(odtype, 'res'), a_arg, b_arg]

    if ndim_extend:
        if a.ndim != b.ndim:
            nd = max(a.ndim, b.ndim)
            if a.ndim < nd:
                a = a.reshape(((1,) * (nd - a.ndim))+a.shape)
            if b.ndim < nd:
                b = b.reshape(((1,) * (nd - b.ndim))+b.shape)
        out_shape = tuple(max(sa, sb) for sa, sb in zip(a.shape, b.shape))
        res = gpuarray.empty(out_shape, dtype=odtype, context=ary.context,
                             cls=ary.__class__)
    else:
        res = ary._empty_like_me(dtype=odtype)

    if oper is None:
        oper = op_tmpl % {'a': a_arg.expr(), 'op': op, 'b': b_arg.expr(),
                          'out_t': dtype_to_ctype(odtype)}

    k = ElemwiseKernel(ary.context, args, oper)
    k(res, a, b, broadcast=broadcast)
    return res
Example #7
    def __call__(self, *args, **kwargs):
        MAX_BLOCK_COUNT = 1024
        SMALL_SEQ_COUNT = 4

        s1_func = self.stage1_func
        s2_func = self.stage2_func

        kernel_wrapper = kwargs.get("kernel_wrapper")
        if kernel_wrapper is not None:
            s1_func = kernel_wrapper(s1_func)
            s2_func = kernel_wrapper(s2_func)

        stream = kwargs.get("stream")

        from gpuarray import empty

        f = s1_func
        arg_types = self.stage1_arg_types

        stage1_args = args

        while True:
            invocation_args = []
            vectors = []

            for arg, arg_tp in zip(args, arg_types):
                if arg_tp == "P":
                    if not arg.flags.forc:
                        raise RuntimeError("ReductionKernel cannot "
                                "deal with non-contiguous arrays")

                    vectors.append(arg)
                    invocation_args.append(arg.gpudata)
                else:
                    invocation_args.append(arg)

            repr_vec = vectors[0]
            sz = repr_vec.size

            if sz <= self.block_size*SMALL_SEQ_COUNT*MAX_BLOCK_COUNT:
                total_block_size = SMALL_SEQ_COUNT*self.block_size
                block_count = (sz + total_block_size - 1) // total_block_size
                seq_count = SMALL_SEQ_COUNT
            else:
                block_count = MAX_BLOCK_COUNT
                macroblock_size = block_count*self.block_size
                seq_count = (sz + macroblock_size - 1) // macroblock_size

            if block_count == 1:
                result = empty((), self.dtype_out, repr_vec.allocator)
            else:
                result = empty((block_count,), self.dtype_out, repr_vec.allocator)

            kwargs = dict(shared_size=self.block_size*self.dtype_out.itemsize)

            #print block_count, seq_count, self.block_size, sz
            f((block_count, 1), (self.block_size, 1, 1), stream,
                    *([result.gpudata]+invocation_args+[seq_count, sz]),
                    **kwargs)

            if block_count == 1:
                return result
            else:
                f = s2_func
                arg_types = self.stage2_arg_types
                args = (result,) + stage1_args
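All three PyCUDA variants size their stages the same way: choose block_count and seq_count so that block_count * block_size * seq_count covers the vector, capping block_count at 1024. A pure-Python sketch of that arithmetic (the block_size default of 256 is illustrative, not taken from the code above):

MAX_BLOCK_COUNT = 1024
SMALL_SEQ_COUNT = 4

def stage_sizes(sz, block_size=256):
    # Small inputs: fix seq_count at 4 and use just enough blocks.
    if sz <= block_size * SMALL_SEQ_COUNT * MAX_BLOCK_COUNT:
        total_block_size = SMALL_SEQ_COUNT * block_size
        return (sz + total_block_size - 1) // total_block_size, SMALL_SEQ_COUNT
    # Large inputs: cap the block count and stretch seq_count instead.
    macroblock_size = MAX_BLOCK_COUNT * block_size
    return MAX_BLOCK_COUNT, (sz + macroblock_size - 1) // macroblock_size

print(stage_sizes(10_000_000))  # (1024, 39): stage 1 leaves 1024 partial sums
print(stage_sizes(1024))        # (1, 4): one block finishes in a single stage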