Example #1
def elemwise_layouts_mixed(shape, offseted_outer, offseted_inner, sliced,
                           order):
    ac, ag = gen_gpuarray(shape, dtype='float32', sliced=sliced, order=order,
                          offseted_outer=offseted_outer,
                          offseted_inner=offseted_inner, ctx=context)
    b = numpy.asarray(2.0, dtype='float32')

    outg = gpuarray.empty(shape, dtype='float32', context=context)

    k = ElemwiseKernel(context, "float *a, float b, float *c",
                       "c[i] = a[i] + b")
    # will use contig or basic
    k(ag, b, outg)
    outc = ac + b
    assert numpy.allclose(numpy.asarray(outg), outc)

    # test basic
    outg = gpuarray.empty(shape, dtype='float32', context=context)
    k.call_basic(ag, b, outg)
    assert numpy.allclose(numpy.asarray(outg), outc)

    # test dimspec
    outg = gpuarray.empty(shape, dtype='float32', context=context)
    k.call_dimspec(ag, b, outg)
    assert numpy.allclose(numpy.asarray(outg), outc)

    # test specialized
    outg = gpuarray.empty(shape, dtype='float32', context=context)
    k.call_specialized(ag, b, outg)
    assert numpy.allclose(numpy.asarray(outg), outc)
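
Each example in this collection allocates its GPU output with `gpuarray.empty` before launching a kernel or reduction. A minimal, self-contained sketch of that allocation step follows, assuming pygpu is installed and a device named 'cuda0' is available; the device string and the fill data are illustrative assumptions, not taken from the examples.

import numpy
import pygpu
from pygpu import gpuarray

# Assumption: a CUDA device 0 is available; adjust the device string as needed.
context = pygpu.init('cuda0')

# empty() only allocates device memory; its contents are uninitialized until
# something is written into the buffer.
out = gpuarray.empty((2, 3), dtype='float32', context=context)
out[...] = numpy.arange(6, dtype='float32').reshape(2, 3)
print(out.shape, out.dtype)
print(numpy.asarray(out))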
Example #2
 def perform(self, node, inputs, outs):
     out, = outs
     v = inputs[0]
     sh = tuple(map(int, inputs[1:]))
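     # Reuse the cached output buffer when its shape already matches;
     # otherwise allocate a fresh (uninitialized) GPU array of shape sh.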
     if out[0] is None or out[0].shape != sh:
         out[0] = gpuarray.empty(sh, dtype=v.dtype)
     out[0][...] = v
Example #3
def test_reduction_wrong_type():
    c, g = gen_gpuarray((2, 3), dtype='float32', ctx=context, cls=elemary)
    out1 = gpuarray.empty((2, 3), dtype='int32', context=context)
    out2 = gpuarray.empty((3, 2), dtype='float32', context=context)

    try:
        r = g.sum(out=out1)
        assert False, "Expected a TypeError out of the sum"
    except TypeError:
        pass

    try:
        r = g.sum(out=out2)
        assert False, "Expected a TypeError out of the sum"
    except TypeError:
        pass
Example #4
def test_reduction_wrong_type():
    c, g = gen_gpuarray((2, 3), dtype='float32', ctx=context, cls=elemary)
    out1 = gpuarray.empty((2, 3), dtype='int32', context=context)
    out2 = gpuarray.empty((3, 2), dtype='float32', context=context)

    try:
        g.sum(out=out1)
        assert False, "Expected a TypeError out of the sum"
    except TypeError:
        pass

    try:
        g.sum(out=out2)
        assert False, "Expected a TypeError out of the sum"
    except TypeError:
        pass
Example #5
 def perform(self, node, inputs, outs):
     out, = outs
     v = inputs[0]
     sh = tuple(map(int, inputs[1:]))
     if out[0] is None or out[0].shape != sh:
         out[0] = gpuarray.empty(sh, dtype=v.dtype)
     out[0][...] = v
Example #6
def test_hash():
    g = gpu_ndarray.empty((2, 3), context=ctx)
    exc = None
    try:
        h = hash(g)
    except TypeError as e:
        exc = e
    assert exc is not None
Example #7
def test_hash():
    g = gpu_ndarray.empty((2, 3), context=ctx)
    exc = None
    try:
        h = hash(g)
    except TypeError as e:
        exc = e
    assert exc is not None
Example #8
def test_elemwise_bool():
    a = gpuarray.empty((2,), context=context)
    exc = None
    try:
        bool(a)
    except ValueError as e:
        exc = e
    assert exc is not None
Example #9
def test_elemwise_bool():
    a = gpuarray.empty((2, ), context=context)
    exc = None
    try:
        bool(a)
    except ValueError as e:
        exc = e
    assert exc is not None
    a = gpuarray.zeros((1, ), context=context)
    assert not bool(a)
    a = gpuarray.zeros((), context=context)
    assert not bool(a)
Example #10
def test_elemwise_bool():
    a = gpuarray.empty((2,), context=context)
    exc = None
    try:
        bool(a)
    except ValueError as e:
        exc = e
    assert exc is not None
    a = gpuarray.zeros((1,), context=context)
    assert not bool(a)
    a = gpuarray.zeros((), context=context)
    assert not bool(a)
Example #11
 def perform(self, node, inputs, outs):
     out, = outs
     v = inputs[0]
     sh = tuple(map(int, inputs[1:]))
     if out[0] is None or out[0].shape != sh:
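         # A single zero value can be filled with a memset-style zeros();
         # anything else needs empty() followed by an explicit copy of v.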
         if v.size == 1 and numpy.asarray(v)[0].item() == 0:
             out[0] = gpuarray.zeros(sh, dtype=v.dtype)
         else:
             out[0] = gpuarray.empty(sh, dtype=v.dtype)
             out[0][...] = v
     else:
         out[0][...] = v
     if config.gpuarray.sync:
         out[0].sync()
Example #12
 def perform(self, node, inputs, outs):
     out, = outs
     v = inputs[0]
     sh = tuple(map(int, inputs[1:]))
     if out[0] is None or out[0].shape != sh:
         if self.memset_0:
             out[0] = gpuarray.zeros(sh, dtype=v.dtype)
         else:
             out[0] = gpuarray.empty(sh, dtype=v.dtype)
             out[0][...] = v
     else:
         out[0][...] = v
     if config.gpuarray.sync:
         out[0].sync()
Example #13
 def perform(self, node, inputs, outs):
     out, = outs
     v = inputs[0]
     sh = tuple(map(int, inputs[1:]))
     if out[0] is None or out[0].shape != sh:
         if self.memset_0:
             out[0] = gpuarray.zeros(sh, dtype=v.dtype)
         else:
             out[0] = gpuarray.empty(sh, dtype=v.dtype)
             out[0][...] = v
     else:
         out[0][...] = v
     if config.gpuarray.sync:
         out[0].sync()
Example #14
def reduction_op(op, dtype, axis):
    c, g = gen_gpuarray((2, 3), dtype=dtype, ctx=context, cls=elemary)

    rc = getattr(c, op)(axis=axis)
    rg = getattr(g, op)(axis=axis)

    check_meta_content(rg, rc)

    outc = numpy.empty(rc.shape, dtype=rc.dtype)
    outg = gpuarray.empty(rg.shape, dtype=rg.dtype, context=context)

    rc = getattr(c, op)(axis=axis, out=outc)
    rg = getattr(g, op)(axis=axis, out=outg)

    check_meta_content(outg, outc)
Example #15
def reduction_op(op, dtype, axis):
    c, g = gen_gpuarray((2, 3), dtype=dtype, ctx=context, cls=elemary)

    rc = getattr(c, op)(axis=axis)
    rg = getattr(g, op)(axis=axis)

    check_meta_content(rg, rc)

    outc = numpy.empty(rc.shape, dtype=rc.dtype)
    outg = gpuarray.empty(rg.shape, dtype=rg.dtype, context=context)

    rc = getattr(c, op)(axis=axis, out=outc)
    rg = getattr(g, op)(axis=axis, out=outg)

    check_meta_content(outg, outc)
Example #16
 def check_elemwise2(self, shapea, shapeb, output_shape, broadcast=True):
     # We rewrite this version of elemwise2 to skip the scaling of output
     # that is done in the official elemwise2 function.
     na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary)
     nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary)
     odtype = get_common_dtype(ga, gb, True)
     res = gpuarray.empty(output_shape, dtype=odtype, context=ga.context, cls=ga.__class__)
     a_arg = as_argument(ga, 'a', read=True)
     b_arg = as_argument(gb, 'b', read=True)
     res_arg = as_argument(res, 'res', write=True)
     args = [res_arg, a_arg, b_arg]
     oper = "res = (%(out_t)s)a %(op)s (%(out_t)s)b" % {'op': '+', 'out_t': dtype_to_ctype(odtype)}
     k = GpuElemwise(ga.context, oper, args, convert_f16=True)
     k(res, ga, gb, broadcast=broadcast)
     nres = na + nb
     assert numpy.allclose(nres, numpy.asarray(res), atol=1e-6)
Example #17
 def check_elemwise2(self, shapea, shapeb, output_shape, broadcast=True):
     # We rewrite this version of elemwise2 to skip the scaling of output
     # that is done in the official elemwise2 function.
     na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary)
     nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary)
     odtype = get_common_dtype(ga, gb, True)
     res = gpuarray.empty(output_shape,
                          dtype=odtype,
                          context=ga.context,
                          cls=ga.__class__)
     a_arg = as_argument(ga, 'a', read=True)
     b_arg = as_argument(gb, 'b', read=True)
     res_arg = as_argument(res, 'res', write=True)
     args = [res_arg, a_arg, b_arg]
     oper = "res = (%(out_t)s)a %(op)s (%(out_t)s)b" % {
         'op': '+',
         'out_t': dtype_to_ctype(odtype)
     }
     k = GpuElemwise(ga.context, oper, args, convert_f16=True)
     k(res, ga, gb, broadcast=broadcast)
     nres = na + nb
     assert numpy.allclose(nres, numpy.asarray(res), atol=1e-6)
Example #18
    def test_reduce_scatter(self):
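        # Every rank contributes (arange + rank); the element-wise sum over all
        # ranks is size * arange + sum(range(size)), and reduce_scatter hands
        # rank r the r-th block of 5 elements of that total.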
        texp = self.size * np.arange(5 * self.size) + sum(range(self.size))
        exp = texp[self.rank * 5:self.rank * 5 + 5]

        # order c
        cpu = np.arange(5 * self.size) + self.rank
        cpu = np.reshape(cpu, (self.size, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = gpuarray.empty((5, ),
                                dtype='int64',
                                order='C',
                                context=self.ctx)

        self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
        assert np.allclose(resgpu, exp)

        # order f
        cpu = np.arange(5 * self.size) + self.rank
        cpu = np.reshape(cpu, (5, self.size), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = gpuarray.empty((5, ),
                                dtype='int64',
                                order='F',
                                context=self.ctx)

        self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
        assert np.allclose(resgpu, exp)

        # make result order c (one less dim)
        cpu = np.arange(5 * self.size) + self.rank
        cpu = np.reshape(cpu, (self.size, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['C_CONTIGUOUS'] is True

        # c-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
            cpu = np.reshape(cpu, (self.size + 1, 5), order='C')
            gpu = gpuarray.asarray(cpu, context=self.ctx)
            with self.assertRaises(TypeError):
                resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

        # make result order f (one less dim)
        cpu = np.arange(5 * self.size) + self.rank
        cpu = np.reshape(cpu, (5, self.size), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['F_CONTIGUOUS'] is True

        # f-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
            cpu = np.reshape(cpu, (5, self.size + 1), order='F')
            gpu = gpuarray.asarray(cpu, context=self.ctx)
            with self.assertRaises(TypeError):
                resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

        # make result order c (same dim - less size)
        texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
        exp = texp[self.rank * 15:self.rank * 15 + 15]
        exp = np.reshape(exp, (3, 5), order='C')
        cpu = np.arange(5 * self.size * 3) + self.rank
        cpu = np.reshape(cpu, (self.size * 3, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['C_CONTIGUOUS'] is True

        # make result order f (same dim - less size)
        texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
        exp = texp[self.rank * 15:self.rank * 15 + 15]
        exp = np.reshape(exp, (5, 3), order='F')
        cpu = np.arange(5 * self.size * 3) + self.rank
        cpu = np.reshape(cpu, (5, self.size * 3), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['F_CONTIGUOUS'] is True
Example #19
def gpu_alloc_expected(x, *shp):
    g = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
    g[:] = x
    return g
Example #20
def empty(shp, order, dtype):
    x = gpu_ndarray.empty(shp, dtype, order, context=ctx)
    y = numpy.empty(shp, dtype, order)
    check_meta(x, y)
Example #21
File: gpu_ufuncs.py, Project: kohr-h/misc
def ufunc21(name, a, b, out=None, context=None):
    """Call a ufunc with 2 inputs and 1 output.

    Parameters
    ----------
    name : str
        Name of the NumPy ufunc.
    a, b : `array-like`
        Input arrays to which the ufunc should be applied.
    out : `pygpu.gpuarray.GpuArray`, optional
        Array in which to store the result.
    context : `pygpu.gpuarray.GpuContext`, optional
        Use this GPU context to evaluate the GPU kernel. For ``None``,
        if no GPU array is among the provided parameters, a default
        GPU context must have been set.

    Returns
    -------
    out : `pygpu.gpuarray.GpuArray`
        Result of the computation. If ``out`` was given, the returned
        object is a reference to it.
        The type of the returned array is `pygpu._array.ndgpuarray` if

        - no GPU array was among the parameters or
        - one of the parameters had type `pygpu._array.ndgpuarray`.
    """
    # Lazy import to avoid circular dependency
    from pygpu._array import ndgpuarray

    # --- Prepare input array --- #

    # Determine GPU context and class. Use the "highest" class present in the
    # inputs, defaulting to `ndgpuarray`
    need_context = True
    cls = None
    for ary in (a, b, out):
        if isinstance(ary, GpuArray):
            if context is not None and ary.context != context:
                raise ValueError('cannot mix contexts')
            context = ary.context
            if cls is None or cls == GpuArray:
                cls = ary.__class__
            need_context = False

    if need_context and context is None:
        context = get_default_context()
        cls = ndgpuarray

    # Cast input to `GpuArray` of the right dtype if necessary
    # TODO: figure out what to do here exactly (scalars and such)
    if isinstance(a, (GpuArray, numpy.ndarray)):
        if a.flags.f_contiguous and not a.flags.c_contiguous:
            order = 'F'
        else:
            order = 'C'

        # Determine signature here to avoid creating an intermediate GPU array
        sig = find_smallest_valid_signature(name, (a, ), (out, ))
        if not sig:
            raise TypeError('ufunc {!r} not supported for the input types, '
                            'and the inputs could not be safely coerced'
                            ''.format(name))

        tc_in, _ = sig.split('->')
        a = array(a,
                  dtype=tc_in,
                  copy=False,
                  order=order,
                  context=context,
                  cls=cls)
    else:
        a = array(a, context=context, cls=cls)

        sig = find_smallest_valid_signature(name, (a, ), (out, ))
        if not sig:
            raise TypeError('ufunc {!r} not supported for the input types, '
                            'and the inputs could not be safely coerced'
                            ''.format(name))

        # Upcast input if necessary
        tc_in, tc_out = sig.split('->')
        if a.dtype < tc_in:
            a = a.astype(tc_in)

    # Create output array if not provided
    if out is None:
        out = empty(a.shape, dtype=tc_out, context=context, cls=cls)

    # --- Generate code strings for GpuElemwise --- #

    # C dtypes for casting
    c_dtype_in = dtype_to_ctype(tc_in)
    c_dtype_out = dtype_to_ctype(tc_out)

    meta = ufunc_metadata[name]
    assert meta['nin'] == 1
    assert meta['nout'] == 1

    # Create `oper` string
    if meta['c_op'] is not None:
        # Case 1: unary operator
        unop = meta['c_op']
        if a.dtype == numpy.bool and unop == '-':
            if parse_version(numpy.__version__) >= parse_version('1.13'):
                # Numpy >= 1.13 raises a TypeError
                raise TypeError(
                    'negation of boolean arrays is not supported, use '
                    '`logical_not` instead')
            else:
                # Warn and remap to logical not
                warnings.warn(
                    'using negation (`-`) with boolean arrays is '
                    'deprecated, use `logical_not` (`~`) instead; '
                    'the current behavior will be changed along '
                    "with NumPy's", FutureWarning)
                unop = '!'
        oper = 'out = ({odt}) {}a'.format(unop, odt=c_dtype_out)
        preamble = ''

    elif meta['c_func'] is not None:
        # Case 2: C function
        c_func = meta['c_func']

        if name in ('abs', 'absolute'):
            # Special case
            if numpy.dtype(tc_out).kind == 'u':
                # Shortcut for abs() with unsigned int. This also fixes a CUDA
                # quirk that makes abs() crash with unsigned int input.
                out[:] = a
                return out
            elif numpy.dtype(tc_out).kind == 'f':
                c_func = 'fabs'
            else:
                c_func = 'abs'

        oper = 'out = ({odt}) {}(a)'.format(c_func, odt=c_dtype_out)
        preamble_tpl = mako.template.Template(meta['oper_preamble_tpl'])
        preamble = preamble_tpl.render(idt=c_dtype_in, odt=c_dtype_out)

    elif meta['oper_fmt'] is not None:
        # Case 3: custom implementation with `oper` template
        oper = meta['oper_fmt'].format(idt=c_dtype_in, odt=c_dtype_out)
        preamble_tpl = mako.template.Template(meta['oper_preamble_tpl'])
        preamble = preamble_tpl.render(idt=c_dtype_in, odt=c_dtype_out)

    else:
        # Case 4: not implemented
        raise NotImplementedError('ufunc {!r} not implemented'.format(name))

    # --- Generate and run GpuElemwise kernel --- #

    a_arg = as_argument(a, 'a', read=True)
    args = [arg('out', out.dtype, write=True), a_arg]

    ker = GpuElemwise(context, oper, args, preamble=preamble)
    ker(out, a)
    return out
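
The `context=None` behavior described in the docstring above relies on pygpu's default-context mechanism: when no GPU array is among the inputs, the context is obtained via `get_default_context()`. A minimal setup sketch, assuming a device named 'cuda0' (the device string is an assumption):

import pygpu
from pygpu.gpuarray import set_default_context

# Assumption: a CUDA device 0 is available; adjust the device string as needed.
ctx = pygpu.init('cuda0')
set_default_context(ctx)
# With a default context set, ufunc21 can be called with plain NumPy inputs and
# context=None; it then resolves the context through get_default_context().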
Example #22
def test_empty_no_dtype():
    x = gpu_ndarray.empty((), context=ctx)  # no dtype and order param
    y = numpy.empty(())
    check_meta(x, y)
Example #23
def gpu_alloc_expected(x, *shp):
    g = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
    g[:] = x
    return g
Example #24
def test_empty_no_params():
    try:
        gpu_ndarray.empty()
        assert False
    except TypeError:
        pass
Example #25
def test_empty_no_params():
    try:
        gpu_ndarray.empty()
        assert False
    except TypeError:
        pass
Example #26
def empty(shp, order, dtype):
    x = gpu_ndarray.empty(shp, dtype, order, context=ctx)
    y = numpy.empty(shp, dtype, order)
    check_meta(x, y)
Example #27
def test_empty_no_dtype():
    x = gpu_ndarray.empty((), context=ctx)  # no dtype and order param
    y = numpy.empty(())
    check_meta(x, y)
Example #28
def elemwise_collapse(dtype1, dtype2, shape1, shape2, expected):
    assert len(shape1) == len(shape2)

    # int8 does not cause problematic upcasts
    scalar = numpy.asarray(1, dtype='int8')

    a_cpu, a_gpu = gen_gpuarray(shape1, dtype1, ctx=context)
    b_cpu, b_gpu = gen_gpuarray(shape2, dtype2, ctx=context)

    o_shape = [max(d1, d2) for d1, d2 in zip(shape1, shape2)]

    o = gpuarray.empty(o_shape, dtype=(a_cpu + b_cpu).dtype, context=context)

    n, nd, dims, strs, offsets, contig = check_args((a_gpu, b_gpu),
                                                    collapse=True,
                                                    broadcast=True)

    assert nd == expected, (shape1, shape2, dims, nd, expected)

    k = ElemwiseKernel(context, [ArrayArg(numpy.dtype(dtype1), 'a'),
                                 ArrayArg(numpy.dtype(dtype2), 'b'),
                                 ArrayArg(o.dtype, 'o')], "o[i] = a[i] + b[i]")
    out_cpu = a_cpu + b_cpu
    k(a_gpu, b_gpu, o, collapse=True, broadcast=True)

    assert numpy.allclose(numpy.asarray(o), out_cpu)

    k(a_gpu, b_gpu, o, collapse=False, broadcast=True)

    assert numpy.allclose(numpy.asarray(o), out_cpu)

    broadcast = any(i == 1 for i in shape1 + shape2)

    n, nd, dims, strs, offsets, contig = check_args((a_gpu, b_gpu, scalar),
                                                    collapse=True,
                                                    broadcast=True)
    assert nd == expected

    k = ElemwiseKernel(context, [ArrayArg(numpy.dtype(dtype1), 'a'),
                                 ArrayArg(numpy.dtype(dtype2), 'b'),
                                 ScalarArg(scalar.dtype, 's'),
                                 ArrayArg(o.dtype, 'o')],
                       "o[i] = a[i] + b[i] + s")
    out_cpu = a_cpu + b_cpu + scalar
    k(a_gpu, b_gpu, scalar, o, collapse=True, broadcast=True)

    assert numpy.allclose(numpy.asarray(o), out_cpu)

    k(a_gpu, b_gpu, scalar, o, collapse=False, broadcast=True)

    assert numpy.allclose(numpy.asarray(o), out_cpu)

    if expected == 1:
        expected2 = 2
    else:
        expected2 = expected

    if len(shape1) != 4:
        return

    if shape1[0] != 1:
        c_cpu, c_gpu = gen_gpuarray(shape1, dtype=dtype1, sliced=2, ctx=context)
        n, nd, dims, strs, offsets, contig = check_args(
            (c_gpu, b_gpu), collapse=True, broadcast=True)
        if broadcast:
            assert nd >= expected
        else:
            assert nd == expected2
Example #29
    def test_reduce_scatter(self):
        texp = self.size * np.arange(5 * self.size) + sum(range(self.size))
        exp = texp[self.rank * 5:self.rank * 5 + 5]

        # order c
        cpu = np.arange(5 * self.size) + self.rank
        cpu = np.reshape(cpu, (self.size, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = gpuarray.empty((5,), dtype='int64', order='C', context=self.ctx)

        self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
        assert np.allclose(resgpu, exp)

        # order f
        cpu = np.arange(5 * self.size) + self.rank
        cpu = np.reshape(cpu, (5, self.size), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = gpuarray.empty((5,), dtype='int64', order='F', context=self.ctx)

        self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
        assert np.allclose(resgpu, exp)

        # make result order c (one less dim)
        cpu = np.arange(5 * self.size) + self.rank
        cpu = np.reshape(cpu, (self.size, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['C_CONTIGUOUS'] is True

        # c-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
            cpu = np.reshape(cpu, (self.size + 1, 5), order='C')
            gpu = gpuarray.asarray(cpu, context=self.ctx)
            with self.assertRaises(TypeError):
                resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

        # make result order f (one less dim)
        cpu = np.arange(5 * self.size) + self.rank
        cpu = np.reshape(cpu, (5, self.size), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['F_CONTIGUOUS'] is True

        # f-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
            cpu = np.reshape(cpu, (5, self.size + 1), order='F')
            gpu = gpuarray.asarray(cpu, context=self.ctx)
            with self.assertRaises(TypeError):
                resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

        # make result order c (same dim - less size)
        texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
        exp = texp[self.rank * 15:self.rank * 15 + 15]
        exp = np.reshape(exp, (3, 5), order='C')
        cpu = np.arange(5 * self.size * 3) + self.rank
        cpu = np.reshape(cpu, (self.size * 3, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['C_CONTIGUOUS'] is True

        # make result order f (same dim - less size)
        texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
        exp = texp[self.rank * 15:self.rank * 15 + 15]
        exp = np.reshape(exp, (5, 3), order='F')
        cpu = np.arange(5 * self.size * 3) + self.rank
        cpu = np.reshape(cpu, (5, self.size * 3), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['F_CONTIGUOUS'] is True