Example #1
def profile(shape=(1000, 1000), dtype='float64', rng=(-1, 1)):
    print("\n### Profiling worker")
    print()
    print("### shape =", shape)
    print("### dtype =", dtype)
    print("### range =", sorted(rng))

    rang = abs(rng[1] - rng[0])
    inp = np.random.random(shape) * rang + min(rng)
    inp = inp.astype(dtype)
    sinp = gpuarray.asarray(inp, context=worker.gpuctx)
    out = np.empty_like(inp)
    sout = gpuarray.asarray(out, context=worker.gpuctx)

    print("\n### Profiling worker.all_reduce")
    print("## First call to worker.all_reduce")
    cProfile.runctx("worker.all_reduce(sinp, '+', sout)", globals(), locals(),
                    filename="worker.prof")
    s = pstats.Stats("worker.prof")
    s.strip_dirs().sort_stats("time").print_stats()
    assert_allclose(inp * worker.global_size, np.asarray(sout))

    print("## Second call to worker.all_reduce")
    cProfile.runctx("worker.all_reduce(sinp, '+', sout)", globals(), locals(),
                    filename="worker.prof")
    s = pstats.Stats("worker.prof")
    s.strip_dirs().sort_stats("time").print_stats()
    assert_allclose(inp * worker.global_size, np.asarray(sout))
    if worker._multinode:
        print("## Note that there must be difference between the first and")
        print("## the second call as a result of the extra call to worker.shared")
        print("## during the first time.")
Example #2
    def test_linked_shared(self):
        inp = np.arange(32, dtype='float64')
        sinp = gpuarray.asarray(inp, context=self.ctx)
        insize = sinp.size * sinp.itemsize
        out = np.empty_like(inp)
        sout = gpuarray.asarray(out, context=self.ctx)
        outsize = sout.size * sout.itemsize

        if self.worker._multinode:
            try:
                self.worker.shared_arrays[outsize]
                self.fail(
                    "`sout`'s size should not be linked to a shared buffer yet")
            except KeyError:
                pass
            try:
                self.worker.shared_arrays[insize]
                self.fail(
                    "`sinp`'s size should not be linked to a shared buffer yet")
            except KeyError:
                pass

        self.worker.all_reduce(sinp, '+', sout)

        if self.worker._multinode:
            try:
                self.worker.shared_arrays[outsize]
            except KeyError:
                self.fail(
                    "`sout`'s size should have been linked to a shared buffer")
            try:
                self.worker.shared_arrays[insize]
            except KeyError:
                self.fail(
                    "`sinp`'s size should have been linked to a shared buffer")

        expected = self.total_nw * inp
        actual = np.asarray(sout)
        assert np.allclose(expected, actual)

        self.worker.all_reduce(sout, '*', sout)

        if self.worker._multinode:
            try:
                self.worker.shared_arrays[outsize]
            except KeyError:
                self.fail(
                    "`sout`'s size should have been linked to a shared buffer")
            try:
                self.worker.shared_arrays[insize]
            except KeyError:
                self.fail(
                    "`sinp`'s size should have been linked to a shared buffer")

        expected = expected**self.total_nw
        actual = np.asarray(sout)
        assert np.allclose(expected, actual)
Example #3
 def test_interface1(self):
     inp = np.arange(32, dtype='float64')
     sinp = gpuarray.asarray(inp, context=self.ctx)
     out = np.empty_like(inp)
     sout = gpuarray.asarray(out, context=self.ctx)
     self.worker.all_reduce(sinp, '+', sout)
     expected = self.total_nw * inp
     actual = np.asarray(sout)
     assert np.allclose(expected, actual)
Example #5
    def test_linked_shared(self):
        inp = np.arange(32, dtype='float64')
        sinp = gpuarray.asarray(inp, context=self.ctx)
        insize = sinp.size * sinp.itemsize
        out = np.empty_like(inp)
        sout = gpuarray.asarray(out, context=self.ctx)
        outsize = sout.size * sout.itemsize

        if self.worker._multinode:
            try:
                self.worker.shared_arrays[outsize]
                self.fail("'sout''s size has not been linked yet to a shared buffer")
            except KeyError:
                pass
            try:
                self.worker.shared_arrays[insize]
                self.fail("'sinp''s size has not been linked yet to a shared buffer")
            except KeyError:
                pass

        self.worker.all_reduce(sinp, '+', sout)

        if self.worker._multinode:
            try:
                self.worker.shared_arrays[outsize]
            except KeyError:
                self.fail("`sout`'s size should have been linked to a shared buffer")
            try:
                self.worker.shared_arrays[insize]
            except KeyError:
                self.fail("`sinp`'s size should have been linked to a shared buffer")

        expected = self.total_nw * inp
        actual = np.asarray(sout)
        assert np.allclose(expected, actual)

        self.worker.all_reduce(sout, '*', sout)

        if self.worker._multinode:
            try:
                self.worker.shared_arrays[outsize]
            except KeyError:
                self.fail("`sout`'s size should have been linked to a shared buffer")
            try:
                self.worker.shared_arrays[insize]
            except KeyError:
                self.fail("`sinp`'s size should have been linked to a shared buffer")

        expected = expected ** self.total_nw
        actual = np.asarray(sout)
        assert np.allclose(expected, actual)
Example #6
def test_hostfromgpu_shape_i():
    """
    Test that the shape is lifted over hostfromgpu
    """

    m = mode_with_gpu.including('local_dot_to_dot22',
                                'local_dot22_to_dot22scalar', 'specialize')
    a = T.fmatrix('a')
    ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    cv = gpuarray.asarray(numpy.random.rand(5, 4),
                          dtype='float32',
                          context=get_context(test_ctx_name))

    f = theano.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
    assert any(
        isinstance(x.op, GpuFromHost) for x in f.maker.fgraph.toposort())
    f = theano.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)
    assert isinstance(topo[2].op, T.opt.MakeVector)
    assert tuple(f(av)) == (5, 4)

    f = theano.function([ca], host_from_gpu(ca), mode=m)
    assert host_from_gpu in [x.op for x in f.maker.fgraph.toposort()]
    f = theano.function([ca], host_from_gpu(ca).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, theano.compile.Shape_i)
    assert isinstance(topo[1].op, theano.compile.Shape_i)
    assert isinstance(topo[2].op, theano.tensor.opt.MakeVector)
    assert tuple(f(cv)) == (5, 4)
Example #7
def test_hostfromgpu_shape_i():
    # Test that the shape is lifted over hostfromgpu

    m = mode_with_gpu.including('local_dot_to_dot22',
                                'local_dot22_to_dot22scalar',
                                'specialize')
    a = T.fmatrix('a')
    ca = theano.gpuarray.type.GpuArrayType('float32', (False, False))()
    av = np.asarray(np.random.rand(5, 4), dtype='float32')
    cv = gpuarray.asarray(np.random.rand(5, 4),
                          dtype='float32',
                          context=get_context(test_ctx_name))

    f = theano.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
    assert any(isinstance(x.op, GpuFromHost)
               for x in f.maker.fgraph.toposort())
    f = theano.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)
    assert isinstance(topo[2].op, T.opt.MakeVector)
    assert tuple(f(av)) == (5, 4)

    f = theano.function([ca], host_from_gpu(ca), mode=m)
    assert host_from_gpu in [x.op
                             for x in f.maker.fgraph.toposort()]
    f = theano.function([ca], host_from_gpu(ca).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, theano.compile.Shape_i)
    assert isinstance(topo[1].op, theano.compile.Shape_i)
    assert isinstance(topo[2].op, theano.tensor.opt.MakeVector)
    assert tuple(f(cv)) == (5, 4)
Example #8
    def test_broadcast(self):
        if self.rank == 0:
            cpu, gpu = gen_gpuarray((3, 4, 5), order='c', incr=self.rank, ctx=self.ctx)
        else:
            cpu = np.zeros((3, 4, 5), dtype='float32')
            gpu = gpuarray.asarray(cpu, context=self.ctx)

        if self.rank == 0:
            self.gpucomm.broadcast(gpu)
        else:
            self.gpucomm.broadcast(gpu, root=0)
        self.mpicomm.Bcast(cpu, root=0)
        assert np.allclose(gpu, cpu)
Example #9
def profile(shape=(1000, 1000), dtype='float64', rng=(-1, 1)):
    print("\n### Profiling worker")
    print()
    print("### shape =", shape)
    print("### dtype =", dtype)
    print("### range =", sorted(rng))

    rang = abs(rng[1] - rng[0])
    inp = np.random.random(shape) * rang + min(rng)
    inp = inp.astype(dtype)
    sinp = gpuarray.asarray(inp, context=worker.gpuctx)
    out = np.empty_like(inp)
    sout = gpuarray.asarray(out, context=worker.gpuctx)

    print("\n### Profiling worker.all_reduce")
    print("## First call to worker.all_reduce")
    cProfile.runctx("worker.all_reduce(sinp, '+', sout)",
                    globals(),
                    locals(),
                    filename="worker.prof")
    s = pstats.Stats("worker.prof")
    s.strip_dirs().sort_stats("time").print_stats()
    assert_allclose(inp * worker.global_size, np.asarray(sout))

    print("## Second call to worker.all_reduce")
    cProfile.runctx("worker.all_reduce(sinp, '+', sout)",
                    globals(),
                    locals(),
                    filename="worker.prof")
    s = pstats.Stats("worker.prof")
    s.strip_dirs().sort_stats("time").print_stats()
    assert_allclose(inp * worker.global_size, np.asarray(sout))
    if worker._multinode:
        print("## Note that there must be difference between the first and")
        print(
            "## the second call as a result of the extra call to worker.shared"
        )
        print("## during the first time.")
Example #10
def benchmark(shape=(1000, 1000), dtype='float64', rng=(-1, 1), number=10):
    print("\n### Benchmarking worker")
    print()
    print("### shape =", shape)
    print("### dtype =", dtype)
    print("### range =", sorted(rng))
    print("### num of iterations =", number)

    rang = abs(rng[1] - rng[0])
    inp = np.random.random(shape) * rang + min(rng)
    inp = inp.astype(dtype)
    sinp = gpuarray.asarray(inp, context=worker.gpuctx)
    out = np.empty_like(inp)
    sout = gpuarray.asarray(out, context=worker.gpuctx)

    print("\n## Benchmarking worker.shared")
    print("# First call")
    start = timer()
    worker.shared(sinp)
    end = timer()
    print("Time:", end - start)
    print("# Second call")
    start = timer()
    worker.shared(sinp)
    end = timer()
    print("Time:", end - start)

    print("\n## Benchmarking worker.all_reduce")
    print("# Timing worker.all_reduce w/o calls to worker.shared")
    ttime = 0
    for _ in range(number):
        start = timer()
        worker.all_reduce(sinp, '+', sout)
        end = timer()
        ttime += end - start
        assert_allclose(inp * worker.global_size, np.asarray(sout))
    print("Mean time:", ttime / number)
Example #12
def as_gpuarray_variable(x, context_name):
    # If this is already some form of variable, try to avoid an extra transfer
    if isinstance(x, Variable):
        while True:
            # If we are already a GpuArrayVariable in the right context
            # then there is nothing to do.
            if (isinstance(x.type, GpuArrayType) and
                    x.type.context_name == context_name):
                return x

            # If x is the result of a transfer, try to dig through.
            if getattr(x, 'owner', None):
                if isinstance(x.owner.op, HostFromGpu):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuFromHost):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuToGpu):
                    x = x.owner.inputs[0]
                    continue

            # If none of the conditions were met, then continue with
            # the rest of the body
            break

        # If we couldn't deal with transfers, then maybe it's a tensor
        if isinstance(x.type, tensor.TensorType):
            return GpuFromHost(context_name)(x)

    # Try _as_GpuArrayVariable if possible
    if hasattr(x, '_as_GpuArrayVariable'):
        return x._as_GpuArrayVariable(context_name)

    # If it didn't work try for a constant
    ctx = get_context(context_name)

    if isinstance(x, gpuarray.GpuArray):
        if x.context.ptr != ctx.ptr:
            x = x.transfer(ctx)

    x = gpuarray.asarray(x, context=ctx)

    bcast = [(s == 1) for s in x.shape]
    return GpuArrayConstant(GpuArrayType(dtype=x.dtype,
                                         broadcastable=bcast,
                                         context_name=context_name),
                            x)
Example #13
def as_gpuarray_variable(x, context_name):
    # If this is already some form of variable, try to avoid an extra transfer
    if isinstance(x, Variable):
        while True:
            # If we are already a GpuArrayVariable in the right context
            # then there is nothing to do.
            if (isinstance(x.type, GpuArrayType)
                    and x.type.context_name == context_name):
                return x

            # If x is the result of a transfer, try to dig through.
            if getattr(x, 'owner', None):
                if isinstance(x.owner.op, HostFromGpu):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuFromHost):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuToGpu):
                    x = x.owner.inputs[0]
                    continue

            # If none of the conditions were met, then continue with
            # the rest of the body
            break

        # If we couldn't deal with transfers, then maybe it's a tensor
        if isinstance(x.type, tensor.TensorType):
            return GpuFromHost(context_name)(x)

    # Try _as_GpuArrayVariable if possible
    if hasattr(x, '_as_GpuArrayVariable'):
        return x._as_GpuArrayVariable(context_name)

    # If it didn't work try for a constant
    ctx = get_context(context_name)

    if isinstance(x, gpuarray.GpuArray):
        if x.context.ptr != ctx.ptr:
            x = x.transfer(ctx)

    x = gpuarray.asarray(x, context=ctx)

    bcast = [(s == 1) for s in x.shape]
    return GpuArrayConstant(
        GpuArrayType(dtype=x.dtype,
                     broadcastable=bcast,
                     context_name=context_name), x)
Example #14
    def test_broadcast(self):
        if self.rank == 0:
            cpu, gpu = gen_gpuarray((3, 4, 5),
                                    order='c',
                                    incr=self.rank,
                                    ctx=self.ctx)
        else:
            cpu = np.zeros((3, 4, 5), dtype='float32')
            gpu = gpuarray.asarray(cpu, context=self.ctx)

        if self.rank == 0:
            self.gpucomm.broadcast(gpu)
        else:
            self.gpucomm.broadcast(gpu, root=0)
        self.mpicomm.Bcast(cpu, root=0)
        assert np.allclose(gpu, cpu)
Example #15
    def test_all_gather(self):
        texp = np.arange(self.size * 10, dtype='int32')
        cpu = np.arange(self.rank * 10, self.rank * 10 + 10, dtype='int32')

        a = cpu
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(resgpu, texp)

        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape((2 * self.size, 5), order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(resgpu, exp)

        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape((self.size, 2, 5), order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
        check_all(resgpu, exp)

        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape((self.size, 1, 1, 2, 5), order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
        check_all(resgpu, exp)

        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape((5, 2 * self.size), order='F')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(resgpu, exp)

        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape((5, 2, self.size), order='F')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
        check_all(resgpu, exp)

        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape((5, 2, 1, 1, self.size), order='F')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
        check_all(resgpu, exp)

        with self.assertRaises(Exception):
            resgpu = self.gpucomm.all_gather(gpu, nd_up=-2)
Example #16
    def test_all_gather(self):
        texp = np.arange(self.size * 10, dtype='int32')
        cpu = np.arange(self.rank * 10, self.rank * 10 + 10, dtype='int32')

        a = cpu
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(resgpu, texp)

        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape((2 * self.size, 5), order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(resgpu, exp)

        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape((self.size, 2, 5), order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
        check_all(resgpu, exp)

        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape((self.size, 1, 1, 2, 5), order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
        check_all(resgpu, exp)

        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape((5, 2 * self.size), order='F')
        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
        resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
        check_all(resgpu, exp)

        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape((5, 2, self.size), order='F')
        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
        resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
        check_all(resgpu, exp)

        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape((5, 2, 1, 1, self.size), order='F')
        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
        resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
        check_all(resgpu, exp)

        with self.assertRaises(Exception):
            resgpu = self.gpucomm.all_gather(gpu, nd_up=-2)
Example #17
    def test_reduce_scatter(self):
        texp = self.size * np.arange(5 * self.size) + sum(range(self.size))
        exp = texp[self.rank * 5:self.rank * 5 + 5]

        # order c
        cpu = np.arange(5 * self.size) + self.rank
        np.reshape(cpu, (self.size, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = gpuarray.empty((5, ),
                                dtype='int64',
                                order='C',
                                context=self.ctx)

        self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
        assert np.allclose(resgpu, exp)

        # order f
        cpu = np.arange(5 * self.size) + self.rank
        np.reshape(cpu, (5, self.size), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = gpuarray.empty((5, ),
                                dtype='int64',
                                order='F',
                                context=self.ctx)

        self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
        assert np.allclose(resgpu, exp)

        # make result order c (one less dim)
        cpu = np.arange(5 * self.size) + self.rank
        np.reshape(cpu, (self.size, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['C_CONTIGUOUS'] is True

        # c-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
            np.reshape(cpu, (self.size + 1, 5), order='C')
            gpu = gpuarray.asarray(cpu, context=self.ctx)
            with self.assertRaises(TypeError):
                resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

        # make result order f (one less dim)
        cpu = np.arange(5 * self.size) + self.rank
        np.reshape(cpu, (5, self.size), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['F_CONTIGUOUS'] is True

        # f-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
            np.reshape(cpu, (5, self.size + 1), order='F')
            gpu = gpuarray.asarray(cpu, context=self.ctx)
            with self.assertRaises(TypeError):
                resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

        # make result order c (same dim - less size)
        texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
        exp = texp[self.rank * 15:self.rank * 15 + 15]
        np.reshape(exp, (3, 5), order='C')
        cpu = np.arange(5 * self.size * 3) + self.rank
        np.reshape(cpu, (self.size * 3, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['C_CONTIGUOUS'] is True

        # make result order f (same dim - less size)
        texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
        exp = texp[self.rank * 15:self.rank * 15 + 15]
        np.reshape(exp, (5, 3), order='F')
        cpu = np.arange(5 * self.size * 3) + self.rank
        np.reshape(cpu, (5, self.size * 3), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['F_CONTIGUOUS'] is True
Example #18
    def test_reduce_scatter(self):
        texp = self.size * np.arange(5 * self.size) + sum(range(self.size))
        exp = texp[self.rank * 5:self.rank * 5 + 5]

        # order c
        cpu = np.arange(5 * self.size) + self.rank
        np.reshape(cpu, (self.size, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = gpuarray.empty((5,), dtype='int64', order='C', context=self.ctx)

        self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
        assert np.allclose(resgpu, exp)

        # order f
        cpu = np.arange(5 * self.size) + self.rank
        np.reshape(cpu, (5, self.size), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = gpuarray.empty((5,), dtype='int64', order='F', context=self.ctx)

        self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
        assert np.allclose(resgpu, exp)

        # make result order c (one less dim)
        cpu = np.arange(5 * self.size) + self.rank
        np.reshape(cpu, (self.size, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['C_CONTIGUOUS'] is True

        # c-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
            np.reshape(cpu, (self.size + 1, 5), order='C')
            gpu = gpuarray.asarray(cpu, context=self.ctx)
            with self.assertRaises(TypeError):
                resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

        # make result order f (one less dim)
        cpu = np.arange(5 * self.size) + self.rank
        np.reshape(cpu, (5, self.size), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['F_CONTIGUOUS'] is True

        # f-contiguous split problem (for size == 1, it can always be split)
        if self.size != 1:
            cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
            np.reshape(cpu, (5, self.size + 1), order='F')
            gpu = gpuarray.asarray(cpu, context=self.ctx)
            with self.assertRaises(TypeError):
                resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

        # make result order c (same dim - less size)
        texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
        exp = texp[self.rank * 15:self.rank * 15 + 15]
        np.reshape(exp, (3, 5), order='C')
        cpu = np.arange(5 * self.size * 3) + self.rank
        np.reshape(cpu, (self.size * 3, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['C_CONTIGUOUS'] is True

        # make result order f (same dim - less size)
        texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
        exp = texp[self.rank * 15:self.rank * 15 + 15]
        np.reshape(exp, (5, 3), order='F')
        cpu = np.arange(5 * self.size * 3) + self.rank
        np.reshape(cpu, (5, self.size * 3), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)

        resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
        check_all(resgpu, exp)
        assert resgpu.flags['F_CONTIGUOUS'] is True