def profile(shape=(1000, 1000), dtype='float64', rng=(-1, 1)):
    """Profile two consecutive ``worker.all_reduce`` calls with cProfile.

    A random array of the given ``shape`` is scaled to lie between the two
    bounds in ``rng``, cast to ``dtype`` and sum-reduced through the module's
    ``worker``. Each call is profiled into ``worker.prof``, its statistics are
    printed sorted by time, and the reduced output is compared with
    ``inp * worker.global_size``.
    """
    print("\n### Profiling worker")
    print()
    print("### shape =", shape)
    print("### dtype =", dtype)
    print("### range =", sorted(rng))

    span = abs(rng[1] - rng[0])
    inp = (np.random.random(shape) * span + min(rng)).astype(dtype)
    # `sinp` and `sout` must keep these names: the profiled statement string
    # below resolves them through locals().
    sinp = gpuarray.asarray(inp, context=worker.gpuctx)
    sout = gpuarray.asarray(np.empty_like(inp), context=worker.gpuctx)

    print("\n### Profiling worker.all_reduce")
    banners = (
        "## First call to worker.all_reduce",
        "## Second call to worker.all_reduce",
    )
    for banner in banners:
        print(banner)
        cProfile.runctx("worker.all_reduce(sinp, '+', sout)",
                        globals(), locals(), filename="worker.prof")
        stats = pstats.Stats("worker.prof")
        stats.strip_dirs().sort_stats("time").print_stats()
        assert_allclose(inp * worker.global_size, np.asarray(sout))

    if worker._multinode:
        print("## Note that there must be difference between the first and")
        print("## the second call as a result of the extra call to worker.shared")
        print("## during the first time.")
def test_linked_shared(self):
    # On a multinode worker, the byte sizes of the input and output arrays
    # must not be keys of `worker.shared_arrays` before the first all_reduce
    # and must be keys after each all_reduce. The reduced values are checked
    # for a sum followed by an in-place product.

    def expect_missing(nbytes, message):
        # Fail unless looking up `nbytes` raises KeyError.
        try:
            self.worker.shared_arrays[nbytes]
        except KeyError:
            return
        self.fail(message)

    def expect_present(nbytes, message):
        # Fail when looking up `nbytes` raises KeyError.
        try:
            self.worker.shared_arrays[nbytes]
        except KeyError:
            self.fail(message)

    def expect_both_present():
        expect_present(
            outsize,
            "`sout`'s size should have been linked to a shared buffer")
        expect_present(
            insize,
            "`sinp`'s size should have been linked to a shared buffer")

    inp = np.arange(32, dtype='float64')
    sinp = gpuarray.asarray(inp, context=self.ctx)
    insize = sinp.size * sinp.itemsize
    sout = gpuarray.asarray(np.empty_like(inp), context=self.ctx)
    outsize = sout.size * sout.itemsize

    if self.worker._multinode:
        expect_missing(
            outsize,
            "'sout''s size has not been linked yet to a shared buffer")
        expect_missing(
            insize,
            "'sinp''s size has not been linked yet to a shared buffer")

    # Sum across workers: every worker contributes the same `inp`.
    self.worker.all_reduce(sinp, '+', sout)
    if self.worker._multinode:
        expect_both_present()
    expected = self.total_nw * inp
    assert np.allclose(expected, np.asarray(sout))

    # In-place product across workers of the previous result.
    self.worker.all_reduce(sout, '*', sout)
    if self.worker._multinode:
        expect_both_present()
    expected = expected**self.total_nw
    assert np.allclose(expected, np.asarray(sout))
def test_interface1(self):
    # Sum-reduce a float64 ramp through the worker and compare the output
    # with `total_nw * inp`.
    host_in = np.arange(32, dtype='float64')
    dev_in = gpuarray.asarray(host_in, context=self.ctx)
    dev_out = gpuarray.asarray(np.empty_like(host_in), context=self.ctx)

    self.worker.all_reduce(dev_in, '+', dev_out)

    assert np.allclose(self.total_nw * host_in, np.asarray(dev_out))
def test_linked_shared(self):
    # On a multinode worker, the byte sizes of `sinp` and `sout` must be
    # absent from `worker.shared_arrays` before the first all_reduce and
    # present after each one. The numeric result is checked for '+' and then
    # for an in-place '*'.
    inp = np.arange(32, dtype='float64')
    sinp = gpuarray.asarray(inp, context=self.ctx)
    insize = sinp.size * sinp.itemsize
    sout = gpuarray.asarray(np.empty_like(inp), context=self.ctx)
    outsize = sout.size * sout.itemsize

    not_yet_linked = (
        (outsize, "'sout''s size has not been linked yet to a shared buffer"),
        (insize, "'sinp''s size has not been linked yet to a shared buffer"),
    )
    must_be_linked = (
        (outsize, "`sout`'s size should have been linked to a shared buffer"),
        (insize, "`sinp`'s size should have been linked to a shared buffer"),
    )

    def assert_linked():
        # Every listed size must be a valid key of the shared-array table.
        for nbytes, message in must_be_linked:
            try:
                self.worker.shared_arrays[nbytes]
            except KeyError:
                self.fail(message)

    if self.worker._multinode:
        for nbytes, message in not_yet_linked:
            try:
                self.worker.shared_arrays[nbytes]
            except KeyError:
                pass
            else:
                self.fail(message)

    self.worker.all_reduce(sinp, '+', sout)
    if self.worker._multinode:
        assert_linked()
    expected = self.total_nw * inp
    actual = np.asarray(sout)
    assert np.allclose(expected, actual)

    self.worker.all_reduce(sout, '*', sout)
    if self.worker._multinode:
        assert_linked()
    expected = expected ** self.total_nw
    actual = np.asarray(sout)
    assert np.allclose(expected, actual)
def test_hostfromgpu_shape_i():
    """
    Check that taking ``.shape`` of a transfer compiles to shape ops only.
    """
    mode = mode_with_gpu.including('local_dot_to_dot22',
                                   'local_dot22_to_dot22scalar',
                                   'specialize')
    a = T.fmatrix('a')
    ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
    # Host value first, device value second, so the random draws keep the
    # same order.
    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    cv = gpuarray.asarray(numpy.random.rand(5, 4),
                          dtype='float32',
                          context=get_context(test_ctx_name))

    # Host -> GPU: the transfer itself is kept when its value is requested.
    fn = theano.function([a], GpuFromHost(test_ctx_name)(a), mode=mode)
    assert any(isinstance(node.op, GpuFromHost)
               for node in fn.maker.fgraph.toposort())

    # Host -> GPU shape: the first three nodes are Shape_i, Shape_i, MakeVector.
    fn = theano.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=mode)
    nodes = fn.maker.fgraph.toposort()
    host_side_ops = (T.opt.Shape_i, T.opt.Shape_i, T.opt.MakeVector)
    for position, op_class in enumerate(host_side_ops):
        assert isinstance(nodes[position].op, op_class)
    assert tuple(fn(av)) == (5, 4)

    # GPU -> host: the transfer itself is kept when its value is requested.
    fn = theano.function([ca], host_from_gpu(ca), mode=mode)
    assert host_from_gpu in [node.op for node in fn.maker.fgraph.toposort()]

    # GPU -> host shape: same three-node pattern.
    fn = theano.function([ca], host_from_gpu(ca).shape, mode=mode)
    nodes = fn.maker.fgraph.toposort()
    gpu_side_ops = (theano.compile.Shape_i,
                    theano.compile.Shape_i,
                    theano.tensor.opt.MakeVector)
    for position, op_class in enumerate(gpu_side_ops):
        assert isinstance(nodes[position].op, op_class)
    assert tuple(fn(cv)) == (5, 4)
def test_hostfromgpu_shape_i():
    # Check that taking `.shape` of a host<->GPU transfer compiles to shape
    # ops only.
    mode = mode_with_gpu.including('local_dot_to_dot22',
                                   'local_dot22_to_dot22scalar',
                                   'specialize')
    a = T.fmatrix('a')
    ca = theano.gpuarray.type.GpuArrayType('float32', (False, False))()
    # Host value first, device value second, so the random draws keep the
    # same order.
    av = np.asarray(np.random.rand(5, 4), dtype='float32')
    cv = gpuarray.asarray(np.random.rand(5, 4),
                          dtype='float32',
                          context=get_context(test_ctx_name))

    def compiled_nodes(inputs, output):
        # Compile `output` and return the function with its ordered graph nodes.
        fn = theano.function(inputs, output, mode=mode)
        return fn, fn.maker.fgraph.toposort()

    # Host -> GPU: the transfer is kept when its value is requested.
    _, nodes = compiled_nodes([a], GpuFromHost(test_ctx_name)(a))
    assert any(isinstance(node.op, GpuFromHost) for node in nodes)

    # Host -> GPU shape: first three nodes are Shape_i, Shape_i, MakeVector.
    fn, nodes = compiled_nodes([a], GpuFromHost(test_ctx_name)(a).shape)
    assert isinstance(nodes[0].op, T.opt.Shape_i)
    assert isinstance(nodes[1].op, T.opt.Shape_i)
    assert isinstance(nodes[2].op, T.opt.MakeVector)
    assert tuple(fn(av)) == (5, 4)

    # GPU -> host: the transfer is kept when its value is requested.
    _, nodes = compiled_nodes([ca], host_from_gpu(ca))
    assert host_from_gpu in [node.op for node in nodes]

    # GPU -> host shape: same three-node pattern.
    fn, nodes = compiled_nodes([ca], host_from_gpu(ca).shape)
    assert isinstance(nodes[0].op, theano.compile.Shape_i)
    assert isinstance(nodes[1].op, theano.compile.Shape_i)
    assert isinstance(nodes[2].op, theano.tensor.opt.MakeVector)
    assert tuple(fn(cv)) == (5, 4)
def test_broadcast(self):
    # Rank 0 generates the data and broadcasts it with the default root; the
    # other ranks start from zeros and receive with root=0. The same
    # broadcast is then done on the host arrays through MPI and both results
    # are compared.
    if self.rank == 0:
        cpu, gpu = gen_gpuarray((3, 4, 5), order='c',
                                incr=self.rank, ctx=self.ctx)
        self.gpucomm.broadcast(gpu)
    else:
        cpu = np.zeros((3, 4, 5), dtype='float32')
        gpu = gpuarray.asarray(cpu, context=self.ctx)
        self.gpucomm.broadcast(gpu, root=0)

    self.mpicomm.Bcast(cpu, root=0)
    assert np.allclose(gpu, cpu)
def profile(shape=(1000, 1000), dtype='float64', rng=(-1, 1)):
    """Run ``worker.all_reduce`` twice under cProfile and print the stats.

    The input is a random array of ``shape`` scaled into the interval given
    by ``rng`` and cast to ``dtype``. After each profiled call the statistics
    stored in ``worker.prof`` are printed sorted by time and the output is
    checked against ``inp * worker.global_size``.
    """

    def report_and_verify():
        # Print the stats of the latest profile run and validate the result.
        pstats.Stats("worker.prof").strip_dirs().sort_stats("time").print_stats()
        assert_allclose(inp * worker.global_size, np.asarray(sout))

    print("\n### Profiling worker")
    print()
    print("### shape =", shape)
    print("### dtype =", dtype)
    print("### range =", sorted(rng))

    width = abs(rng[1] - rng[0])
    inp = np.random.random(shape) * width + min(rng)
    inp = inp.astype(dtype)
    # The names `sinp` and `sout` are referenced by the profiled statement
    # string, which is evaluated against locals(); do not rename them.
    sinp = gpuarray.asarray(inp, context=worker.gpuctx)
    sout = gpuarray.asarray(np.empty_like(inp), context=worker.gpuctx)
    statement = "worker.all_reduce(sinp, '+', sout)"

    print("\n### Profiling worker.all_reduce")

    print("## First call to worker.all_reduce")
    cProfile.runctx(statement, globals(), locals(), filename="worker.prof")
    report_and_verify()

    print("## Second call to worker.all_reduce")
    cProfile.runctx(statement, globals(), locals(), filename="worker.prof")
    report_and_verify()

    if worker._multinode:
        for note in (
                "## Note that there must be difference between the first and",
                "## the second call as a result of the extra call to worker.shared",
                "## during the first time."):
            print(note)
def benchmark(shape=(1000, 1000), dtype='float64', rng=(-1, 1), number=10):
    """Time ``worker.shared`` and ``worker.all_reduce`` on a random array.

    ``worker.shared`` is timed for two consecutive calls on the same input.
    ``worker.all_reduce`` (sum) is then timed ``number`` times; after every
    call the output is checked against ``inp * worker.global_size`` and the
    mean wall-clock time is printed at the end.
    """
    print("\n### Benchmarking worker")
    print()
    print("### shape =", shape)
    print("### dtype =", dtype)
    print("### range =", sorted(rng))
    print("### num of iterations =", number)

    span = abs(rng[1] - rng[0])
    inp = (np.random.random(shape) * span + min(rng)).astype(dtype)
    sinp = gpuarray.asarray(inp, context=worker.gpuctx)
    sout = gpuarray.asarray(np.empty_like(inp), context=worker.gpuctx)

    print("\n## Benchmarking worker.shared")
    for heading in ("# First call", "# Second call"):
        print(heading)
        began = timer()
        worker.shared(sinp)
        finished = timer()
        print("Time:", finished - began)

    print("\n## Benchmarking worker.all_reduce")
    print("# Timing worker.all_reduce w/o calls to worker.shared")
    total = 0
    for _ in range(number):
        began = timer()
        worker.all_reduce(sinp, '+', sout)
        finished = timer()
        total += finished - began
        # The correctness check is kept outside the timed region.
        assert_allclose(inp * worker.global_size, np.asarray(sout))
    print("Mean time:", total / number)
def as_gpuarray_variable(x, context_name):
    """Return a GPU-array variable or constant for ``x`` in ``context_name``.

    A ``Variable`` whose type is already a ``GpuArrayType`` in the requested
    context is returned as is. Outputs of ``HostFromGpu``, ``GpuFromHost`` or
    ``GpuToGpu`` are replaced by the first input of that op and checked
    again. A remaining ``TensorType`` variable is wrapped in ``GpuFromHost``.
    Objects exposing ``_as_GpuArrayVariable`` are delegated to. Anything else
    is converted with ``gpuarray.asarray`` and wrapped in a
    ``GpuArrayConstant``.
    """
    if isinstance(x, Variable):
        # Walk back through chained transfers to avoid an extra transfer.
        while True:
            already_there = (isinstance(x.type, GpuArrayType) and
                             x.type.context_name == context_name)
            if already_there:
                return x

            producer = getattr(x, 'owner', None)
            if producer and isinstance(
                    producer.op, (HostFromGpu, GpuFromHost, GpuToGpu)):
                # x is the result of a transfer: look at what was transferred.
                x = producer.inputs[0]
                continue

            # Not a transfer output: fall through to the remaining cases.
            break

        # A plain tensor variable only needs a host -> GPU transfer.
        if isinstance(x.type, tensor.TensorType):
            return GpuFromHost(context_name)(x)

    # Let the object convert itself when it knows how.
    if hasattr(x, '_as_GpuArrayVariable'):
        return x._as_GpuArrayVariable(context_name)

    # Last resort: build a constant in the target context.
    ctx = get_context(context_name)
    if isinstance(x, gpuarray.GpuArray) and x.context.ptr != ctx.ptr:
        x = x.transfer(ctx)
    x = gpuarray.asarray(x, context=ctx)

    # Dimensions of length 1 are marked broadcastable.
    bcast = [dim == 1 for dim in x.shape]
    constant_type = GpuArrayType(dtype=x.dtype,
                                 broadcastable=bcast,
                                 context_name=context_name)
    return GpuArrayConstant(constant_type, x)
def as_gpuarray_variable(x, context_name):
    """Convert ``x`` into a GPU-array variable or constant for ``context_name``.

    Resolution order: a ``Variable`` already typed as ``GpuArrayType`` in the
    requested context is returned unchanged (after stepping back through any
    ``HostFromGpu``/``GpuFromHost``/``GpuToGpu`` producers); a ``TensorType``
    variable is passed through ``GpuFromHost``; an object with an
    ``_as_GpuArrayVariable`` method is asked to convert itself; everything
    else becomes a ``GpuArrayConstant`` built from ``gpuarray.asarray``.
    """

    def in_target_context(var):
        # True when `var` already has a GPU-array type in the wanted context.
        return (isinstance(var.type, GpuArrayType) and
                var.type.context_name == context_name)

    def is_transfer_output(var):
        # True when `var` was produced by one of the transfer ops.
        node = getattr(var, 'owner', None)
        return bool(node) and isinstance(
            node.op, (HostFromGpu, GpuFromHost, GpuToGpu))

    if isinstance(x, Variable):
        # Dig through transfers so no redundant transfer is added.
        while not in_target_context(x) and is_transfer_output(x):
            x = x.owner.inputs[0]
        if in_target_context(x):
            return x
        # If we couldn't deal with transfers, then maybe it's a tensor.
        if isinstance(x.type, tensor.TensorType):
            return GpuFromHost(context_name)(x)

    # Try _as_GpuArrayVariable if possible.
    if hasattr(x, '_as_GpuArrayVariable'):
        return x._as_GpuArrayVariable(context_name)

    # Otherwise build a constant living in the requested context.
    ctx = get_context(context_name)
    if isinstance(x, gpuarray.GpuArray):
        if x.context.ptr != ctx.ptr:
            x = x.transfer(ctx)
    x = gpuarray.asarray(x, context=ctx)

    # Length-1 dimensions are flagged as broadcastable.
    bcast = [extent == 1 for extent in x.shape]
    return GpuArrayConstant(
        GpuArrayType(dtype=x.dtype,
                     broadcastable=bcast,
                     context_name=context_name),
        x)
def test_all_gather(self):
    # Every rank contributes ten consecutive int32 values; the gathered
    # result is compared with the full range reshaped to the layout expected
    # for each combination of input order and `nd_up`.
    texp = np.arange(self.size * 10, dtype='int32')
    cpu = np.arange(self.rank * 10, self.rank * 10 + 10, dtype='int32')

    # 1-D input.
    a = cpu
    gpu = gpuarray.asarray(a, context=self.ctx)
    resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
    check_all(resgpu, texp)

    # C-ordered 2-D input, gathered along the existing first axis.
    a = cpu.reshape((2, 5), order='C')
    exp = texp.reshape((2 * self.size, 5), order='C')
    gpu = gpuarray.asarray(a, context=self.ctx)
    resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
    check_all(resgpu, exp)

    # C-ordered 2-D input, one new leading axis.
    a = cpu.reshape((2, 5), order='C')
    exp = texp.reshape((self.size, 2, 5), order='C')
    gpu = gpuarray.asarray(a, context=self.ctx)
    resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
    check_all(resgpu, exp)

    # C-ordered 2-D input, three new leading axes.
    a = cpu.reshape((2, 5), order='C')
    exp = texp.reshape((self.size, 1, 1, 2, 5), order='C')
    gpu = gpuarray.asarray(a, context=self.ctx)
    resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
    check_all(resgpu, exp)

    # F-ordered inputs: the expected arrays assume the device array keeps
    # Fortran layout, so request it explicitly instead of relying on how
    # asarray treats the layout by default.
    a = cpu.reshape((5, 2), order='F')
    exp = texp.reshape((5, 2 * self.size), order='F')
    gpu = gpuarray.asarray(a, context=self.ctx, order='F')
    resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
    check_all(resgpu, exp)

    # F-ordered 2-D input, one new trailing axis.
    a = cpu.reshape((5, 2), order='F')
    exp = texp.reshape((5, 2, self.size), order='F')
    gpu = gpuarray.asarray(a, context=self.ctx, order='F')
    resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
    check_all(resgpu, exp)

    # F-ordered 2-D input, three new trailing axes.
    a = cpu.reshape((5, 2), order='F')
    exp = texp.reshape((5, 2, 1, 1, self.size), order='F')
    gpu = gpuarray.asarray(a, context=self.ctx, order='F')
    resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
    check_all(resgpu, exp)

    # A negative nd_up below -1 must be rejected.
    with self.assertRaises(Exception):
        resgpu = self.gpucomm.all_gather(gpu, nd_up=-2)
def test_all_gather(self):
    # Every rank contributes ten consecutive int32 values; the gathered
    # result is compared with the full range reshaped to the layout expected
    # for each combination of input order and `nd_up`.
    nranks = self.size
    texp = np.arange(nranks * 10, dtype='int32')
    cpu = np.arange(self.rank * 10, self.rank * 10 + 10, dtype='int32')

    # 1-D input.
    gpu = gpuarray.asarray(cpu, context=self.ctx)
    check_all(self.gpucomm.all_gather(gpu, nd_up=0), texp)

    # C-ordered (2, 5) input: (expected shape, nd_up).
    c_cases = (
        ((2 * nranks, 5), 0),
        ((nranks, 2, 5), 1),
        ((nranks, 1, 1, 2, 5), 3),
    )
    for expected_shape, extra_dims in c_cases:
        a = cpu.reshape((2, 5), order='C')
        exp = texp.reshape(expected_shape, order='C')
        gpu = gpuarray.asarray(a, context=self.ctx)
        check_all(self.gpucomm.all_gather(gpu, nd_up=extra_dims), exp)

    # F-ordered (5, 2) input: (expected shape, nd_up).
    f_cases = (
        ((5, 2 * nranks), 0),
        ((5, 2, nranks), 1),
        ((5, 2, 1, 1, nranks), 3),
    )
    for expected_shape, extra_dims in f_cases:
        a = cpu.reshape((5, 2), order='F')
        exp = texp.reshape(expected_shape, order='F')
        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
        check_all(self.gpucomm.all_gather(gpu, nd_up=extra_dims), exp)

    # The last F-ordered device array is reused for the invalid nd_up check.
    with self.assertRaises(Exception):
        self.gpucomm.all_gather(gpu, nd_up=-2)
def test_reduce_scatter(self):
    # Sum-reduce-scatter arrays of several shapes and memory orders and
    # compare each rank's chunk with the analytically expected values.
    #
    # np.reshape returns the reshaped array and never modifies its argument,
    # so every reshape result is rebound; otherwise all cases would run on
    # the same 1-D array. F-ordered inputs are uploaded with order='F' so the
    # device array keeps the layout under test.
    texp = self.size * np.arange(5 * self.size) + sum(range(self.size))
    exp = texp[self.rank * 5:self.rank * 5 + 5]

    # order c
    cpu = np.arange(5 * self.size) + self.rank
    cpu = np.reshape(cpu, (self.size, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)
    resgpu = gpuarray.empty((5, ), dtype='int64', order='C',
                            context=self.ctx)
    self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
    assert np.allclose(resgpu, exp)

    # order f
    cpu = np.arange(5 * self.size) + self.rank
    cpu = np.reshape(cpu, (5, self.size), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx, order='F')
    resgpu = gpuarray.empty((5, ), dtype='int64', order='F',
                            context=self.ctx)
    self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
    assert np.allclose(resgpu, exp)

    # make result order c (one less dim)
    cpu = np.arange(5 * self.size) + self.rank
    cpu = np.reshape(cpu, (self.size, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['C_CONTIGUOUS'] is True

    # c-contiguous split problem (for size == 1, it can always be split)
    if self.size != 1:
        cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
        cpu = np.reshape(cpu, (self.size + 1, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)
        with self.assertRaises(TypeError):
            resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

    # make result order f (one less dim)
    # NOTE(review): for size == 1 the (5, 1) array is both C- and
    # F-contiguous; confirm which split path reduce_scatter takes there.
    cpu = np.arange(5 * self.size) + self.rank
    cpu = np.reshape(cpu, (5, self.size), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx, order='F')
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['F_CONTIGUOUS'] is True

    # f-contiguous split problem (for size == 1, it can always be split)
    if self.size != 1:
        cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
        cpu = np.reshape(cpu, (5, self.size + 1), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx, order='F')
        with self.assertRaises(TypeError):
            resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

    # make result order c (same dim - less size)
    texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
    exp = texp[self.rank * 15:self.rank * 15 + 15]
    exp = np.reshape(exp, (3, 5), order='C')
    cpu = np.arange(5 * self.size * 3) + self.rank
    cpu = np.reshape(cpu, (self.size * 3, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['C_CONTIGUOUS'] is True

    # make result order f (same dim - less size)
    texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
    exp = texp[self.rank * 15:self.rank * 15 + 15]
    exp = np.reshape(exp, (5, 3), order='F')
    cpu = np.arange(5 * self.size * 3) + self.rank
    cpu = np.reshape(cpu, (5, self.size * 3), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx, order='F')
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['F_CONTIGUOUS'] is True
def test_reduce_scatter(self):
    # Sum-reduce-scatter arrays of several shapes and memory orders and
    # compare each rank's chunk with the analytically expected values.
    #
    # np.reshape returns the reshaped array and never modifies its argument,
    # so every reshape result is rebound; otherwise all cases would run on
    # the same 1-D array. F-ordered inputs are uploaded with order='F' so the
    # device array keeps the layout under test.
    texp = self.size * np.arange(5 * self.size) + sum(range(self.size))
    exp = texp[self.rank * 5:self.rank * 5 + 5]

    # order c
    cpu = np.arange(5 * self.size) + self.rank
    cpu = np.reshape(cpu, (self.size, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)
    resgpu = gpuarray.empty((5,), dtype='int64', order='C',
                            context=self.ctx)
    self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
    assert np.allclose(resgpu, exp)

    # order f
    cpu = np.arange(5 * self.size) + self.rank
    cpu = np.reshape(cpu, (5, self.size), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx, order='F')
    resgpu = gpuarray.empty((5,), dtype='int64', order='F',
                            context=self.ctx)
    self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
    assert np.allclose(resgpu, exp)

    # make result order c (one less dim)
    cpu = np.arange(5 * self.size) + self.rank
    cpu = np.reshape(cpu, (self.size, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['C_CONTIGUOUS'] is True

    # c-contiguous split problem (for size == 1, it can always be split)
    if self.size != 1:
        cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
        cpu = np.reshape(cpu, (self.size + 1, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)
        with self.assertRaises(TypeError):
            resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

    # make result order f (one less dim)
    # NOTE(review): for size == 1 the (5, 1) array is both C- and
    # F-contiguous; confirm which split path reduce_scatter takes there.
    cpu = np.arange(5 * self.size) + self.rank
    cpu = np.reshape(cpu, (5, self.size), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx, order='F')
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['F_CONTIGUOUS'] is True

    # f-contiguous split problem (for size == 1, it can always be split)
    if self.size != 1:
        cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
        cpu = np.reshape(cpu, (5, self.size + 1), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx, order='F')
        with self.assertRaises(TypeError):
            resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

    # make result order c (same dim - less size)
    texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
    exp = texp[self.rank * 15:self.rank * 15 + 15]
    exp = np.reshape(exp, (3, 5), order='C')
    cpu = np.arange(5 * self.size * 3) + self.rank
    cpu = np.reshape(cpu, (self.size * 3, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['C_CONTIGUOUS'] is True

    # make result order f (same dim - less size)
    texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
    exp = texp[self.rank * 15:self.rank * 15 + 15]
    exp = np.reshape(exp, (5, 3), order='F')
    cpu = np.arange(5 * self.size * 3) + self.rank
    cpu = np.reshape(cpu, (5, self.size * 3), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx, order='F')
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['F_CONTIGUOUS'] is True