def test_padded_kernel(self):
    """ Implement a simple padded kernel. """
    for case in self.cases:
        # Form data to work on.
        space.initialize_space(case['shape'])
        x_np = comm.allreduce(np.random.randn(*case['shape']).astype(case['dtype']))
        x = Grid(x_np, x_overlap=1)
        s_np = comm.allreduce(np.random.randn(1).astype(case['dtype']))
        s = Const(s_np)
        z = Out(case['dtype'])

        # Make a kernel.
        code = Template("""
            if (_in_local && _in_global) {
                x(0,0,0) = s(0) * x(0,0,0);
                z += a * x(0,0,0);
            }
            """).render()
        fun = Kernel(code,
                     ('a', 'number', case['dtype']),
                     ('x', 'grid', x.dtype),
                     ('s', 'const', s.dtype, s.data.size),
                     ('z', 'out', z.dtype),
                     padding=(1, 1, 1, 1))

        # Execute and check the result.
        fun(case['dtype'](2), x, s, z)
        gpu_sum = z.get()
        cpu_sum = np.sum(2.0 * s_np * x_np)
        err = abs(gpu_sum - cpu_sum) / abs(cpu_sum)
        # print case, err
        if case['dtype'] in (np.float32, np.complex64):
            self.assertTrue(err < 1e-2, (case, err))
        else:
            self.assertTrue(err < 1e-6, (case, err))
def test_batch_sum(self):
    """ Make sure batch summing works. """
    num_outs = 3
    for case in self.cases:
        space.initialize_space(case['shape'])
        x = [Out(case['dtype'], op='sum') for k in range(num_outs)]
        x_cpu_data = [np.random.randn(*case['shape'][1:]).astype(case['dtype'])
                      for k in range(num_outs)]
        if case['dtype'] in (np.complex64, np.complex128):
            for k in range(num_outs):
                x_cpu_data[k] = (1 + 1j) * x_cpu_data[k]
        res_gold = []
        for k in range(num_outs):
            x[k].data.set(x_cpu_data[k])
            res_gold.append(comm.allreduce(np.sum(x_cpu_data[k].flatten())))
        batch_reduce(*x)
        res_gpu = [x_indiv.get() for x_indiv in x]
        for k in range(num_outs):
            err = abs(res_gold[k] - res_gpu[k]) / abs(res_gold[k])
            if case['dtype'] in (np.float32, np.complex64):
                self.assertTrue(err < 1e-3)
            else:
                self.assertTrue(err < 1e-10)
def test_simple_kernel(self):
    """ Implement a simple kernel. """
    for case in self.cases:
        # Form data to work on.
        space.initialize_space(case['shape'])
        x_np = comm.allreduce(np.random.randn(*case['shape']).astype(case['dtype']))
        x = Grid(x_np, x_overlap=2)
        s_np = comm.allreduce(np.random.randn(case['shape'][0], 1, 1).astype(case['dtype']))
        s = Const(s_np)
        z = Out(case['dtype'])

        # Make a kernel.
        code = Template("""
            if (_in_local && _in_global) {
                z += a * s(_X) * x(0,0,0);
                // z += a * x(0,0,0);
            }
            """).render()
        fun = Kernel(code,
                     ('a', 'number', case['dtype']),
                     ('x', 'grid', x.dtype),
                     ('s', 'const', s.dtype),
                     ('z', 'out', z.dtype),
                     shape_filter='all')

        # Execute and check the result.
        # fun()
        while fun.exec_configs:
        # for k in range(40):
            fun(case['dtype'](2.0), x, s, z)
            # fun(case['dtype'](2.0), x, z)
        gpu_sum = z.get()
        cpu_sum = np.sum(2 * s_np * x_np)
        # cpu_sum = np.sum(2 * x_np)
        err = abs(gpu_sum - cpu_sum) / abs(cpu_sum)
        if case['dtype'] in (np.float32, np.complex64):
            self.assertTrue(err < 1e-2, (case, err))
        else:
            self.assertTrue(err < 1e-6, (case, err))
def test_sum(self):
    """ Make sure summing works. """
    for case in self.cases:
        space.initialize_space(case['shape'])
        x = Out(case['dtype'], op='sum')
        x_cpu_data = np.random.randn(*case['shape'][1:]).astype(case['dtype'])
        if case['dtype'] in (np.complex64, np.complex128):
            x_cpu_data = (1 + 1j) * x_cpu_data
        x.data.set(x_cpu_data)
        res_gold = comm.allreduce(np.sum(x_cpu_data.flatten()))
        x.reduce()
        err = abs(res_gold - x.get()) / abs(res_gold)
        if case['dtype'] in (np.float32, np.complex64):
            self.assertTrue(err < 1e-3)
        else:
            self.assertTrue(err < 1e-10)
def batch_reduce(*outs):
    """ Optimal (compared to self.reduce) when communication cost is latency bound. """
    results = comm.allreduce(np.array([ga.sum(out.data).get() for out in outs]))
    for k in range(len(outs)):
        outs[k].result = results[k]
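# --- Usage sketch (illustrative, not part of the original file) ---
# batch_reduce collects the local partial sums of several Out objects into a
# single numpy array and issues one allreduce for all of them, instead of one
# allreduce per output as Out.reduce() does; when the payload is tiny, each
# allreduce is dominated by network latency, so batching wins. The sketch below
# reuses the Out/batch_reduce API exercised by the tests above; the shape and
# helper name are assumptions made for illustration only.
def _example_batch_reduce():
    space.initialize_space((64, 64, 64))              # assumed example shape
    outs = [Out(np.float64, op='sum') for _ in range(4)]
    # ... kernels would accumulate partial sums into each out.data here ...
    batch_reduce(*outs)                               # one allreduce for all outputs
    return [out.get() for out in outs]                # per-output global sums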
def reduce(self):
    """ Compute the result. """
    self.result = comm.allreduce(ga.sum(self.data).get())
""" Make sure batch summing works. """ num_outs = 3 for case in self.cases: space.initialize_space(case['shape']) x = [Out(case['dtype'], op='sum') for k in range(num_outs)] x_cpu_data = [np.random.randn(*case['shape'][1:])\ .astype(case['dtype']) for k in range(num_outs)] if case['dtype'] in (np.complex64, np.complex128): for k in range(num_outs): x_cpu_data[k] = (1 + 1j) * x_cpu_data[k] res_gold = [] for k in range(num_outs): x[k].data.set(x_cpu_data[k]) res_gold.append(comm.allreduce(np.sum( x_cpu_data[k].flatten()))) batch_reduce(*x) res_gpu = [x_indiv.get() for x_indiv in x] for k in range(num_outs): err = abs(res_gold[k] - res_gpu[k]) / abs(res_gold[k]) if case['dtype'] in (np.float32, np.complex64): self.assertTrue(err < 1e-3) else: self.assertTrue(err < 1e-10) if __name__ == '__main__': unittest.main()
def _create_tempdir(request, mode=None):
    """ Adapted from DOLFIN's dolfin_utils/test/fixtures.py. """
    # Get directory name of test_foo.py file
    testfile = request.module.__file__
    testfiledir = os.path.dirname(os.path.abspath(testfile))

    # Construct name test_foo_tempdir from name test_foo.py
    testfilename = os.path.basename(testfile)
    if hasattr(request.config, "slaveinput"):
        outputname = testfilename.replace(
            ".py", "_tempdir_{}".format(request.config.slaveinput["slaveid"]))
    else:
        outputname = testfilename.replace(".py", "_tempdir")

    # Get function name test_something from test_foo.py
    function = request.function.__name__
    if mode == "save":
        function = function.replace("_save", "_io")
    elif mode == "load":
        function = function.replace("_load", "_io")

    # Join all of these to make a unique path for this test function
    basepath = os.path.join(testfiledir, outputname)
    path = os.path.join(basepath, function)

    # Add a sequence number to avoid collisions when tests are otherwise parameterized
    if COMM_WORLD.rank == 0:
        _create_tempdir._sequencenumber[path] += 1
        sequencenumber = _create_tempdir._sequencenumber[path]
        sequencenumber = COMM_WORLD.allreduce(sequencenumber, op=SUM)
    else:
        sequencenumber = COMM_WORLD.allreduce(0, op=SUM)
    path += "__" + str(sequencenumber)

    # Delete and re-create directory on root node
    if COMM_WORLD.rank == 0:
        # First time visiting this basepath, delete the old and create
        # a new if mode is not load
        if basepath not in _create_tempdir._basepaths:
            _create_tempdir._basepaths.add(basepath)
            if mode == "load":
                assert os.path.exists(basepath)
            else:
                if os.path.exists(basepath):
                    shutil.rmtree(basepath)
        # Make sure we have the base path test_foo_tempdir for
        # this test_foo.py file
        if not os.path.exists(basepath):
            os.mkdir(basepath)

        # Delete path from old test run if mode is not load
        if mode == "load":
            assert os.path.exists(path)
        else:
            if os.path.exists(path):
                shutil.rmtree(path)

        # Make sure we have the path for this test execution:
        # e.g. test_foo_tempdir/test_something__3
        if not os.path.exists(path):
            os.mkdir(path)
    COMM_WORLD.barrier()

    return path
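# --- Usage sketch (illustrative, not part of the original file) ---
# _create_tempdir keeps per-path counters and a set of visited base paths as
# function attributes, and is normally exposed to tests through a pytest
# fixture so every test function gets its own MPI-safe temporary directory.
# A minimal wrapper, assuming pytest is available, might look like the code
# below; the attribute initialisation and the fixture name "tempdir" are
# assumptions, not taken from this file.
import collections

import pytest

_create_tempdir._sequencenumber = collections.defaultdict(int)  # per-path sequence counters
_create_tempdir._basepaths = set()                              # base paths already set up


@pytest.fixture()
def tempdir(request):
    """Return a unique, freshly created directory for the requesting test."""
    return _create_tempdir(request)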