    def test_batch_sum(self):
        """ Make sure batch summing works. """
        num_outs = 3
        for case in self.cases:
            space.initialize_space(case['shape'])
            x = [Out(case['dtype'], op='sum') for k in range(num_outs)]
            x_cpu_data = [np.random.randn(*case['shape'][1:]).astype(case['dtype'])
                          for k in range(num_outs)]
            if case['dtype'] in (np.complex64, np.complex128):
                for k in range(num_outs):
                    x_cpu_data[k] = (1 + 1j) * x_cpu_data[k]

            # Compute the reference ("gold") result on the CPU.
            res_gold = []
            for k in range(num_outs):
                x[k].data.set(x_cpu_data[k])
                res_gold.append(comm.allreduce(np.sum(x_cpu_data[k].flatten())))

            # Perform the batched reduction on the GPU.
            batch_reduce(*x)
            res_gpu = [x_indiv.get() for x_indiv in x]

            # Check the relative error against the reference result.
            for k in range(num_outs):
                err = abs(res_gold[k] - res_gpu[k]) / abs(res_gold[k])
                if case['dtype'] in (np.float32, np.complex64):
                    self.assertTrue(err < 1e-3)
                else:
                    self.assertTrue(err < 1e-10)
if __name__ == '__main__':
    unittest.main()
def execute(cfg, *args, **kwargs):
    """ Execute the kernel, overlapping halo synchronization of grids
        with computation in the "core" region of the local grid. """
    # Parse keyword arguments.
    post_sync_grids = kwargs.get('post_sync', None)

    # Parse the inputs.
    gpu_params = []
    for k in range(len(params)):
        if params[k]['gce_type'] == 'number':
            gpu_params.append(params[k]['dtype'](args[k]))
        elif params[k]['gce_type'] == 'const':  # Load Const.
            gpu_params.append(args[k].data.ptr)
            # Const no longer actually "const" in cuda code.
            # d_ptr, size_in_bytes = my_get_global(params[k]['name'])
            # drv.memcpy_dtod(d_ptr, args[k].data.gpudata, size_in_bytes)
        elif params[k]['gce_type'] == 'grid':
            if args[k]._xlap == 0:
                gpu_params.append(args[k].data.ptr)
            else:
                gpu_params.append(args[k].data.ptr + args[k]._xlap_offset)
        elif params[k]['gce_type'] == 'out':
            args[k].data.fill(args[k].dtype(0))  # Initialize the Out.
            gpu_params.append(args[k].data.ptr)
        else:
            raise TypeError('Invalid input type.')

    # See if we need to synchronize grids after kernel execution.
    if post_sync_grids is None:
        sync_pad = 0
    else:
        sync_pad = max([g._xlap for g in post_sync_grids])

    start2.record(stream)
    comm.Barrier()
    start.record(stream)

    # Execute kernel in padded regions first.
    execute_range(x_start, x_start + sync_pad, gpu_params, cfg, stream)
    execute_range(x_end - sync_pad, x_end, gpu_params, cfg, stream)
    pad_done.record(stream)  # Just for timing purposes.
    stream.synchronize()  # Wait for execution to finish.

    # Begin kernel execution in remaining "core" region.
    execute_range(x_start + sync_pad, x_end - sync_pad,
                  gpu_params, cfg, stream)
    comp_done.record(stream)  # Timing only.

    # While core kernel is executing, perform synchronization.
    if post_sync_grids is not None:  # Synchronization needed.
        for grid in post_sync_grids:
            grid.synchronize_start()  # Start synchronization.

        # Keep on checking until everything is done.
        while not (all([grid.synchronize_isdone()
                        for grid in post_sync_grids]) and stream.is_done()):
            pass
    else:  # Nothing to synchronize.
        stream.synchronize()  # Just wait for execution to finish.

    sync_done.record()  # Timing.

    # Obtain the result for all Outs.
    batch_reduce(*[args[k] for k in range(len(params))
                   if params[k]['gce_type'] == 'out'])

    all_done.record()  # Timing.
    all_done.synchronize()

    # The delay between sync_done and comp_done should be small.
    # Otherwise, the parallelization efficiency is suffering.
    print "(%d)" % comm.Get_rank(),
    for milliseconds in [event_done.time_since(start) for event_done in
                         (start2, pad_done, sync_done, comp_done, all_done)]:
        print "%1.4f " % milliseconds,
    print cfg['block_shape']

    return comp_done.time_since(start)  # Return time needed to execute the function.