Example #1
    def test_batch_sum(self):
        """ Make sure batch summing works. """
        num_outs = 3
        for case in self.cases:
            space.initialize_space(case['shape'])
            x = [Out(case['dtype'], op='sum') for k in range(num_outs)]
            x_cpu_data = [np.random.randn(*case['shape'][1:]).astype(case['dtype'])
                          for k in range(num_outs)]

            if case['dtype'] in (np.complex64, np.complex128):
                for k in range(num_outs):
                    x_cpu_data[k] = (1 + 1j) * x_cpu_data[k]

            res_gold = []
            for k in range(num_outs):
                x[k].data.set(x_cpu_data[k])
                res_gold.append(comm.allreduce(np.sum(x_cpu_data[k].flatten())))

            batch_reduce(*x)
            res_gpu = [x_indiv.get() for x_indiv in x]

            for k in range(num_outs):
                err = abs(res_gold[k] - res_gpu[k]) / abs(res_gold[k])

                if case['dtype'] in (np.float32, np.complex64):
                    self.assertTrue(err < 1e-3)
                else:
                    self.assertTrue(err < 1e-10)
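
For context, the "gold" value the test compares against can be written as a small, standalone sketch. The helper name batch_sum_reference and the explicit use of mpi4py below are assumptions for illustration; the test itself relies on whatever comm object the surrounding module provides.

import numpy as np
from mpi4py import MPI

def batch_sum_reference(local_arrays, comm=MPI.COMM_WORLD):
    """Sum each local array, then combine the partial sums across all ranks."""
    # One allreduce per output, mirroring how res_gold is built in the test above.
    return [comm.allreduce(np.sum(a)) for a in local_arrays]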
Example #2
        def execute(cfg, *args, **kwargs):

            # Parse keyword arguments.
            post_sync_grids = kwargs.get('post_sync', None)

            # Parse the inputs.
            gpu_params = []
            for k in range(len(params)):
                if params[k]['gce_type'] == 'number':
                    gpu_params.append(params[k]['dtype'](args[k]))
                elif params[k]['gce_type'] == 'const': # Load Const.
                    gpu_params.append(args[k].data.ptr)
                    # Const no longer actually "const" in cuda code.
#                     d_ptr, size_in_bytes = my_get_global(params[k]['name'])
#                     drv.memcpy_dtod(d_ptr, args[k].data.gpudata, size_in_bytes)
                elif params[k]['gce_type'] == 'grid':
                    if args[k]._xlap == 0:
                        gpu_params.append(args[k].data.ptr)
                    else:
                        gpu_params.append(args[k].data.ptr +
                                          args[k]._xlap_offset)
                elif params[k]['gce_type'] == 'out':
                    args[k].data.fill(args[k].dtype(0)) # Initialize the Out.
                    gpu_params.append(args[k].data.ptr)
                else:
                    raise TypeError('Invalid input type.')

            # See if we need to synchronize grids after kernel execution.
            if post_sync_grids is None:
                sync_pad = 0
            else:
                sync_pad = max([g._xlap for g in post_sync_grids])

            start2.record(stream)
            comm.Barrier()
            start.record(stream)

            # Execute kernel in padded regions first.
            execute_range(x_start, x_start + sync_pad, gpu_params, cfg, stream)
            execute_range(x_end - sync_pad, x_end, gpu_params, cfg, stream)
            pad_done.record(stream) # Just for timing purposes.
            stream.synchronize() # Wait for execution to finish.

            # Begin kernel execution in remaining "core" region.
            execute_range(x_start + sync_pad, x_end - sync_pad, gpu_params, cfg, stream)
            comp_done.record(stream) # Timing only.

            # While core kernel is executing, perform synchronization.
            if post_sync_grids is not None: # Synchronization needed.
                for grid in post_sync_grids:
                    grid.synchronize_start() # Start synchronization.

                # Keep on checking until everything is done.
                while not (all(grid.synchronize_isdone()
                               for grid in post_sync_grids)
                           and stream.is_done()):
                    pass

            else: # Nothing to synchronize.
                stream.synchronize() # Just wait for execution to finish.

            sync_done.record() # Timing.

            # Obtain the result for all Outs.
            batch_reduce(*[args[k] for k in range(len(params))
                           if params[k]['gce_type'] == 'out'])
            all_done.record() # Timing.
            all_done.synchronize()

            # The delay between sync_done and comp_done should be small.
            # Otherwise, the parallelization efficiency is suffering.
            print "(%d)" % comm.Get_rank(),
            for milliseconds in  [event_done.time_since(start) for event_done in \
                            (start2, pad_done, sync_done, comp_done, all_done)]:
                print "%1.4f " % milliseconds, 
            print cfg['block_shape']

            return comp_done.time_since(start) # Return time needed to execute the function.
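
The control flow above follows a standard overlap pattern: update the padded edge slabs first, start synchronizing them with the neighboring processes, and run the kernel over the remaining "core" region while that exchange is in flight. The self-contained CPU sketch below illustrates the same idea; the function name step_with_overlap, the use of mpi4py, and the toy averaging stencil are assumptions for illustration only, not the actual GCE kernels or Grid synchronization API.

import numpy as np
from mpi4py import MPI

def step_with_overlap(u, pad, comm=MPI.COMM_WORLD):
    """One stencil sweep over a 1D ring-decomposed array with halo width pad."""
    rank, size = comm.Get_rank(), comm.Get_size()
    left, right = (rank - 1) % size, (rank + 1) % size
    n = len(u)
    assert n > 4 * pad  # Need a non-empty core region.
    new = u.copy()

    def sweep(lo, hi):  # Average of the two neighbors, standing in for the kernel.
        return 0.5 * (u[lo - 1:hi - 1] + u[lo + 1:hi + 1])

    # Edge slabs first, so their fresh values can be shipped immediately.
    new[pad:2 * pad] = sweep(pad, 2 * pad)
    new[n - 2 * pad:n - pad] = sweep(n - 2 * pad, n - pad)

    # Start the non-blocking halo exchange (the analogue of synchronize_start)...
    reqs = [comm.Isend(new[pad:2 * pad].copy(), dest=left, tag=0),
            comm.Isend(new[n - 2 * pad:n - pad].copy(), dest=right, tag=1),
            comm.Irecv(new[n - pad:], source=right, tag=0),
            comm.Irecv(new[:pad], source=left, tag=1)]

    # ...and overlap it with the bulk of the work in the core region.
    new[2 * pad:n - 2 * pad] = sweep(2 * pad, n - 2 * pad)

    # By now the exchange should already have finished, so this wait is cheap.
    MPI.Request.Waitall(reqs)
    return new

As in execute, the scheme only pays off when the halo exchange finishes no later than the core sweep, which is exactly what the small delay between comp_done and sync_done is meant to confirm.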