def _test_async_queue_read(async_queue, cuda_device):
    """Queue ``async_queue`` asynchronous reads on one aio handle and verify them.

    One reference file is written per queued request, one destination buffer
    is allocated per request (CUDA when ``cuda_device`` is truthy, pinned CPU
    memory otherwise), all reads are submitted before waiting, and each
    buffer is then compared against the bytes of its reference file.

    ``tmpdir`` is not a parameter; it has to be resolvable from an enclosing
    or module scope.
    """
    # Only the path of each reference write is needed for the read test.
    ref_writes = (_do_ref_write(tmpdir, idx) for idx in range(async_queue))
    ref_files = [ref_path for ref_path, _ in ref_writes]

    def _new_buffer():
        if cuda_device:
            return torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda')
        return torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu').pin_memory()

    aio_buffers = [_new_buffer() for _ in range(async_queue)]

    single_submit = True
    overlap_events = True
    h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
                                           QUEUE_DEPTH,
                                           single_submit,
                                           overlap_events,
                                           IO_PARALLEL)
    _validate_handle_state(h, single_submit, overlap_events)

    # Submit every read first; a submission is expected to report status 0.
    for dst_buffer, ref_path in zip(aio_buffers, ref_files):
        read_status = h.async_pread(dst_buffer, ref_path)
        assert read_status == 0

    # The test expects wait() to report one completion per queued request.
    wait_status = h.wait()
    assert wait_status == async_queue

    for dst_buffer, ref_path in zip(aio_buffers, ref_files):
        with open(ref_path, 'rb') as f:
            expected_bytes = list(f.read())
        assert expected_bytes == dst_buffer.tolist()
def __init__(self,
             swap_config,
             aio_config,
             base_folder,
             optimizer,
             largest_numel,
             device,
             dtype,
             timers):
    """Set up the swapper with a single aio handle shared with gradient swapping.

    All arguments are forwarded unchanged to the base-class initializer.
    ``aio_config`` is additionally indexed with the ``AIO_*`` keys to
    configure the aio handle created here.
    """
    super(PartitionedOptimizerSwapper,
          self).__init__(swap_config,
                         aio_config,
                         base_folder,
                         optimizer,
                         largest_numel,
                         device,
                         dtype,
                         timers)

    make_handle = AsyncIOBuilder().load().aio_handle
    handle_keys = (AIO_BLOCK_SIZE,
                   AIO_QUEUE_DEPTH,
                   AIO_SINGLE_SUBMIT,
                   AIO_OVERLAP_EVENTS,
                   AIO_THREAD_COUNT)
    self.aio_handle = make_handle(*[aio_config[key] for key in handle_keys])

    # Overlap swapping out
    self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.aio_handle,
                                               numel_alignment=self.numel_alignment,
                                               timers=self.timers)

    # Keep the handle/swapper objects out of the object dump below.
    self.print_exclude_list += [
        'aio_handle',
        'gradient_swapper',
        'print_exclude_list',
    ]

    if torch.distributed.get_rank() != 0:
        return

    print_object(obj=self,
                 name='PartitionedOptimizerSwapper',
                 exclude_list=self.print_exclude_list)
def _test_async_read(single_submit, overlap_events, cuda_device):
    """Read one reference file asynchronously and check the buffer contents.

    The destination buffer lives on CUDA when ``cuda_device`` is truthy and
    in pinned CPU memory otherwise. ``single_submit`` and ``overlap_events``
    are passed to the aio handle and re-checked by ``_validate_handle_state``.

    ``tmpdir`` is not a parameter; it has to be resolvable from an enclosing
    or module scope.
    """
    ref_file, _ = _do_ref_write(tmpdir)

    device_name = 'cuda' if cuda_device else 'cpu'
    aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=device_name)
    if device_name == 'cpu':
        # Host buffers are pinned before being handed to the aio handle.
        aio_buffer = aio_buffer.pin_memory()

    aio_op = AsyncIOBuilder().load()
    h = aio_op.aio_handle(BLOCK_SIZE,
                          QUEUE_DEPTH,
                          single_submit,
                          overlap_events,
                          IO_PARALLEL)
    _validate_handle_state(h, single_submit, overlap_events)

    # Submission is expected to report 0; wait() is expected to report the
    # single completed request.
    read_status = h.async_pread(aio_buffer, ref_file)
    assert read_status == 0

    wait_status = h.wait()
    assert wait_status == 1

    with open(ref_file, 'rb') as f:
        expected_bytes = list(f.read())
    assert expected_bytes == aio_buffer.tolist()
def pre_handle(args, tid, read_op):
    """Allocate the I/O buffer and aio handle for one task and return its context.

    Args:
        args: Parsed options; this function reads ``read_file``,
            ``write_file``, ``write_size``, ``gpu``, ``io_parallel``,
            ``block_size``, ``queue_depth``, ``single_submit`` and
            ``overlap_events`` from it.
        tid: Task id, used for logging and as the suffix of the write file.
        read_op: Truthy for a read task, falsy for a write task.

    Returns:
        A dict with keys ``file``, ``num_bytes``, ``handle``, ``buffer`` and
        ``elapsed_sec`` (initialised to 0).
    """
    if read_op:
        io_string = "Read"
        num_bytes = os.path.getsize(args.read_file)
        target_file = args.read_file
    else:
        io_string = "Write"
        num_bytes = args.write_size
        # Each task writes to its own file, distinguished by the task id.
        target_file = f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    if args.gpu:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
    else:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(
        tid,
        f'{io_string} file {target_file} of size {num_bytes} bytes from buffer on device {buffer.device}'
    )

    # A missing or zero io_parallel falls back to 1.
    io_parallel = args.io_parallel or 1
    aio_op = AsyncIOBuilder().load()
    handle = aio_op.aio_handle(args.block_size,
                               args.queue_depth,
                               args.single_submit,
                               args.overlap_events,
                               io_parallel)
    task_log(tid, 'created deepspeed aio handle')

    return {
        'file': target_file,
        'num_bytes': num_bytes,
        'handle': handle,
        'buffer': buffer,
        'elapsed_sec': 0,
    }
def _test_parallel_write(single_submit, overlap_events):
    """Write a buffer with ``sync_pwrite`` and compare the file to a reference.

    A reference file and buffer are produced first. The same buffer contents
    are then written through an aio handle, and the resulting file must exist
    and be byte-identical to the reference file.

    ``tmpdir`` is not a parameter; it has to be resolvable from an enclosing
    or module scope.
    """
    ref_file, ref_buffer = _do_ref_write(tmpdir)
    aio_file, aio_buffer = _get_test_file_and_buffer(tmpdir, ref_buffer, False)

    aio_op = AsyncIOBuilder().load()
    handle = aio_op.aio_handle(BLOCK_SIZE,
                               QUEUE_DEPTH,
                               single_submit,
                               overlap_events,
                               IO_PARALLEL)
    _validate_handle_state(handle, single_submit, overlap_events)

    # The test expects the synchronous write to report 1.
    write_status = handle.sync_pwrite(aio_buffer, aio_file)
    assert write_status == 1

    assert os.path.isfile(aio_file)

    # Drop cached comparison results so the files are compared afresh.
    filecmp.clear_cache()
    assert filecmp.cmp(ref_file, aio_file, shallow=False)
def __init__(self,
             swap_config,
             aio_config,
             base_folder,
             optimizer,
             largest_numel,
             device,
             dtype,
             timers):
    """Set up the swapper with separate aio handles for writes and reads.

    All arguments are forwarded unchanged to the base-class initializer.
    ``aio_config`` is additionally indexed with the ``AIO_*`` keys to
    configure both handles, and ``swap_config`` supplies the pipeline
    read/write flags.
    """
    super(PipelinedOptimizerSwapper,
          self).__init__(swap_config,
                         aio_config,
                         base_folder,
                         optimizer,
                         largest_numel,
                         device,
                         dtype,
                         timers)

    aio_op = AsyncIOBuilder().load()

    def _new_handle():
        # Both handles are built from the same aio_config entries.
        return aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE],
                                 aio_config[AIO_QUEUE_DEPTH],
                                 aio_config[AIO_SINGLE_SUBMIT],
                                 aio_config[AIO_OVERLAP_EVENTS],
                                 aio_config[AIO_THREAD_COUNT])

    self.write_aio_handle = _new_handle()
    self.read_aio_handle = _new_handle()

    # Overlap gradient swap out
    self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.write_aio_handle,
                                               numel_alignment=self.numel_alignment,
                                               timers=self.timers)

    self.async_swap_in = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_READ]
    self.async_swap_out = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_WRITE]

    # One slot per swap operation kind, all initially empty.
    self.swap_ops = dict.fromkeys(
        [SYNC_SWAP_IN, ASYNC_SWAP_IN, SYNC_SWAP_OUT, ASYNC_SWAP_OUT])

    # Keep the handle/swapper objects out of the object dump below.
    self.print_exclude_list += [
        'gradient_swapper',
        'read_aio_handle',
        'write_aio_handle',
        'swap_ops',
        'print_exclude_list',
    ]

    if torch.distributed.get_rank() != 0:
        return

    print_object(obj=self,
                 name='PipelinedOptimizerSwapper',
                 exclude_list=self.print_exclude_list)
def main_basic_write(pool_params):
    """Run one timed ``aio_write`` for a task and accumulate its duration.

    Args:
        pool_params: A 3-tuple ``(args, tid, ctxt)``. ``args`` supplies
            ``block_size``, ``queue_depth``, ``single_submit``,
            ``overlap_events`` and ``validate``; ``ctxt`` supplies the
            ``buffer`` and ``file`` to write and carries ``elapsed_sec``.

    Returns:
        The same ``ctxt`` dict, with the write duration (in seconds) added to
        ``ctxt['elapsed_sec']``.
    """
    args, _tid, ctxt = pool_params

    # Resolve the op before starting the clock so that building/loading the
    # extension is not counted as write time.
    aio_write = AsyncIOBuilder().load().aio_write

    # perf_counter() is monotonic and high resolution, so the measured
    # interval cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    aio_write(ctxt['buffer'],
              ctxt['file'],
              args.block_size,
              args.queue_depth,
              args.single_submit,
              args.overlap_events,
              args.validate)
    ctxt['elapsed_sec'] += time.perf_counter() - start_time

    return ctxt
def _test_async_queue_write(async_queue, cuda_device):
    """Queue ``async_queue`` asynchronous writes on one aio handle and verify them.

    A reference file/buffer pair is produced per request, a matching test
    file/buffer pair is derived from each reference buffer, all writes are
    submitted before waiting, and every written file must then exist and be
    byte-identical to its reference file.

    ``tmpdir`` is not a parameter; it has to be resolvable from an enclosing
    or module scope.
    """
    ref_pairs = [_do_ref_write(tmpdir, idx) for idx in range(async_queue)]
    ref_files = [ref_path for ref_path, _ in ref_pairs]
    ref_buffers = [ref_data for _, ref_data in ref_pairs]

    aio_pairs = [
        _get_test_file_and_buffer(tmpdir, ref_data, cuda_device, idx)
        for idx, ref_data in enumerate(ref_buffers)
    ]
    aio_files = [aio_path for aio_path, _ in aio_pairs]
    aio_buffers = [aio_data for _, aio_data in aio_pairs]

    single_submit = True
    overlap_events = True
    h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
                                           QUEUE_DEPTH,
                                           single_submit,
                                           overlap_events,
                                           IO_PARALLEL)
    _validate_handle_state(h, single_submit, overlap_events)

    # Submit every write first; a submission is expected to report status 0.
    for src_buffer, aio_path in zip(aio_buffers, aio_files):
        write_status = h.async_pwrite(src_buffer, aio_path)
        assert write_status == 0

    # The test expects wait() to report one completion per queued request.
    wait_status = h.wait()
    assert wait_status == async_queue

    for ref_path, aio_path in zip(ref_files, aio_files):
        assert os.path.isfile(aio_path)

        # Drop cached comparison results so each pair is compared afresh.
        filecmp.clear_cache()
        assert filecmp.cmp(ref_path, aio_path, shallow=False)
def __init__(self, ds_config, model_dtype):
    """Initialise swapper bookkeeping and configure async I/O from ``ds_config``.

    Args:
        ds_config: Configuration object handed to ``_configure_aio``.
        model_dtype: Stored as ``self.dtype``.
    """
    # The handle factory and dtype are set before _configure_aio runs, which
    # sets up the swap buffers and creates the aio handles.
    self.aio_handle = AsyncIOBuilder().load(verbose=False).aio_handle
    self.dtype = model_dtype
    self._configure_aio(ds_config)

    # Per-parameter lookup tables, all keyed by param id:
    #   id_to_path              -> path
    #   param_id_to_buffer_id   -> buffer id
    #   param_id_to_swap_buffer -> swap buffer
    #   param_id_to_numel       -> number of elements in the param
    self.id_to_path = {}
    self.param_id_to_buffer_id = {}
    self.param_id_to_swap_buffer = {}
    self.param_id_to_numel = {}

    self.pending_writes = self.pending_reads = 0

    # Params and buffers with an async swap-in currently in flight.
    self.inflight_params, self.inflight_swap_in_buffers = [], []
    self.inflight_numel = 0

    # Params that are currently available.
    self.available_params = set()
    self.available_numel = 0

    # Used when swapping out from partitioned fp32 params.
    self.partitioned_swap_buffer = self.partitioned_swap_pool = None

    self.invalid_buffer = torch.tensor(1).half()

    if dist.get_rank() != 0:
        return

    print_object(obj=self,
                 name='AsyncPartitionedParameterSwapper',
                 exclude_list=['aio_read_handle',
                               'aio_write_handle',
                               'buffers'])
import deepspeed
from deepspeed.ops.aio import AsyncIOBuilder

# Fail immediately unless the async I/O op builder reports itself compatible.
# What "compatible" covers is decided by AsyncIOBuilder.is_compatible(), which
# is not shown here.
assert AsyncIOBuilder().is_compatible()
def async_io_setup():
    """Return the result of ``AsyncIOBuilder().is_compatible()``.

    The import happens inside the function, so ``deepspeed.ops.aio`` is only
    loaded when this check is actually invoked.
    """
    from deepspeed.ops.aio import AsyncIOBuilder

    builder = AsyncIOBuilder()
    return builder.is_compatible()