def to_map(obj):
    if isinstance(obj, torch.Tensor):
        if obj.device == torch.device("cuda", target_gpu):
            return (obj,)
        if not use_side_stream_for_tensor_copies:
            return (obj.to(target_gpu),)
        else:
            # Perform CPU -> GPU copies in a background stream. This code is
            # motivated from similar logic in torch/nn/parallel/_functions.py
            stream = _get_stream(target_gpu)
            with torch.cuda.stream(stream):
                output = obj.to(target_gpu)
            # Synchronize with the copy stream
            with torch.cuda.device(target_gpu):
                current_stream = torch.cuda.current_stream()
                # Sync the current stream with the copy stream
                current_stream.wait_stream(stream)
                # Ensure tensor memory is not reused until work on
                # the main stream is complete
                output.record_stream(current_stream)  # type: ignore[arg-type]
            return (output,)
    if _is_namedtuple(obj):
        return [type(obj)(*args) for args in zip(*map(to_map, obj))]
    if isinstance(obj, tuple) and len(obj) > 0:
        return list(zip(*map(to_map, obj)))
    if isinstance(obj, list) and len(obj) > 0:
        return [list(i) for i in zip(*map(to_map, obj))]
    if isinstance(obj, dict) and len(obj) > 0:
        return [type(obj)(i) for i in zip(*map(to_map, obj.items()))]
    return [obj]
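# --- Illustrative sketch (not part of the original code) ---
# A minimal, standalone example of the side-stream copy pattern used in
# `to_map` above: copy a CPU tensor to the GPU on a background stream, make
# the current stream wait on the copy, and call `record_stream` so the
# caching allocator does not reuse the output's memory too early. The helper
# name `copy_on_side_stream` is hypothetical; only public torch.cuda APIs
# are used.
import torch

def copy_on_side_stream(cpu_tensor, device_index=0):
    device = torch.device("cuda", device_index)
    copy_stream = torch.cuda.Stream(device=device)
    with torch.cuda.stream(copy_stream):
        # Pinned memory + non_blocking lets the copy overlap with other work.
        out = cpu_tensor.pin_memory().to(device, non_blocking=True)
    current = torch.cuda.current_stream(device)
    # Subsequent work on the current stream waits for the copy to finish ...
    current.wait_stream(copy_stream)
    # ... and the output's memory is tied to the current stream as well.
    out.record_stream(current)
    return out

if torch.cuda.is_available():
    print(copy_on_side_stream(torch.randn(4)).device)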
def forward(target_gpus, input):
    input_device = get_input_device(input)
    streams = None
    if input_device == -1 and target_gpus != [-1]:
        # Perform CPU to GPU copies in a background stream
        streams = [_get_stream(device) for device in target_gpus]
    outputs = scatter(input, target_gpus, streams)
    # Synchronize with the copy stream
    if streams is not None:
        synchronize_stream(outputs, target_gpus, streams)
    return tuple(outputs)
def to_map(obj):
    if isinstance(obj, torch.Tensor):
        if obj.device == torch.device("cuda", target_gpu):
            return (obj,)
        if not use_side_stream_for_tensor_copies:
            return (obj.to(target_gpu),)
        else:
            # Perform CPU -> GPU copies in a background stream. This code is
            # motivated from similar logic in torch/nn/parallel/_functions.py
            stream = _get_stream(target_gpu)
            with torch.cuda.stream(stream):
                output = obj.to(target_gpu)
            # Synchronize with the copy stream
            with torch.cuda.device(target_gpu):
                current_stream = torch.cuda.current_stream()
                # Sync the current stream with the copy stream
                current_stream.wait_stream(stream)
                # Ensure tensor memory is not reused until work on
                # the main stream is complete
                output.record_stream(current_stream)  # type: ignore[arg-type]
            return (output,)
    if _is_namedtuple(obj):
        return [type(obj)(*args) for args in zip(*map(to_map, obj))]
    if isinstance(obj, tuple) and len(obj) > 0:
        return list(zip(*map(to_map, obj)))
    if isinstance(obj, str):
        # Needs to be checked, otherwise it's taken as a sequence infinitely.
        # This is because the elements of a string are also strings, and so on.
        return [obj]
    if isinstance(obj, collections.abc.Sequence) and len(obj) > 0:
        try:
            return [type(obj)(i) for i in zip(*map(to_map, obj))]  # type: ignore[call-arg]
        except TypeError:
            # The sequence type may not support `__init__(iterable)` (e.g., `range`).
            return [list(i) for i in zip(*map(to_map, obj))]
    if isinstance(obj, collections.abc.Mapping) and len(obj) > 0:
        try:
            return [type(obj)(i) for i in zip(*map(to_map, obj.items()))]  # type: ignore[call-arg]
        except TypeError:
            # The mapping type may not support `__init__(iterable)`.
            return [dict(i) for i in zip(*map(to_map, obj.items()))]
    return [obj]
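# --- Illustrative sketch (not part of the original code) ---
# A toy, CPU-only illustration of the `zip(*map(...))` transposition that
# `to_map` relies on, including the string short-circuit that prevents
# infinite recursion. `leaf_map` and `recursive_map` are hypothetical names;
# the "device copy" is simulated here by doubling each number.
def leaf_map(x):
    return (x * 2,)  # each leaf becomes a 1-tuple, like `(obj.to(target_gpu),)`

def recursive_map(obj):
    if isinstance(obj, (int, float)):
        return leaf_map(obj)
    if isinstance(obj, str):
        # Without this check a string would be iterated as a sequence of
        # strings forever, exactly as the comment in `to_map` explains.
        return [obj]
    if isinstance(obj, (list, tuple)) and len(obj) > 0:
        return [type(obj)(i) for i in zip(*map(recursive_map, obj))]
    if isinstance(obj, dict) and len(obj) > 0:
        return [dict(i) for i in zip(*map(recursive_map, obj.items()))]
    return [obj]

print(recursive_map({"a": 1, "b": [2, 3]}))  # -> [{'a': 2, 'b': [4, 6]}]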
def forward(target_gpus, input):
    """Scatter the input to the target GPUs.

    Args:
        target_gpus (list[int]): device indices to scatter the input onto.
        input: tensor or (possibly nested) container of tensors to scatter.

    Returns:
        tuple: one scattered chunk of ``input`` per target GPU.
    """
    input_device = get_input_device(input)
    streams = None
    if input_device == -1:
        # Perform CPU to GPU copies in a background stream
        streams = [_get_stream(device) for device in target_gpus]
    outputs = scatter(input, target_gpus, streams)
    # Synchronize with the copy stream
    if streams is not None:
        synchronize_stream(outputs, target_gpus, streams)
    return tuple(outputs)
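# --- Illustrative sketch (not part of the original code) ---
# A simplified, hypothetical stand-in for the forward above: chunk a CPU
# tensor along dim 0, copy each chunk to its GPU on a per-device side stream,
# then make each device's current stream wait on its copy. `simple_scatter`
# is an assumption for illustration; the real `scatter`/`synchronize_stream`
# helpers additionally handle arbitrary nested containers.
import torch

def simple_scatter(cpu_tensor, target_gpus):
    chunks = cpu_tensor.chunk(len(target_gpus), dim=0)
    streams = [torch.cuda.Stream(device=d) for d in target_gpus]
    outputs = []
    for chunk, gpu, stream in zip(chunks, target_gpus, streams):
        with torch.cuda.stream(stream):
            outputs.append(
                chunk.pin_memory().to(torch.device("cuda", gpu), non_blocking=True))
    # Synchronize with the copy streams before the chunks are consumed.
    for out, gpu, stream in zip(outputs, target_gpus, streams):
        current = torch.cuda.current_stream(gpu)
        current.wait_stream(stream)
        out.record_stream(current)
    return tuple(outputs)

if torch.cuda.device_count() >= 2:
    parts = simple_scatter(torch.randn(8, 3), target_gpus=[0, 1])
    print([p.device for p in parts])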