def __init__(self,
             params,
             optim,
             group=None,
             broadcast_fp16=False,
             offload=False,
             device="gpu",
             **kw):
    super().__init__(optim._learning_rate, params, kw)

    # Segmentation information
    self._dtype_rank_params = OrderedDict()  # {dtype: [param1, param2]} device, rank, params
    self._param2rank = {}
    self.__segment_params = []
    self._rank_buffer_size = {}  # {dtype: {rank: numel+alignment}}
    self._param2align = {}  # {param.name: align}

    # Default information
    self._optim_defaults = kw
    self._optim = optim
    self._ori_parameter_list = self._optim._parameter_list
    self._ori_param_groups = self._optim._param_groups

    assert hasattr(self._optim, "_master_weights"), \
        "Must use optimizer with _master_weights attribute"
    self._local_params = params
    self._default_device = device
    self._pfp16 = len(
        list(
            filter(lambda x: x.trainable and x.dtype == Type.fp16.value,
                   self._local_params))) > 0

    self.group = dist.new_group(
        _get_global_group().ranks) if group is None else group

    self.world_size = self.group.nranks
    self.rank = self.group.rank
    self.broadcast_fp16 = broadcast_fp16
    self.param_storages = {}  # {dtype: {rank: InternalStorage}}

    if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
        logging.warning(
            "While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
        )
        self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip,
                                                  paddle.get_device(),
                                                  self.group)

    if offload:
        assert self._pfp16, "Only support offload strategy while using 'Adam', 'AdamW' and 'Momentum' optimizer with AMP/Pure FP16"

    self.offload = offload  # Used for offload
    self.offload_device = "cpu"
    self.offload_buffer_size = 0
    self.offload_param2align = {}
    self.offload_params = None
    self.offload_grads = None

    self._master_params = {}

    # Update optimizer parameters and adjust parameter storage and use according to rank.
    self._update_opt_status()
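# Usage sketch for the stage-2 sharded optimizer constructor above: a regular
# optimizer that exposes `_master_weights` (e.g. Adam/AdamW/Momentum) is
# wrapped so each rank only keeps its own slice of the optimizer state.
# The import path and the helper name below are assumptions for illustration,
# not taken from this code; adjust them to your Paddle version.
def _example_stage2_optimizer_usage():  # hypothetical helper
    import paddle
    import paddle.distributed as dist
    from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
        ShardingOptimizerStage2, )  # assumed path

    dist.init_parallel_env()
    model = paddle.nn.Sequential(paddle.nn.Linear(1024, 4096),
                                 paddle.nn.Linear(4096, 1024))
    inner_opt = paddle.optimizer.AdamW(learning_rate=1e-4,
                                       parameters=model.parameters())
    # group=None -> the wrapper builds a group over all global ranks,
    # exactly as the constructor above does with dist.new_group(...).
    sharded_opt = ShardingOptimizerStage2(params=model.parameters(),
                                          optim=inner_opt,
                                          offload=False)
    return model, sharded_opt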
def __init__(self,
             layer,
             optimizer,
             group=None,
             sync_buffers=False,
             device="gpu",
             pertrain_sync_models=True,
             accumulate_grads=False,
             offload=False,
             sync_comm=False):
    super().__init__()

    # Default configs
    assert core.is_compiled_with_cuda(), "Only support CUDA."

    self._layer = layer
    self._default_device = device
    self.__sync_buffers = sync_buffers
    self._accumulate_grads = accumulate_grads
    self._offload = offload
    self._sync_comm = sync_comm

    # Communication group establishment
    self._group = dist.new_group(
        _get_global_group().ranks) if group is None else group
    self._world_size_scaling = 1.0 / self._group.nranks
    assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1."
    self._rank = self._group.rank
    self._global_root_rank = 0  # picking rank 0 as the reference
    self._global_ranks = self._group.ranks
    self._param2buffer_size = dict()  # {param.name: size}
    self._param2buffer = dict()  # {param.name: [(start0, end0), (start1, end1), ...]}
    self._trainable_params = dict()  # {layer.name: [trainable_params]}

    assert not isinstance(optimizer, list), "Multiple optimizers are not supported now."
    self._optim = _OptimizerWrapper(optimizer, self._offload, self._group,
                                    self._update_params_slice)
    self._ori_parameter_list = self._optim._parameter_list
    self._ori_param_groups = self._optim._param_groups

    # Replace optimizer's _grad_clip
    if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
        logging.warning(
            "While using ClipGradByGlobalNorm in ShardingStage3, the grad clip of original optimizer will be changed."
        )
        self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip,
                                                  paddle.get_device(),
                                                  self._group)

    # Synchronize models across all ranks
    if pertrain_sync_models:
        self._sync_params_and_buffers()

    self._segment_rank_params(self._layer)

    # In the first step, record the execution order of the layers
    self._order_tracer = OrderedDict()
    self._order_tracer["order"] = 0
    self._order_tracer["layer"] = []

    # Register task flow
    self._task_flow = TaskFlow()

    # Register forward hooks
    self._register_forward_hooks(self._layer)

    # Register backward parameter hooks
    self._register_backward_hooks()

    # Redefine optimizer step and clear function
    self._redefine_opt_step()
    self._redefine_opt_clear()
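# Usage sketch for the stage-3 wrapper constructor above: the layer and its
# optimizer are handed over together, the wrapper re-segments parameters,
# registers the forward/backward hooks, and redefines step()/clear_grad(),
# so the training loop keeps its usual shape. The import path and helper
# name are assumptions for illustration.
def _example_stage3_usage():  # hypothetical helper
    import paddle
    import paddle.distributed as dist
    from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import (
        ShardingStage3, )  # assumed path

    dist.init_parallel_env()
    model = paddle.nn.Sequential(paddle.nn.Linear(1024, 4096),
                                 paddle.nn.Linear(4096, 1024))
    opt = paddle.optimizer.AdamW(learning_rate=1e-4,
                                 parameters=model.parameters())
    # sync_comm=True trades communication overlap for simpler debugging.
    model = ShardingStage3(model, optimizer=opt, sync_comm=True)

    x = paddle.randn([8, 1024])
    loss = model(x).mean()
    loss.backward()
    opt.step()        # redefined by _redefine_opt_step() in the constructor
    opt.clear_grad()  # likewise redefined by _redefine_opt_clear()
    return model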
def __init__(self,
             layer,
             sharding_optimizer,
             group=None,
             sync_buffers=False,
             buffer_max_size=2**23,  # 8MB
             auto_refresh_trainable=True,
             device="gpu"):
    super().__init__()

    # Training options
    self._layer = layer
    self._sharding_optimizers = [sharding_optimizer] if not isinstance(
        sharding_optimizer, list) else sharding_optimizer
    assert all(
        list(
            map(lambda opt: isinstance(opt, GroupShardedOptimizerStage2),
                self._sharding_optimizers))
    ), "Please use GroupShardedOptimizerStage2 optimizer"
    self._sync_buffers = sync_buffers
    self._auto_refresh_trainable = auto_refresh_trainable

    # Communication related attributes
    self._group = collective.new_group(
        collective._get_global_group().ranks) if group is None else group
    self._world_size_scaling = 1.0 / self._group.nranks
    assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1"
    self._rank = self._group.rank
    self._global_root_rank = self._group.ranks[0]  # picking ranks index 0 as the reference
    self._default_device = device

    # Global statistical parameters
    self._all_params = []
    for optim in self._sharding_optimizers:
        self._all_params.extend(list(optim.local_params))

    self._trainable_params = []
    self._grad_reduced = []
    self._trainable_param2rank = {}
    self._trainable_param2align = {}
    self._trainable_mask = list(map(_trainable, self._all_params))
    self._param_grads = []

    # Set grad storage size & display param sizes and model sizes
    model_size = sum([p._numel() for p in self._layer.parameters()])
    assert buffer_max_size >= 0, "buffer_max_size must be GE than 0."
    self._buffer_max_size = self._rank_buffer_size(buffer_max_size, model_size)
    self._use_grad_storage = buffer_max_size > 0
    self._grad_storages = {}  # {dtype: {rank: GradStorage}}
    self._has_grad_storage = []
    self._grad_storage_list = []

    # Offload
    # TODO(haohongxiang): Offload is not yet supported for multiple optimizers.
    self._offload_optims = list(
        filter(lambda optim: optim.offload, self._sharding_optimizers))
    if len(self._offload_optims) > 0:
        assert len(self._sharding_optimizers) == 1, \
            "Only support offload strategy for single optimizer"

    self._offload = len(self._offload_optims) > 0
    self._offload_device = "cpu"

    # Set backward pass hooks
    self._bw_hooks = []

    # TODO(Baibaifan): make the task flow support asynchronous communication
    # self._tasks_flow = deque()

    # Define optimizer step and clear_grad
    self._redefine_opt_step()
    self._redefine_opt_clear()
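# Usage sketch for the constructor above: the assert requires every element of
# `sharding_optimizer` to be a GroupShardedOptimizerStage2, so the two
# wrappers are meant to be stacked: first shard the optimizer state, then wrap
# the layer so gradients are reduced into per-rank GradStorage buffers sized
# by buffer_max_size (2**23 by default here). Import paths and the helper
# name are assumptions for illustration.
def _example_group_sharded_stage2_usage():  # hypothetical helper
    import paddle
    import paddle.distributed as dist
    from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
        GroupShardedOptimizerStage2, )  # assumed path
    from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import (
        GroupShardedStage2, )  # assumed path

    dist.init_parallel_env()
    model = paddle.nn.Linear(1024, 1024)
    opt = paddle.optimizer.AdamW(learning_rate=1e-4,
                                 parameters=model.parameters())
    opt = GroupShardedOptimizerStage2(params=model.parameters(), optim=opt)
    # buffer_max_size=0 would disable grad storage fusion entirely
    # (self._use_grad_storage = buffer_max_size > 0 above).
    model = GroupShardedStage2(model, opt, buffer_max_size=2**23)
    return model, opt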
def __init__(self,
             layer,
             sharding_optimizer,
             group=None,
             sync_buffers=False,
             pertrain_sync_models=True,
             buffer_max_size=2**23,  # 8MB
             auto_refresh_trainable=True,
             device="gpu",
             use_grad_storage=True,
             accumulate_grads=False):
    super().__init__()

    # Training options
    self._layer = layer
    self._sharding_optimizers = [sharding_optimizer] if not isinstance(
        sharding_optimizer, list) else sharding_optimizer
    assert all(
        list(
            map(lambda opt: isinstance(opt, ShardingOptimizerStage2),
                self._sharding_optimizers))
    ), "Please use ShardingOptimizerStage2 optimizer"
    self._sync_buffers = sync_buffers
    self._auto_refresh_trainable = auto_refresh_trainable

    # Gradient accumulation, gradient flip
    self._accumulate_grads = accumulate_grads

    # Communication related attributes
    self._group = dist.new_group(
        _get_global_group().ranks) if group is None else group
    self._world_size_scaling = 1.0 / self._group.nranks
    assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1"
    self._rank = self._group.rank
    self._global_root_rank = 0  # picking rank 0 as the reference
    self._default_device = device

    # Global statistical parameters
    self._all_params = list(
        chain(*[optim.local_params for optim in self._sharding_optimizers]))
    self._trainable_params = []
    self._grad_reduced = []
    self._trainable_param2rank = {}
    self._trainable_param2align = {}
    self._trainable_mask = list(map(_trainable, self._all_params))
    self._param_grads = []

    # Set grad storage size & display param sizes and model sizes
    model_size = sum(
        [np.prod(p.shape) for p in self._layer.parameters()]).item()
    self._buffer_max_size = self._rank_buffer_size(buffer_max_size, model_size)
    self._use_grad_storage = use_grad_storage
    self._grad_storages = {}  # {dtype: {rank: GradStorage}}
    self._has_grad_storage = []
    self._grad_storage_list = []

    # Offload
    # TODO(haohongxiang): Offload is not yet supported for multiple optimizers.
    self._offload_optims = list(
        filter(lambda optim: optim.offload, self._sharding_optimizers))
    if len(self._offload_optims) > 0:
        assert len(self._sharding_optimizers) == 1, \
            "Only support offload strategy for single optimizer"

    self._offload = self._sharding_optimizers[0].offload
    self._offload_device = "cpu"

    # Set backward pass hooks
    self._bw_hooks = []

    # Synchronize models across all ranks
    if pertrain_sync_models:
        self._sync_params_and_buffers()

    # Set tasks flow
    self._tasks_flow = deque()

    # Define optimizer step and clear_grad
    if self._accumulate_grads:
        self._redefine_opt_step()
    self._redefine_opt_clear()
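# Note in the constructor above that opt.step() is only redefined when
# accumulate_grads=True. Below is a hedged sketch of one common
# gradient-accumulation loop under that flag; `model` is assumed to already be
# wrapped in ShardingStage2 (accumulate_grads=True) and `opt` in
# ShardingOptimizerStage2, and the helper name is hypothetical.
def _example_stage2_accumulation(model, opt, loader, acc_steps=4):  # hypothetical helper
    import paddle

    for step, (x, y) in enumerate(loader):
        loss = paddle.nn.functional.mse_loss(model(x), y)
        loss = loss / acc_steps          # scale so the accumulated grad matches
        loss.backward()
        if (step + 1) % acc_steps == 0:
            opt.step()                   # updates the shard owned by this rank
            opt.clear_grad()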
def __init__(self,
             params,
             optim,
             group=None,
             offload=False,
             device="gpu",
             pertrain_sync_models=True,
             **kw):
    super().__init__(learning_rate=optim._learning_rate, parameters=params)

    assert core.is_compiled_with_cuda(), "Only GPU is supported now"

    # Segmentation information
    self._dtype_rank_params = OrderedDict()  # {dtype: [param1, param2]} device, rank, params
    self._param2rank = {}
    self.__segment_params = []
    self._rank_buffer_size = {}  # {dtype: {rank: numel+alignment}}
    self._param2align = {}  # {param.name: align}

    # Default information
    self._optim = optim

    assert hasattr(self._optim, "_master_weights"), \
        "Must use optimizer with _master_weights attribute"

    # Support parameter groups and a plain parameter list
    self._local_params = []
    if isinstance(params[0], dict):
        for param_group in params:
            self._local_params.extend(list(param_group["params"]))
    else:
        self._local_params.extend(list(params))

    self._default_device = device
    self._pfp16 = len(
        list(
            filter(lambda x: x.trainable and x.dtype == Type.fp16.value,
                   self._local_params))) > 0

    self._group = new_group(
        _get_global_group().ranks) if group is None else group

    self.world_size = self._group.nranks
    self._rank = self._group.rank
    self._global_root_rank = self._group.ranks[0]

    # Synchronize models across all ranks
    if pertrain_sync_models:
        self._sync_params_and_buffers()

    self.param_storages = {}  # {dtype: {rank: InternalStorage}}

    if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
        logging.warning(
            "While using ClipGradByGlobalNorm in GroupShardedOptimizerStage2, the grad clip of original optimizer will be changed."
        )
        self._optim._grad_clip = GroupShardedClipGrad(
            self._optim._grad_clip, paddle.get_device(), self._group)
        if self._optim._parameter_list and isinstance(
                self._optim._parameter_list[0], dict):
            for item in self._optim._param_groups:
                if "grad_clip" in item.keys():
                    item["grad_clip"] = self._optim._grad_clip

    if offload:
        assert self._pfp16, "Only support offload strategy while using 'Adam', 'AdamW' and 'Momentum' optimizer with AMP/Pure FP16"

    self.offload = offload  # Used for offload
    self.offload_device = "cpu"
    self.offload_buffer_size = 0
    self.offload_param2align = {}
    self.offload_params = None
    self.offload_grads = None
    self.dev_id = int(paddle.get_device().split(":")[1])

    self._master_params = {}

    # Update optimizer parameters and adjust parameter storage and use according to rank.
    self._update_opt_status()
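# The constructor above accepts either a flat parameter list or a list of
# parameter-group dicts (it inspects params[0]). A hedged sketch of the
# param-group form follows; the import path and helper name are assumptions
# for illustration.
def _example_param_groups_usage(model):  # hypothetical helper
    import paddle
    from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
        GroupShardedOptimizerStage2, )  # assumed path

    decay, no_decay = [], []
    for p in model.parameters():
        (no_decay if len(p.shape) == 1 else decay).append(p)
    param_groups = [
        {"params": decay, "weight_decay": 0.01},
        {"params": no_decay, "weight_decay": 0.0},
    ]
    opt = paddle.optimizer.AdamW(learning_rate=1e-4, parameters=param_groups)
    # Each group's "params" entry is flattened into self._local_params above.
    return GroupShardedOptimizerStage2(params=param_groups, optim=opt)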
def __init__(self,
             layer,
             optimizer,
             group=None,
             sync_buffers=False,
             device="gpu",
             segment_size=2**20,
             pertrain_sync_models=True,
             offload=False,
             sync_comm=False):
    super().__init__()

    # Default configs
    assert core.is_compiled_with_cuda(), "Only support CUDA."

    self._layer = layer
    self._default_device = device
    self.__sync_buffers = sync_buffers
    self._offload = offload
    self._sync_comm = sync_comm

    # Segmentation size
    assert segment_size >= 0, "segment_size must be GE than 0."
    self._segment_size = segment_size

    global DEV
    DEV = "cpu" if paddle.get_device() == "cpu" else paddle.get_device().split(":")[0]
    global DEV_ID
    DEV_ID = 0 if paddle.get_device() == "cpu" else int(
        paddle.get_device().split(":")[1])
    global param2dtype
    param2dtype = dict()

    # Communication group establishment
    self._group = collective.new_group(
        collective._get_global_group().ranks) if group is None else group
    self._world_size_scaling = 1.0 / self._group.nranks
    assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1."
    self._rank = self._group.rank
    self._global_root_rank = self._group.ranks[0]  # picking ranks index 0 as the reference

    # Parameter segmentation for global ranks
    # After flatten -> self._param2buffer_size, self._param2buffer, self._trainable_params
    self._param2buffer_size = dict()  # {param.name: size}
    self._param2buffer = dict()  # {param.name: [(start0, end0), (start1, end1), ...]}
    self._trainable_params = dict()  # {id(layer): [trainable_params]}
    self._unslice_params = set()  # param's numel <= segment_size
    self._unslice_params2align = dict()  # {param.name: param's align}
    self._grad_storages = dict()  # {param.dtype: GradStorage}

    assert not isinstance(optimizer, list), "Multiple optimizers are not supported now."
    self._optim = _OptimizerWrapper(optimizer, self._offload, self._group,
                                    self._update_params_slice)
    self._ori_parameter_list = self._optim._parameter_list
    self._ori_param_groups = self._optim._param_groups

    # Replace optimizer's _grad_clip
    if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
        logging.warning(
            "While using ClipGradByGlobalNorm in GroupShardedStage3, the grad clip of original optimizer will be changed."
        )
        self._optim._grad_clip = GroupShardedClipGrad(
            self._optim._grad_clip, paddle.get_device(), self._group)
        if self._optim._parameter_list and isinstance(
                self._optim._parameter_list[0], dict):
            for item in self._optim._param_groups:
                if "grad_clip" in item.keys():
                    item["grad_clip"] = self._optim._grad_clip

    # Synchronize models across all ranks
    if pertrain_sync_models:
        self._sync_params_and_buffers()

    self._segment_rank_params(self._layer)

    # Add unslice params to master_weight in fp16
    self._handle_unslice_params()

    # In the first step, record the execution order of the layers
    self._order_tracer = OrderedDict()
    self._order_tracer["order"] = 0
    self._order_tracer["layer"] = list()

    # Register task flow
    self._task_flow = TaskFlow()

    # Register forward hooks
    self._register_forward_hooks(self._layer)

    # Register backward parameter hooks
    self._register_backward_hooks()

    # Redefine optimizer step and clear function
    self._redefine_opt_step()
    self._redefine_opt_clear()
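# In the constructor above, segment_size (2**20 by default) decides which
# parameters get sliced across ranks: anything with numel <= segment_size
# stays whole in self._unslice_params. A hedged direct-construction sketch;
# the import path and helper name are assumptions, and in recent Paddle
# releases paddle.distributed.sharding.group_sharded_parallel is the usual
# public entry point to the same functionality.
def _example_group_sharded_stage3_usage():  # hypothetical helper
    import paddle
    import paddle.distributed as dist
    from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import (
        GroupShardedStage3, )  # assumed path

    dist.init_parallel_env()
    model = paddle.nn.Sequential(paddle.nn.Linear(1024, 4096),
                                 paddle.nn.Linear(4096, 1024))
    opt = paddle.optimizer.AdamW(learning_rate=1e-4,
                                 parameters=model.parameters())
    # A larger segment_size means fewer sliced parameters and fewer gathers,
    # at the cost of a bigger resident footprint per rank.
    model = GroupShardedStage3(model,
                               optimizer=opt,
                               segment_size=2**20,
                               sync_comm=False)
    return model, opt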