def get_dynamic_loss_scale_args(param_dict):
    loss_scale_args = None
    if get_fp16_enabled(param_dict):
        fp16_dict = param_dict[FP16]
        dynamic_loss_args = [
            FP16_INITIAL_SCALE_POWER,
            FP16_LOSS_SCALE_WINDOW,
            FP16_MIN_LOSS_SCALE,
            FP16_HYSTERESIS
        ]
        if any(arg in list(fp16_dict.keys()) for arg in dynamic_loss_args):
            init_scale = get_scalar_param(fp16_dict,
                                          FP16_INITIAL_SCALE_POWER,
                                          FP16_INITIAL_SCALE_POWER_DEFAULT)
            scale_window = get_scalar_param(fp16_dict,
                                            FP16_LOSS_SCALE_WINDOW,
                                            FP16_LOSS_SCALE_WINDOW_DEFAULT)
            delayed_shift = get_scalar_param(fp16_dict,
                                             FP16_HYSTERESIS,
                                             FP16_HYSTERESIS_DEFAULT)
            min_loss_scale = get_scalar_param(fp16_dict,
                                              FP16_MIN_LOSS_SCALE,
                                              FP16_MIN_LOSS_SCALE_DEFAULT)
            loss_scale_args = {
                INITIAL_LOSS_SCALE: 2**init_scale,
                SCALE_WINDOW: scale_window,
                DELAYED_SHIFT: delayed_shift,
                MIN_LOSS_SCALE: min_loss_scale
            }
    return loss_scale_args
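
# Illustrative only: a hypothetical "fp16" section of the kind
# get_dynamic_loss_scale_args() parses. This assumes the FP16_* constants
# resolve to the JSON keys shown below; the values are made-up examples.
_EXAMPLE_FP16_CONFIG = {
    "fp16": {
        "enabled": True,
        "initial_scale_power": 16,  # -> INITIAL_LOSS_SCALE = 2**16
        "loss_scale_window": 1000,  # -> SCALE_WINDOW
        "hysteresis": 2,            # -> DELAYED_SHIFT
        "min_loss_scale": 1,        # -> MIN_LOSS_SCALE
    }
}
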
def get_allgather_size(param_dict):
    # Treat non-positive values as "unset" and fall back to the default.
    allgather_size = get_scalar_param(param_dict, ALLGATHER_SIZE, ALLGATHER_SIZE_DEFAULT)
    return allgather_size if allgather_size > 0 else ALLGATHER_SIZE_DEFAULT
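
# The get_scalar_param() helper used throughout these readers is defined in
# DeepSpeed's config utilities rather than here. A minimal sketch of its
# assumed behavior (a plain dict lookup that falls back to the supplied
# default) would be:
#
#     def get_scalar_param(param_dict, param_name, param_default_value):
#         return param_dict.get(param_name, param_default_value)
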
def _initialize(self, zero_config_dict):
    self.stage = get_scalar_param(zero_config_dict,
                                  ZERO_OPTIMIZATION_STAGE,
                                  ZERO_OPTIMIZATION_STAGE_DEFAULT)
    self.contiguous_gradients = get_scalar_param(
        zero_config_dict,
        ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS,
        ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT)
    self.reduce_bucket_size = get_scalar_param(
        zero_config_dict,
        ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE,
        ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT)
    self.reduce_scatter = get_scalar_param(
        zero_config_dict,
        ZERO_OPTIMIZATION_REDUCE_SCATTER,
        ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT)
    self.overlap_comm = get_scalar_param(
        zero_config_dict,
        ZERO_OPTIMIZATION_OVERLAP_COMM,
        ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT)
    self.allgather_partitions = get_scalar_param(
        zero_config_dict,
        ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS,
        ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT)
    self.allgather_bucket_size = get_scalar_param(
        zero_config_dict,
        ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE,
        ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT)
    self.load_from_fp32_weights = get_scalar_param(
        zero_config_dict,
        ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS,
        ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT)
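
# Illustrative only: a hypothetical "zero_optimization" sub-dictionary of the
# kind this _initialize() consumes. Assumes the ZERO_OPTIMIZATION_* constants
# resolve to the JSON keys shown; the values are examples, not recommendations.
_EXAMPLE_ZERO_CONFIG = {
    "stage": 2,
    "contiguous_gradients": True,
    "reduce_bucket_size": 500000000,
    "reduce_scatter": True,
    "overlap_comm": False,
    "allgather_partitions": True,
    "allgather_bucket_size": 500000000,
    "load_from_fp32_weights": True,
}
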
def get_gradient_clipping(param_dict):
    grad_clip = get_optimizer_gradient_clipping(param_dict)
    if grad_clip is not None:
        return grad_clip
    else:
        return get_scalar_param(param_dict, GRADIENT_CLIPPING, GRADIENT_CLIPPING_DEFAULT)


def get_loss_scale(param_dict):
    if get_fp16_enabled(param_dict):
        return get_scalar_param(param_dict[FP16],
                                FP16_LOSS_SCALE,
                                FP16_LOSS_SCALE_DEFAULT)
    else:
        return FP16_LOSS_SCALE_DEFAULT


def get_tensorboard_job_name(param_dict):
    if get_tensorboard_enabled(param_dict):
        return get_scalar_param(param_dict[TENSORBOARD],
                                TENSORBOARD_JOB_NAME,
                                TENSORBOARD_JOB_NAME_DEFAULT)
    else:
        return TENSORBOARD_JOB_NAME_DEFAULT


def get_tensorboard_enabled(param_dict):
    if TENSORBOARD in param_dict.keys():
        return get_scalar_param(param_dict[TENSORBOARD],
                                TENSORBOARD_ENABLED,
                                TENSORBOARD_ENABLED_DEFAULT)
    else:
        return False


def get_tensorboard_output_path(param_dict):
    if get_tensorboard_enabled(param_dict):
        return get_scalar_param(param_dict[TENSORBOARD],
                                TENSORBOARD_OUTPUT_PATH,
                                TENSORBOARD_OUTPUT_PATH_DEFAULT)
    else:
        return TENSORBOARD_OUTPUT_PATH_DEFAULT


def get_initial_dynamic_scale(param_dict):
    if get_fp16_enabled(param_dict):
        initial_scale_power = get_scalar_param(param_dict[FP16],
                                               FP16_INITIAL_SCALE_POWER,
                                               FP16_INITIAL_SCALE_POWER_DEFAULT)
    else:
        initial_scale_power = FP16_INITIAL_SCALE_POWER_DEFAULT
    return 2**initial_scale_power
def read_zero_config_deprecated(self, param_dict):
    zero_config_dict = {}
    zero_config_dict[ZERO_OPTIMIZATION_STAGE] = 1 if param_dict[ZERO_OPTIMIZATION] else 0
    if zero_config_dict[ZERO_OPTIMIZATION_STAGE] > 0:
        zero_config_dict[ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE] = get_scalar_param(
            param_dict,
            ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED,
            ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT)
    logger.warning(
        'DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. Please use the following format: {}'
        .format(ZERO_FORMAT))
    return zero_config_dict
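
# Illustrative only: the deprecated form sets "zero_optimization" to a boolean
# at the top level (interpreted above as stage 1 when true), while the current
# form nests a dictionary; see ZERO_FORMAT for the full schema. Hypothetical
# minimal examples:
_EXAMPLE_ZERO_DEPRECATED_STYLE = {"zero_optimization": True}
_EXAMPLE_ZERO_CURRENT_STYLE = {"zero_optimization": {"stage": 1}}
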
def _initialize(self, act_chkpt_config_dict):
    self.partition_activations = get_scalar_param(
        act_chkpt_config_dict,
        ACT_CHKPT_PARTITION_ACTIVATIONS,
        ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT)
    self.contiguous_memory_optimization = get_scalar_param(
        act_chkpt_config_dict,
        ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION,
        ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT)
    self.cpu_checkpointing = get_scalar_param(
        act_chkpt_config_dict,
        ACT_CHKPT_CPU_CHECKPOINTING,
        ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT)
    self.number_checkpoints = get_scalar_param(
        act_chkpt_config_dict,
        ACT_CHKPT_NUMBER_CHECKPOINTS,
        ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT)
    self.profile = get_scalar_param(act_chkpt_config_dict,
                                    ACT_CHKPT_PROFILE,
                                    ACT_CHKPT_PROFILE_DEFAULT)
    self.synchronize_checkpoint_boundary = get_scalar_param(
        act_chkpt_config_dict,
        ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY,
        ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT)
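
# Illustrative only: a hypothetical "activation_checkpointing" sub-dictionary
# of the kind this _initialize() consumes. Assumes the ACT_CHKPT_* constants
# resolve to the JSON keys shown; the values are placeholders.
_EXAMPLE_ACT_CHKPT_CONFIG = {
    "partition_activations": True,
    "contiguous_memory_optimization": False,
    "cpu_checkpointing": False,
    "number_checkpoints": None,
    "profile": False,
    "synchronize_checkpoint_boundary": False,
}
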
def get_sparse_gradients_enabled(param_dict):
    return get_scalar_param(param_dict, SPARSE_GRADIENTS, SPARSE_GRADIENTS_DEFAULT)


def get_gradient_accumulation_steps(param_dict):
    return get_scalar_param(param_dict,
                            GRADIENT_ACCUMULATION_STEPS,
                            GRADIENT_ACCUMULATION_STEPS_DEFAULT)


def get_prescale_gradients(param_dict):
    return get_scalar_param(param_dict, PRESCALE_GRADIENTS, PRESCALE_GRADIENTS_DEFAULT)


def get_allreduce_always_fp32(param_dict):
    return get_scalar_param(param_dict, FP32_ALLREDUCE, FP32_ALLREDUCE_DEFAULT)


def get_zero_max_elements_per_comm(param_dict):
    return get_scalar_param(param_dict,
                            ZERO_MAX_ELEMENTS_PER_COMM,
                            ZERO_MAX_ELEMENTS_PER_COMM_DEFAULT)


def get_steps_per_print(param_dict):
    return get_scalar_param(param_dict, STEPS_PER_PRINT, STEPS_PER_PRINT_DEFAULT)


def get_train_micro_batch_size_per_gpu(param_dict):
    return get_scalar_param(param_dict,
                            TRAIN_MICRO_BATCH_SIZE_PER_GPU,
                            TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT)


def get_fp16_enabled(param_dict):
    if FP16 in param_dict.keys():
        return get_scalar_param(param_dict[FP16], FP16_ENABLED, FP16_ENABLED_DEFAULT)
    else:
        return False


def get_amp_enabled(param_dict):
    if AMP in param_dict.keys():
        return get_scalar_param(param_dict[AMP], AMP_ENABLED, AMP_ENABLED_DEFAULT)
    else:
        return False


def get_train_batch_size(param_dict):
    return get_scalar_param(param_dict, TRAIN_BATCH_SIZE, TRAIN_BATCH_SIZE_DEFAULT)
def get_dump_state(param_dict):
    return get_scalar_param(param_dict, DUMP_STATE, DUMP_STATE_DEFAULT)


def get_disable_allgather(param_dict):
    return get_scalar_param(param_dict, DISABLE_ALLGATHER, DISABLE_ALLGATHER_DEFAULT)


def get_zero_optimization(param_dict):
    return get_scalar_param(param_dict, ZERO_OPTIMIZATION, ZERO_OPTIMIZATION_DEFAULT)


def get_wall_clock_breakdown(param_dict):
    return get_scalar_param(param_dict,
                            WALL_CLOCK_BREAKDOWN,
                            WALL_CLOCK_BREAKDOWN_DEFAULT)


def get_zero_reduce_scatter(param_dict):
    return get_scalar_param(param_dict, ZERO_REDUCE_SCATTER, ZERO_REDUCE_SCATTER_DEFAULT)


def get_memory_breakdown(param_dict):
    return get_scalar_param(param_dict, MEMORY_BREAKDOWN, MEMORY_BREAKDOWN_DEFAULT)


def get_zero_allow_untested_optimizer(param_dict):
    return get_scalar_param(param_dict,
                            ZERO_ALLOW_UNTESTED_OPTIMIZER,
                            ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT)


def get_gradient_predivide_factor(param_dict):
    return get_scalar_param(param_dict,
                            GRADIENT_PREDIVIDE_FACTOR,
                            GRADIENT_PREDIVIDE_FACTOR_DEFAULT)