def pre_train_init(self, num_training_steps): """ Collection of all the callback functions called before initializing the training. See ~transformers.Trainer.train() for more details. Args: num_training_steps (:obj:`int`): total number of training which are calculated by number of mini batches per epoch * number of epochs. """ self.state = TrainerState() self.create_optimizer_and_scheduler(num_training_steps) if self.use_min_lr_scheduler is not None: self.lr_scheduler = get_linear_schedule_with_minlr( self.optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps, min_lr=self.min_lr) self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.control = self.callback_handler.on_train_begin( self.args, self.state, self.control) self._total_loss_scalar = 0.0 self._globalstep_last_logged = self.state.global_step
def test_reporting(): reports = [] def _fake_report(**kwargs): reports.append(kwargs) with patch("ray.train.report", _fake_report): state = TrainerState() report_callback = TrainReportCallback() report_callback.on_epoch_begin(None, state, None) state.epoch = 0.5 report_callback.on_log(None, state, None, logs={"log1": 1}) state.epoch = 1 report_callback.on_log(None, state, None, logs={"log2": 1}) report_callback.on_epoch_end(None, state, None) report_callback.on_epoch_begin(None, state, None) state.epoch = 1.5 report_callback.on_log(None, state, None, logs={"log1": 1}) state.epoch = 2 report_callback.on_log(None, state, None, logs={"log2": 1}) report_callback.on_epoch_end(None, state, None) report_callback.on_train_end(None, state, None) assert len(reports) == 2 assert "log1" in reports[0] assert "log2" in reports[0] assert reports[0]["epoch"] == 1 assert "log1" in reports[1] assert "log2" in reports[1] assert reports[1]["epoch"] == 2
def train( self, resume_from_checkpoint: Optional[str] = None, trial: Union["optuna.Trial", Dict[str, Any]] = None, **kwargs, ): """ Main training entry point. Args: resume_from_checkpoint (:obj:`str`, `optional`): Local path to a saved checkpoint as saved by a previous instance of :class:`~transformers.Trainer`. If present, training will resume from the model/optimizer/scheduler states loaded here. trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`): The trial run or the hyperparameter dictionary for hyperparameter search. kwargs: Additional keyword arguments used to hide deprecated arguments """ if "model_path" in kwargs: resume_from_checkpoint = kwargs.pop("model_path") warnings.warn( "`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` " "instead.", FutureWarning, ) if len(kwargs) > 0: raise TypeError( f"train() received got unexpected keyword arguments: {', '.join(list(kwargs.keys()))}." ) # This might change the seed so needs to run first. self._hp_search_setup(trial) # Model re-init model_reloaded = False if self.model_init is not None: # Seed must be set before instantiating the model when using model_init. set_seed(self.args.seed) self.model = self.call_model_init(trial) model_reloaded = True # Reinitializes optimizer and scheduler self.optimizer, self.lr_scheduler = None, None # Load potential model checkpoint if resume_from_checkpoint is not None and os.path.isfile( os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): logger.info(f"Loading model from {resume_from_checkpoint}).") if isinstance(self.model, PreTrainedModel): self.model = self.model.from_pretrained(resume_from_checkpoint) model_reloaded = True else: state_dict = torch.load( os.path.join(resume_from_checkpoint, WEIGHTS_NAME)) self.model.load_state_dict(state_dict) # If model was re-initialized, put it on the right device and update self.model_wrapped if model_reloaded: if not self.is_model_parallel: self.model = self.model.to(self.args.device) self.model_wrapped = self.model # Keeping track whether we can can len() on the dataset or not train_dataset_is_sized = isinstance(self.train_dataset, collections.abc.Sized) # Data loader and number of training steps train_dataloader = self.get_train_dataloader() # Setting up training control variables: # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps if train_dataset_is_sized: num_update_steps_per_epoch = len( train_dataloader) // self.args.gradient_accumulation_steps num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) if self.args.max_steps > 0: max_steps = self.args.max_steps num_train_epochs = self.args.max_steps // num_update_steps_per_epoch + int( self.args.max_steps % num_update_steps_per_epoch > 0) else: max_steps = math.ceil(self.args.num_train_epochs * num_update_steps_per_epoch) num_train_epochs = math.ceil(self.args.num_train_epochs) else: # see __init__. max_steps is set when the dataset has no __len__ max_steps = self.args.max_steps num_train_epochs = 1 num_update_steps_per_epoch = max_steps if self.args.deepspeed: model, optimizer, lr_scheduler = init_deepspeed( self, num_training_steps=max_steps) self.model = model.module self.model_wrapped = model # will get further wrapped in DDP self.deepspeed = model # DeepSpeedEngine object self.optimizer = optimizer self.lr_scheduler = lr_scheduler else: self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) model = self.model_wrapped # Harold: Removed this apex block # Mixed precision training with apex (torch < 1.6) # Multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Harold: Removed this distributed training block # Distributed training (should be after apex fp16 initialization) # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model # important: at this point: # self.model is the Transformers Model # self.model_wrapped is DDP(Transformers Model), DDP(Deepspeed(Transformers Model)), etc. # Harold: Removed this TPU/Distributed block # Train! world_size = 1 total_train_batch_size = self.args.train_batch_size * self.args.gradient_accumulation_steps * world_size num_examples = (self.num_examples(train_dataloader) if train_dataset_is_sized else total_train_batch_size * self.args.max_steps) logger.info("***** Running training *****") logger.info(f" Num examples = {num_examples}") logger.info(f" Num Epochs = {num_train_epochs}") logger.info( f" Instantaneous batch size per device = {self.args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}" ) logger.info( f" Gradient Accumulation steps = {self.args.gradient_accumulation_steps}" ) logger.info(f" Total optimization steps = {max_steps}") self.state.epoch = 0 start_time = time.time() epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if resume_from_checkpoint is not None and os.path.isfile( os.path.join(resume_from_checkpoint, "trainer_state.json")): self.state = TrainerState.load_from_json( os.path.join(resume_from_checkpoint, "trainer_state.json")) epochs_trained = self.state.global_step // num_update_steps_per_epoch if not self.args.ignore_data_skip: steps_trained_in_current_epoch = self.state.global_step % ( num_update_steps_per_epoch) steps_trained_in_current_epoch *= self.args.gradient_accumulation_steps else: steps_trained_in_current_epoch = 0 logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(f" Continuing training from epoch {epochs_trained}") logger.info( f" Continuing training from global step {self.state.global_step}" ) if not self.args.ignore_data_skip: logger.info( f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " "batches in the first epoch.") # Update the references self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader self.state.trial_name = self.hp_name( trial) if self.hp_name is not None else None self.state.trial_params = hp_params( trial) if trial is not None else None # This should be the same if the state has been saved but in case the training arguments changed, it's safer # to set this after the load. self.state.max_steps = max_steps self.state.num_train_epochs = num_train_epochs self.state.is_local_process_zero = self.is_local_process_zero() self.state.is_world_process_zero = self.is_world_process_zero() # tr_loss is a tensor to avoid synchronization of TPUs through .item() tr_loss = torch.tensor(0.0).to(self.args.device) # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses self._total_loss_scalar = 0.0 self._globalstep_last_logged = self.state.global_step self._total_flos = self.state.total_flos model.zero_grad() self.control = self.callback_handler.on_train_begin( self.args, self.state, self.control) # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not self.args.ignore_data_skip: for epoch in range(epochs_trained): # We just need to begin an iteration to create the randomization of the sampler. for _ in train_dataloader: break for epoch in range(epochs_trained, num_train_epochs): if isinstance(train_dataloader, DataLoader) and isinstance( train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) # Harold: Removed TPU support epoch_iterator = train_dataloader # Reset the past mems state at the beginning of each epoch if necessary. if self.args.past_index >= 0: self._past = None steps_in_epoch = (len(epoch_iterator) if train_dataset_is_sized else self.args.max_steps * self.args.gradient_accumulation_steps) self.control = self.callback_handler.on_epoch_begin( self.args, self.state, self.control) for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue if (step + 1) % self.args.gradient_accumulation_steps == 0: self.control = self.callback_handler.on_step_begin( self.args, self.state, self.control) if ((step + 1) % self.args.gradient_accumulation_steps != 0) and self.args.local_rank != -1: # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. with model.no_sync(): tr_loss += self.training_step(model, inputs) else: tr_loss += self.training_step(model, inputs) self._total_flos += self.floating_point_ops(inputs) if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps steps_in_epoch <= self.args.gradient_accumulation_steps and (step + 1) == steps_in_epoch): # Gradient clipping if self.args.max_grad_norm is not None and self.args.max_grad_norm > 0 and not self.deepspeed: # deepspeed does its own clipping # Harold: Removed AMP integration if hasattr(self.optimizer, "clip_grad_norm"): # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping self.optimizer.clip_grad_norm( self.args.max_grad_norm) # Harold: Reverted to vanilla steps # Optimizer step self.optimizer.step() self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 self.state.epoch = epoch + (step + 1) / steps_in_epoch self.control = self.callback_handler.on_step_end( self.args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch) if self.control.should_epoch_stop or self.control.should_training_stop: break self.control = self.callback_handler.on_epoch_end( self.args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch) # Harold: Removed TPU support if self.args.tpu_metrics_debug or self.args.debug: logger.warning( "You enabled PyTorch/XLA debug metrics but you don't have a TPU " "configured. Check your training configuration if this is unexpected." ) if self.control.should_training_stop: break if self.args.past_index and hasattr(self, "_past"): # Clean the state at the end of training delattr(self, "_past") logger.info( "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n" ) if self.args.load_best_model_at_end and self.state.best_model_checkpoint is not None: logger.info( f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." ) if isinstance(self.model, PreTrainedModel): self.model = self.model.from_pretrained( self.state.best_model_checkpoint) if not self.is_model_parallel: self.model = self.model.to(self.args.device) else: state_dict = torch.load( os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)) self.model.load_state_dict(state_dict) if self.deepspeed: self.deepspeed.load_checkpoint( self.state.best_model_checkpoint, load_optimizer_states=False, load_lr_scheduler_states=False) metrics = speed_metrics("train", start_time, self.state.max_steps) if self._total_flos is not None: self.store_flos() metrics["total_flos"] = self.state.total_flos self.log(metrics) self.control = self.callback_handler.on_train_end( self.args, self.state, self.control) # add remaining tr_loss self._total_loss_scalar += tr_loss.item() return TrainOutput(self.state.global_step, self._total_loss_scalar / self.state.global_step, metrics)
def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", Dict[str, Any]] = None): """ Main training entry point. Args: model_path (:obj:`str`, `optional`): Local path to the model if the model to train has been instantiated from a local path. If present, training will resume from the optimizer/scheduler states loaded here. trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`): The trial run or the hyperparameter dictionary for hyperparameter search. The main difference between ours and Huggingface's original implementation is that we also load model_args when reloading best checkpoints for evaluation. """ # This might change the seed so needs to run first. self._hp_search_setup(trial) # Model re-init if self.model_init is not None: # Seed must be set before instantiating the model when using model_init. set_seed(self.args.seed) model = self.call_model_init(trial) if not self.is_model_parallel: model = model.to(self.args.device) self.model = model self.model_wrapped = model # Reinitializes optimizer and scheduler self.optimizer, self.lr_scheduler = None, None # Keeping track whether we can can len() on the dataset or not train_dataset_is_sized = isinstance(self.train_dataset, collections.abc.Sized) # Data loader and number of training steps train_dataloader = self.get_train_dataloader() # Setting up training control variables: # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps if train_dataset_is_sized: num_update_steps_per_epoch = len(train_dataloader) // self.args.gradient_accumulation_steps num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) if self.args.max_steps > 0: max_steps = self.args.max_steps num_train_epochs = self.args.max_steps // num_update_steps_per_epoch + int( self.args.max_steps % num_update_steps_per_epoch > 0 ) else: max_steps = math.ceil(self.args.num_train_epochs * num_update_steps_per_epoch) num_train_epochs = math.ceil(self.args.num_train_epochs) else: # see __init__. max_steps is set when the dataset has no __len__ max_steps = self.args.max_steps num_train_epochs = 1 num_update_steps_per_epoch = max_steps if self.args.deepspeed: model, optimizer, lr_scheduler = init_deepspeed(self, num_training_steps=max_steps) self.model = model.module self.model_wrapped = model # will get further wrapped in DDP self.deepspeed = model # DeepSpeedEngine object self.optimizer = optimizer self.lr_scheduler = lr_scheduler else: self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(model_path) model = self.model_wrapped # Mixed precision training with apex (torch < 1.6) if self.use_apex: model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level) # Multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.sharded_dpp: model = ShardedDDP(model, self.optimizer) elif self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=( not getattr(model.config, "gradient_checkpointing", False) if isinstance(model, PreTrainedModel) else True ), ) # find_unused_parameters breaks checkpointing as per # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021 # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model # important: at this point: # self.model is the Transformers Model # self.model_wrapped is DDP(Transformers Model), DDP(Deepspeed(Transformers Model)), etc. # Train! if is_torch_tpu_available(): total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size() else: total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1) ) num_examples = ( self.num_examples(train_dataloader) if train_dataset_is_sized else total_train_batch_size * self.args.max_steps ) logger.info("***** Running training *****") logger.info(f" Num examples = {num_examples}") logger.info(f" Num Epochs = {num_train_epochs}") logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}") logger.info(f" Gradient Accumulation steps = {self.args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {max_steps}") self.state.epoch = 0 start_time = time.time() epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if model_path and os.path.isfile(os.path.join(model_path, "trainer_state.json")): self.state = TrainerState.load_from_json(os.path.join(model_path, "trainer_state.json")) epochs_trained = self.state.global_step // num_update_steps_per_epoch if not self.args.ignore_data_skip: steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) steps_trained_in_current_epoch *= self.args.gradient_accumulation_steps else: steps_trained_in_current_epoch = 0 logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(f" Continuing training from epoch {epochs_trained}") logger.info(f" Continuing training from global step {self.state.global_step}") if not self.args.ignore_data_skip: logger.info( f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " "batches in the first epoch." ) # Update the references self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None self.state.trial_params = hp_params(trial) if trial is not None else None # This should be the same if the state has been saved but in case the training arguments changed, it's safer # to set this after the load. self.state.max_steps = max_steps self.state.num_train_epochs = num_train_epochs self.state.is_local_process_zero = self.is_local_process_zero() self.state.is_world_process_zero = self.is_world_process_zero() # tr_loss is a tensor to avoid synchronization of TPUs through .item() tr_loss = torch.tensor(0.0).to(self.args.device) # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses self._total_loss_scalar = 0.0 self._globalstep_last_logged = 0 self._total_flos = self.state.total_flos model.zero_grad() self.control = self.callback_handler.on_train_begin(self.args, self.state, self.control) # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not self.args.ignore_data_skip: for epoch in range(epochs_trained): # We just need to begin an iteration to create the randomization of the sampler. for _ in train_dataloader: break for epoch in range(epochs_trained, num_train_epochs): if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) epoch_iterator = train_dataloader # Reset the past mems state at the beginning of each epoch if necessary. if self.args.past_index >= 0: self._past = None steps_in_epoch = len(train_dataloader) if train_dataset_is_sized else self.args.max_steps self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control) assert train_dataset_is_sized, "currently we only support sized dataloader!" inputs = None last_inputs = None for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue if (step + 1) % self.args.gradient_accumulation_steps == 0: self.control = self.callback_handler.on_step_begin(self.args, self.state, self.control) if ((step + 1) % self.args.gradient_accumulation_steps != 0) and self.args.local_rank != -1: # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. with model.no_sync(): tr_loss += self.training_step(model, inputs) else: tr_loss += self.training_step(model, inputs) self._total_flos += self.floating_point_ops(inputs) if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps steps_in_epoch <= self.args.gradient_accumulation_steps and (step + 1) == steps_in_epoch ): # Gradient clipping if self.args.max_grad_norm is not None and self.args.max_grad_norm > 0 and not self.deepspeed: # deepspeed does its own clipping if self.use_amp: # AMP: gradients need unscaling self.scaler.unscale_(self.optimizer) if hasattr(self.optimizer, "clip_grad_norm"): # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping self.optimizer.clip_grad_norm(self.args.max_grad_norm) else: # Revert to normal clipping otherwise, handling Apex or full precision torch.nn.utils.clip_grad_norm_( amp.master_params(self.optimizer) if self.use_apex else model.parameters(), self.args.max_grad_norm, ) # Optimizer step if is_torch_tpu_available(): xm.optimizer_step(self.optimizer) elif self.use_amp: self.scaler.step(self.optimizer) self.scaler.update() else: self.optimizer.step() self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 self.state.epoch = epoch + (step + 1) / steps_in_epoch self.control = self.callback_handler.on_step_end(self.args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch) if self.control.should_epoch_stop or self.control.should_training_stop: break self.control = self.callback_handler.on_epoch_end(self.args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch) if self.args.tpu_metrics_debug or self.args.debug: if is_torch_tpu_available(): # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) else: logger.warning( "You enabled PyTorch/XLA debug metrics but you don't have a TPU " "configured. Check your training configuration if this is unexpected." ) if self.control.should_training_stop: break if self.args.past_index and hasattr(self, "_past"): # Clean the state at the end of training delattr(self, "_past") logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if self.args.load_best_model_at_end and self.state.best_model_checkpoint is not None: logger.info( f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." ) if isinstance(self.model, PreTrainedModel): self.model = self.model.from_pretrained(self.state.best_model_checkpoint, model_args=self.model_args) if not self.is_model_parallel: self.model = self.model.to(self.args.device) else: state_dict = torch.load(os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)) self.model.load_state_dict(state_dict) if self.deepspeed: self.deepspeed.load_checkpoint( self.state.best_model_checkpoint, load_optimizer_states=False, load_lr_scheduler_states=False ) metrics = speed_metrics("train", start_time, self.state.max_steps) if self._total_flos is not None: self.store_flos() metrics["total_flos"] = self.state.total_flos self.log(metrics) self.control = self.callback_handler.on_train_end(self.args, self.state, self.control) # add remaining tr_loss self._total_loss_scalar += tr_loss.item() return TrainOutput(self.state.global_step, self._total_loss_scalar / self.state.global_step, metrics)