def _run_epoch(
    self,
    stage,
    epoch,
    data_iter,
    model,
    metric_reporter,
    pre_batch=lambda: None,
    backprop=lambda loss: None,
    rank=0,
    num_samples_to_log_progress=1000,
):
    """Run a single pass over ``data_iter`` for ``stage``.

    For every ``(inputs, targets, context)`` batch this calls ``pre_batch``,
    hands the context to the model, runs the forward pass, computes the loss
    and passes it to ``backprop``. Batch statistics are fed to
    ``metric_reporter`` unless this is a training pass and
    ``self.config.report_train_metrics`` is falsy.

    Args:
        stage: stage being run; compared against ``Stage.TRAIN``.
        epoch: epoch number, used in log lines and metric reporting.
        data_iter: iterable yielding ``(inputs, targets, context)`` tuples.
        model: model called as ``model(*inputs)``.
        metric_reporter: collects per-batch stats and reports the epoch result.
        pre_batch: zero-argument callable invoked before each batch.
        backprop: callable invoked with the batch loss.
        rank: worker rank; only rank 0 prints progress and reports to channels.
        num_samples_to_log_progress: progress is printed every time the
            number of processed batches is a multiple of this value.

    Returns:
        Whatever ``metric_reporter.report_metric`` returns, or ``None`` when
        metrics are not reported for this pass.
    """
    print(f"Rank {rank} worker: Running epoch #{epoch} for {stage}")
    should_report = stage != Stage.TRAIN or self.config.report_train_metrics
    is_lead_worker = rank == 0

    for batch_id, batch in enumerate(data_iter):
        inputs, targets, context = batch
        pre_batch()

        # Give the model access to the batch context before the forward call.
        model.contextualize(context)

        with time_utils.time("model.forward"):
            logits = model(*inputs)

        with time_utils.time("compute loss"):
            loss = model.get_loss(logits, targets, context)
            if BatchContext.IGNORE_LOSS in context:
                # Zero the loss in place so this batch contributes nothing.
                loss *= 0

        with time_utils.time("backprop"):
            backprop(loss)

        if should_report:
            with time_utils.time("add metrics"):
                preds, scores = model.get_pred(
                    logits, targets, context, stage, *inputs
                )
                metric_reporter.add_batch_stats(
                    batch_id,
                    preds,
                    targets,
                    scores,
                    loss.item(),
                    inputs,
                    **context,
                )

        # The counter below is the number of batches consumed so far.
        if is_lead_worker and (batch_id + 1) % num_samples_to_log_progress == 0:
            print(
                f"Epoch {epoch}: finished training {batch_id + 1} samples.",
                flush=True,
            )

    if not should_report:
        # Nothing will be reported for this pass; reset the reporter instead.
        metric_reporter._reset()
        return None

    with time_utils.time("report metrics"):
        return metric_reporter.report_metric(
            model, stage, epoch, print_to_channels=is_lead_worker
        )
def training_backprop(loss):
    """Back-propagate ``loss`` and apply one optimizer step.

    Relies on ``self``, ``model`` and ``world_size`` from the enclosing scope.
    The order is: backward pass, per-batch LR scheduler step, optional
    gradient clipping, optimizer step.

    Returns:
        The value returned by ``clip_grad_norm_`` when
        ``self.config.max_clip_norm`` is set, otherwise ``None``.
    """
    with time_utils.time("loss.backward"):
        precision_utils.backward(self.optimizer, loss)
        if world_size > 1:
            # DDP fix when some parameters don't receive grads: give every
            # trainable parameter that was left without a gradient a zero one.
            for param in model.parameters():
                if not param.requires_grad or param.grad is not None:
                    continue
                param.backward(torch.zeros_like(param.data))

    scheduler = self.lr_scheduler
    if scheduler:
        scheduler.step_batch()

    grad_norm = None
    clip_threshold = self.config.max_clip_norm
    if clip_threshold is not None:
        grad_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(), clip_threshold
        )

    with time_utils.time("optimizer.step"):
        self.optimizer.step()

    # grad_norm could be used to check grads sync in distributed training
    return grad_norm
def _run_epoch(
    self,
    stage: Stage,
    epoch: int,
    batches,
    model: Model,
    metric_reporter: MetricReporter,
    pre_batch=lambda: None,
    backprop=lambda loss: None,
    rank=0,
    num_samples_to_log_progress: int = None,
):
    """Run one pass over ``batches``, delegating each step to the model.

    Unlike a forward/loss loop, this variant hands each batch to
    ``model.train_batch``, which returns the loss together with the data
    that is forwarded to the metric reporter.

    Args:
        stage: stage being run; compared against ``Stage.TRAIN``.
        epoch: epoch number, used in the log line and metric reporting.
        batches: iterable of batches passed to ``model.train_batch``.
        model: model providing ``train_batch``.
        metric_reporter: collects per-batch stats and reports the epoch result.
        pre_batch: zero-argument callable invoked before each batch.
        backprop: callable invoked with the batch loss.
        rank: worker rank; only rank 0 reports to channels.
        num_samples_to_log_progress: accepted for signature compatibility;
            this implementation does not read it.

    Returns:
        Whatever ``metric_reporter.report_metric`` returns, or ``None`` when
        metrics are not reported for this pass.
    """
    print(f"Rank {rank} worker: Running epoch #{epoch} for {stage}")
    should_report = stage != Stage.TRAIN or self.config.report_train_metrics

    for batch_id, batch in enumerate(batches):
        pre_batch()

        with time_utils.time("model.train_batch"):
            step_result = model.train_batch(batch)
        loss, metric_data = step_result

        with time_utils.time("backprop"):
            backprop(loss)

        if not should_report:
            continue
        with time_utils.time("add metrics"):
            metric_reporter.add_batch_stats(
                batch_id,
                *metric_data,
                **metric_reporter.batch_context(batch),
            )

    if not should_report:
        # Nothing will be reported for this pass; reset the reporter instead.
        metric_reporter._reset()
        return None

    with time_utils.time("report metrics"):
        return metric_reporter.report_metric(
            model, stage, epoch, print_to_channels=(rank == 0)
        )
def train(
    self,
    train_iter: BatchIterator,
    eval_iter: BatchIterator,
    model: Model,
    metric_reporter: MetricReporter,
    train_config: PyTextConfig,
    rank: int = 0,
) -> Tuple[torch.nn.Module, Any]:
    """
    Train and eval a model, the model states will be modified. This function
    iterates epochs specified in config, and for each epoch do:

        1. Train model using training data, aggregate and report training results
        2. Evaluate model using evaluation data (skipped when
           ``self.config.do_eval`` is falsy)
        3. Adjust learning rate if scheduler is specified
        4. Compare the evaluation metric with the best one so far and, when it
           is better, remember it and snapshot the model state

    Training stops early when the projected time of the next epoch exceeds
    ``self.config.target_time_limit_seconds`` (if positive), or when
    ``self.config.early_stop_after`` epochs (if positive) have passed since
    the best epoch.

    Args:
        train_iter (BatchIterator): batch iterator of training data
        eval_iter (BatchIterator): batch iterator of evaluation data
        model (Model): model to be trained
        metric_reporter (MetricReporter): compute metric based on training
            output and report results to console, file.. etc
        train_config (PyTextConfig): training config
        rank (int): only used in distributed training, the rank of the current
            worker. Every rank trains and evaluates; only rank 0 saves module
            checkpoints, snapshots the best model state and restores it at
            the end.

    Returns:
        model, best_metric: the trained model together with the best metric
    """
    with time_utils.time("pre-training"):
        world_size = 1
        if cuda_utils.CUDA_ENABLED:
            model = model.cuda()
            world_size = cuda_utils.DISTRIBUTED_WORLD_SIZE
            if world_size > 1:
                device_id = torch.cuda.current_device()
                model = DistributedModel(
                    module=model,
                    device_ids=[device_id],
                    output_device=device_id,
                    broadcast_buffers=False,
                )

        best_metric = None
        last_best_epoch = 0
        if self.lr_scheduler:
            self.lr_scheduler.prepare(train_iter, self.config.epochs)
        self.optimizer = precision_utils.wrap_optimizer(self.optimizer)

    def training_pre_batch_callback():
        if world_size > 1:
            # replace optimizer.zero_grad() here to work with DDP
            # in cases where some parameters don't receive grads at each step
            # loss.backward will set grad for params in the computation graph
            # we can thus follow which params are left out and call .backward
            # on them manually
            for p in model.parameters():
                if p.grad is not None:
                    p.grad.detach_()
                    p.grad = None
        else:
            self.optimizer.zero_grad()

    def training_backprop(loss):
        with time_utils.time("loss.backward"):
            precision_utils.backward(self.optimizer, loss)
            if world_size > 1:
                # DDP fix when some parameters don't receive grads
                for p in model.parameters():
                    if p.requires_grad and p.grad is None:
                        p.backward(torch.zeros_like(p.data))

        if self.lr_scheduler:
            self.lr_scheduler.step_batch()

        if self.config.max_clip_norm is not None:
            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), self.config.max_clip_norm
            )
        else:
            grad_norm = None

        with time_utils.time("optimizer.step"):
            self.optimizer.step()
        # grad_norm could be used to check grads sync in distributed training
        return grad_norm

    time_start = time.time()
    best_model_state = None
    for epoch in range(1, self.config.epochs + 1):
        sys.stdout.flush()
        if self.config.target_time_limit_seconds > 0 and epoch > 1:
            # Project the finish time of the next epoch from the mean epoch
            # duration so far, and stop before starting it if that would
            # overshoot the time budget.
            time_elapsed = time.time() - time_start
            mean_epoch_time = time_elapsed / float(epoch - 1)
            expected_next_epoch_time = time_elapsed + mean_epoch_time
            if expected_next_epoch_time > self.config.target_time_limit_seconds:
                print(
                    f"Training stopped after {epoch - 1} epochs and "
                    f"{int(time_elapsed)} seconds, due to the target max training "
                    f"time of {self.config.target_time_limit_seconds} seconds."
                )
                break

        print(f"Rank {rank} worker: Starting epoch #{epoch}")
        model.train()
        lrs = (str(lr) for lr in learning_rates(self.optimizer))
        print(f"Learning rate(s): {', '.join(lrs)}")

        with time_utils.time("epoch train"):
            self._run_epoch(
                Stage.TRAIN,
                epoch,
                train_iter,
                model,
                metric_reporter,
                pre_batch=training_pre_batch_callback,
                backprop=training_backprop,
                rank=rank,
                num_samples_to_log_progress=self.config.num_samples_to_log_progress,
            )

        if not self.config.do_eval:
            continue

        with time_utils.time("epoch eval"):
            model.eval(Stage.EVAL)
            with torch.no_grad():
                eval_metric = self._run_epoch(
                    Stage.EVAL,
                    epoch,
                    eval_iter,
                    model,
                    metric_reporter,
                    rank=rank,
                    num_samples_to_log_progress=(
                        self.config.num_samples_to_log_progress
                    ),
                )

        # Step the learning rate scheduler(s)
        if self.lr_scheduler:
            assert eval_metric is not None
            self.lr_scheduler.step_epoch(
                metrics=metric_reporter.get_model_select_metric(eval_metric),
                epoch=epoch,
            )

        # choose best model.
        if metric_reporter.compare_metric(eval_metric, best_metric):
            with time_utils.time("save checkpoint model"):
                last_best_epoch = epoch
                best_metric = eval_metric
                # Only rank = 0 trainer saves modules.
                if train_config.save_module_checkpoints and rank == 0:
                    model.save_modules(
                        base_path=train_config.modules_save_dir,
                        suffix=f"-ep{epoch}",
                    )

                if rank == 0:
                    print(f"Rank {rank} worker: Found a better model!")
                    model_state = model.state_dict()
                    # state_dict() hands back references to the live
                    # parameters and buffers, so every entry must be copied;
                    # otherwise later optimizer steps would silently overwrite
                    # the "best" snapshot. GPU tensors are copied to the CPU
                    # (avoids multiple model copies in GPU memory); tensors
                    # already on the CPU are cloned. The dict returned by
                    # state_dict() is updated in place so any metadata it
                    # carries is kept for load_state_dict().
                    for key, state in model_state.items():
                        model_state[key] = (
                            state.cpu() if state.is_cuda else state.clone()
                        )
                    best_model_state = model_state

        if self.config.early_stop_after > 0 and (
            epoch - last_best_epoch == self.config.early_stop_after
        ):
            print(
                f"Rank {rank} worker: Eval metric hasn't changed for "
                + f"{self.config.early_stop_after} epochs. Stopping now."
            )
            break
        sys.stdout.flush()

    # Restore the best snapshot (rank 0 is the only rank that took one).
    if rank == 0 and best_model_state is not None:
        if cuda_utils.CUDA_ENABLED:
            for key, state in best_model_state.items():
                best_model_state[key] = state.cuda()
        model.load_state_dict(best_model_state)

    return model, best_metric