def evaluate(self, step):
    # accumulate each validation loss term, plus their total under "loss"
    val_losses = {}
    val_losses["loss"] = 0

    print("Evaluating model...")

    with torch.no_grad():
        for data in self.val_loader:
            imgs, targets = data

            imgs = [image.to(self.config.device) for image in imgs]
            targets = [
                {k: v.to(self.config.device) for k, v in t.items()}
                for t in targets
            ]

            if self.config.distributed:
                torch.cuda.synchronize()

            losses = self.img2pose_model.forward(imgs, targets)

            if self.config.distributed:
                # average the loss dict across all processes
                losses = reduce_dict(losses)

            # total loss is the sum of all individual loss terms
            loss = sum(losses.values())

            for loss_name in losses.keys():
                if loss_name in val_losses:
                    val_losses[loss_name] += losses[loss_name].item()
                else:
                    val_losses[loss_name] = losses[loss_name].item()

            val_losses["loss"] += loss.item()

    # log the average of each loss over the validation set (main process only)
    if is_main_process():
        for loss_name in val_losses.keys():
            self.writer.add_scalar(
                f"val_{loss_name}",
                round(val_losses[loss_name] / len(self.val_loader), 6),
                step,
            )

    val_loss = round(val_losses["loss"] / len(self.val_loader), 6)

    # save a checkpoint if this is the best validation loss so far
    self.checkpoint(val_loss, step)

    print(
        f"Current validation loss: {val_loss:.6f} at step {step}"
        f" - Best validation loss: {self.best_val_loss:.6f} at step {self.best_step}"
    )

    self.img2pose_model.train()

    return val_loss
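# The distributed helpers used above (`reduce_dict`, `is_main_process`) are not
# defined in this file. Below is a minimal sketch of what they might look like,
# assuming torch.distributed is initialized; the names and behavior mirror the
# torchvision detection reference utilities, so treat this as illustrative
# rather than the project's actual implementation.

import torch
import torch.distributed as dist


def is_main_process():
    # rank 0 is the main process; a non-distributed run is always "main"
    if not dist.is_available() or not dist.is_initialized():
        return True
    return dist.get_rank() == 0


def reduce_dict(input_dict, average=True):
    # sum (and optionally average) each tensor in the dict across all processes,
    # so every rank logs the same loss values
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        # sort keys so every process stacks tensors in the same order
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[k] for k in names])
        dist.all_reduce(values)
        if average:
            values /= world_size
        return {k: v for k, v in zip(names, values)}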
def run(self):
    self.img2pose_model.train()

    # accumulate running losses to log into tensorboard
    running_losses = {}
    running_losses["loss"] = 0

    step = 0

    # track the best step and loss, printed every time validation runs
    self.best_step = 0
    self.best_val_loss = float("Inf")

    for epoch in range(self.config.epochs):
        train_logger = TrainLogger(self.config.batch_size, self.config.frequency_log)

        for idx, data in enumerate(self.train_loader):
            imgs, targets = data

            imgs = [image.to(self.config.device) for image in imgs]
            targets = [
                {k: v.to(self.config.device) for k, v in t.items()}
                for t in targets
            ]

            self.optimizer.zero_grad()

            # forward pass; total loss is the sum of all individual loss terms
            losses = self.img2pose_model.forward(imgs, targets)
            loss = sum(losses.values())

            # backward propagation through the network, with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                self.img2pose_model.fpn_model.parameters(), 10
            )
            self.optimizer.step()

            if self.config.distributed:
                # average the loss dict across processes for logging
                losses = reduce_dict(losses)
                loss = sum(losses.values())

            for loss_name in losses.keys():
                if loss_name in running_losses:
                    running_losses[loss_name] += losses[loss_name].item()
                else:
                    running_losses[loss_name] = losses[loss_name].item()

            running_losses["loss"] += loss.item()

            # periodically flush the running losses into tensorboard
            if step % self.tensorboard_loss_every == 0 and step != 0:
                for loss_name in running_losses.keys():
                    self.writer.add_scalar(
                        f"train_{loss_name}",
                        running_losses[loss_name] / self.tensorboard_loss_every,
                        step,
                    )
                    running_losses[loss_name] = 0

            train_logger(epoch, self.config.epochs, idx, len(self.train_loader), loss.item())
            step += 1

        # evaluate model using the validation set (if one was given)
        if self.config.val_source is not None:
            val_loss = self.evaluate(step)
        else:
            # otherwise just save the model
            save_model(
                self.img2pose_model.fpn_model_without_ddp,
                self.optimizer,
                self.config,
                step=step,
            )

        # if validation loss stops decreasing, decrease the lr
        if self.config.lr_plateau and self.config.val_source is not None:
            self.scheduler.step(val_loss)

        # early stop the model to prevent overfitting
        if self.config.early_stop and self.config.val_source is not None:
            self.early_stop(val_loss)
            if self.early_stop.stop:
                print("Early stopping model...")
                break

    # final evaluation after training finishes
    if self.config.val_source is not None:
        val_loss = self.evaluate(step)
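# `self.early_stop` above is a callable helper exposing a `.stop` flag; its
# implementation is not shown in this file. A minimal sketch, assuming a simple
# patience counter on the validation loss (the `patience` and `delta`
# parameters are illustrative assumptions, not taken from this codebase):


class EarlyStop:
    def __init__(self, patience=5, delta=0.0):
        self.patience = patience      # validations to wait without improvement
        self.delta = delta            # minimum decrease that counts as improvement
        self.best_loss = float("inf")
        self.counter = 0
        self.stop = False

    def __call__(self, val_loss):
        # reset the counter on improvement, otherwise count toward stopping
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.stop = True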