def epoch_finished(self, epoch_num):
    average_value = self.average_value()
    if self.report_average:
        self.writer.add_scalar(
            tag=self.average_tag,
            scalar_value=average_value,
            global_step=epoch_num + 1,
        )
        print_with_time("%s: %f" % (self.average_tag, average_value))
    self.reset_values()
def load_model(self, model_file: str, skip_modules: Optional[List[str]] = None):
    # Avoid a mutable default argument; treat None as "skip nothing".
    skip_modules = skip_modules if skip_modules is not None else []
    print_with_time("Loading Model: {}".format(model_file))
    input_model_dict = torch.load(model_file, map_location=torch.device("cpu"))
    # Drop every parameter whose top-level module is in skip_modules, then
    # load the remainder non-strictly so the skipped keys are tolerated.
    filtered_model_dict = OrderedDict()
    for key, val in input_model_dict.items():
        if key.split(".")[0] not in skip_modules:
            filtered_model_dict[key] = val
        else:
            print("Skipping: {}".format(key))
    self.model.load_state_dict(filtered_model_dict, strict=False)
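# Usage sketch for load_model (the module name "fc" and the checkpoint path
# are hypothetical): when transferring weights to a model with a different
# classifier head, skip the mismatched module and let it keep its fresh
# initialization.
#
#   experiment.load_model("runs/1/model.pth", skip_modules=["fc"])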
def train_1_epoch(self, epoch_number: int, dataloader: DataLoader):
    print_with_time("Training epoch %d ...." % (epoch_number + 1))
    self.model.train()
    for batch in tqdm(dataloader):
        self.on_start_batch(self.iter_num)
        batch_loss = self.train_1_batch(self.iter_num, batch)
        self.track_training_metrics(batch, batch_loss, self.iter_num)
        self.on_end_batch(self.iter_num)
        self.iter_num += 1
    # Flush the per-epoch averages for metrics that report one.
    for n in self.metric_names_with_average:
        self.metrics[n].epoch_finished(epoch_number)
def main():
    cfg = create_cfg()
    # set_seed(cfg.system.seed)
    change_multiprocess_strategy()
    train_db = make_db(cfg, train=True)
    if cfg.training.overfit:
        test_db = train_db
    else:
        test_db = make_db(cfg, train=False)
    model = make_model(cfg, num_classes=train_db.num_classes)
    loss_weights = make_loss_weights(
        num_classes=train_db.num_classes,
        weights=cfg.loss.class_weight,
    )
    train_evaluator, val_evaluator = make_evaluators(cfg, train_db, test_db, model)
    experiment = make_experiment(
        cfg,
        train_db,
        model,
        loss_weights,
        val_evaluator=val_evaluator,
        train_evaluator=train_evaluator,
    )
    if not cfg.training.only_test:
        if cfg.training.pretrained and cfg.training.resume:
            raise ValueError(
                "training.pretrained and training.resume"
                " flags cannot be True at the same time"
            )
        elif cfg.training.pretrained:
            experiment.init_from_pretrain()
        elif cfg.training.resume:
            experiment.resume()
        experiment.train()
    else:
        experiment.load_model_for_test()
    final_evaluator = make_evaluator_final(cfg, test_db, model)
    final_eval_result = final_evaluator.evaluate()
    print_with_time("Final Evaluation Result ...")
    print(final_eval_result)
    if not cfg.training.only_test:
        print_with_time("Saving final model ...")
        experiment.save()
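# The training-mode flags main() reads (flag names come from the code above,
# values are illustrative):
#
#   training:
#     only_test: false   # train first, then run the final evaluation
#     pretrained: false  # init from training.pretrained_weight
#     resume: false      # mutually exclusive with pretrained
#     overfit: false     # reuse the train split as the test split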
def train(self):
    self._mark_the_run()
    num_epochs = self.cfg.training.num_epochs
    print_with_time(self.cfg.dump())
    print_with_time("Training for run number: {:d}".format(self.run_number))
    epoch_range = range(0, num_epochs)
    train_dataloader = create_train_dataloader(self.cfg, self.dataset)
    self.model.to(self.device)
    self.loss_weights = self.loss_weights.to(self.device)
    self.on_start_training()
    for epoch_num in epoch_range:
        self.epoch_number = epoch_num
        # resetting metrics
        for n, m in self.metrics.items():
            m.reset_values()
        # callback
        self.on_start_epoch(epoch_num)
        # train for 1 epoch
        # with torch.autograd.set_detect_anomaly(True):
        self.train_1_epoch(epoch_num, train_dataloader)
        # save
        if (epoch_num + 1) % self.cfg.training.save_every == 0:
            self.save()
        # end of epoch evaluations
        if self.train_evaluator is not None:
            train_eval_result = self.train_evaluator.evaluate()
            print_with_time("Evaluation result on train set ...")
            print(train_eval_result)
            self.update_epoch_metrics_train_eval(train_eval_result, epoch_num)
        val_eval_result = self.val_evaluator.evaluate()
        print_with_time("Evaluation result on test set ...")
        print(val_eval_result)
        self.update_epoch_metrics_val_eval(val_eval_result, epoch_num)
        if self.scheduler is not None:
            # plateau scheduler: step() needs the monitored validation metric
            if self.scheduler_type_plateau:
                self.scheduler.step(
                    metrics=self._prepare_plateau_scheduler_input(val_eval_result),
                    epoch=epoch_num,
                )
            # step scheduler
            else:
                self.scheduler.step()
        # callback
        self.on_end_epoch(epoch_num)
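# A minimal sketch of the two scheduler flavors the loop above distinguishes,
# assuming standard torch.optim.lr_scheduler classes (the actual construction
# in this codebase may differ; optimizer and val_accuracy are placeholders):
#
#   from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
#
#   # plateau scheduler: step() consumes a validation metric each epoch
#   scheduler = ReduceLROnPlateau(optimizer, mode="max", patience=3)
#   scheduler.step(metrics=val_accuracy)
#
#   # step scheduler: decays on a fixed epoch schedule, no metric needed
#   scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
#   scheduler.step()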
def evaluate(self) -> Dict[str, EvalResult]:
    print_with_time("Evaluating ...")
    self.reset()
    self.model.to(self.device)
    self.model.eval()
    with torch.no_grad():
        for batch in tqdm(self.dataloader):
            self.eval_1_batch(batch)
    result = {"All": self.compute_metrics()}
    if self.ignore_classes != []:
        result["W/O Ignored Classes"] = self.compute_metrics(self.ignore_classes)
    return result
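# Usage sketch: evaluate() returns a dict keyed by scope name,
#
#   results = evaluator.evaluate()
#   print(results["All"])
#   if "W/O Ignored Classes" in results:
#       print(results["W/O Ignored Classes"])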
def save(self):
    epoch_folder = self.run_folder / str(self.epoch_number + 1)
    epoch_folder.mkdir(exist_ok=True, parents=True)
    model_file = epoch_folder / self.model_filename
    optimizer_file = epoch_folder / self.optimizer_filename
    scheduler_file = epoch_folder / self.scheduler_filename
    print_with_time("Saving model ...")
    # Copy tensors to CPU for the checkpoint instead of calling
    # self.model.cpu(), which would move the live model off its training
    # device in the middle of a run.
    torch.save(
        {k: v.cpu() for k, v in self.model.state_dict().items()},
        model_file,
    )
    print_with_time("Saving Optimizer ...")
    torch.save(self.optimizer.state_dict(), optimizer_file)
    if self.scheduler is not None:
        print_with_time("Saving Scheduler ...")
        torch.save(self.scheduler, scheduler_file)
def load_scheduler(self):
    epoch_folder = self.run_folder / str(self.epoch_number)
    scheduler_file = epoch_folder / self.scheduler_filename
    print_with_time("Loading Scheduler: {}".format(scheduler_file))
    # save() serializes the whole scheduler object, so restore it whole.
    self.scheduler = torch.load(scheduler_file)
def resume(self):
    print_with_time("Resuming the experiment...")
    # TODO
    raise NotImplementedError("I am lazy!")
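# A minimal sketch of what resume() could do, assuming the epoch-folder layout
# written by save(); latest_epoch_folder() is a hypothetical helper, and this
# is not the implemented behavior:
#
#   epoch_folder = self.latest_epoch_folder()
#   self.load_model(str(epoch_folder / self.model_filename))
#   self.optimizer.load_state_dict(
#       torch.load(epoch_folder / self.optimizer_filename))
#   self.load_scheduler()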
def init_from_pretrain(self):
    print_with_time("Initializing from pretrained weights...")
    model_file = self.cfg.training.pretrained_weight
    self.load_model(model_file, self.cfg.training.skip_modules)
def on_start_epoch(self, epoch_num: int):
    print_with_time("Epoch {}, LR: {}".format(epoch_num + 1, self.current_lr()))
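# current_lr() is assumed to read the rate from the optimizer; a minimal
# sketch of one common implementation:
#
#   def current_lr(self):
#       return [group["lr"] for group in self.optimizer.param_groups]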