Example No. 1
    def epoch_finished(self, epoch_num):
        # log the epoch's running average to TensorBoard, then clear
        # the accumulated values for the next epoch
        average_value = self.average_value()
        if self.report_average:
            self.writer.add_scalar(
                tag=self.average_tag,
                scalar_value=average_value,
                global_step=epoch_num + 1,
            )
            print_with_time("%s: %f" % (self.average_tag, average_value))
        self.reset_values()
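For context, a minimal sketch of the metric class this method could belong to; only the names used above (writer, average_tag, report_average, average_value, reset_values) come from the example, the rest is an assumption.

from torch.utils.tensorboard import SummaryWriter

class AverageMetric:
    def __init__(self, writer: SummaryWriter, tag: str,
                 report_average: bool = True):
        self.writer = writer
        self.average_tag = tag
        self.report_average = report_average
        self.values = []

    def add_value(self, value: float):
        # accumulate one per-batch measurement
        self.values.append(value)

    def average_value(self) -> float:
        # guard against division by zero on an empty epoch
        return sum(self.values) / max(len(self.values), 1)

    def reset_values(self):
        self.values = []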
Example No. 2
    def load_model(self, model_file: str,
                   skip_modules: Optional[List[str]] = None):
        # avoid a mutable default argument; None means "skip nothing"
        skip_modules = skip_modules or []
        print_with_time("Loading Model: {}".format(model_file))
        input_model_dict = torch.load(model_file,
                                      map_location=torch.device("cpu"))
        filtered_model_dict = OrderedDict()
        for key, val in input_model_dict.items():
            if key.split(".")[0] not in skip_modules:
                filtered_model_dict[key] = val
            else:
                print("Skipping: {}".format(key))

        self.model.load_state_dict(filtered_model_dict, strict=False)
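A hypothetical usage, e.g. when fine-tuning on a dataset with a different label set; the trainer object, path, and module name are illustrative, not from the source.

# drop the old classification head; strict=False tolerates the gap
trainer.load_model("runs/3/model.pth", skip_modules=["classifier"])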
Example No. 3
    def train_1_epoch(self, epoch_number: int, dataloader: DataLoader):
        print_with_time("Training epoch %d ...." % (epoch_number + 1))
        self.model.train()
        for batch in tqdm(dataloader):
            self.on_start_batch(self.iter_num)

            batch_loss = self.train_1_batch(self.iter_num, batch)
            self.track_training_metrics(batch, batch_loss, self.iter_num)

            self.on_end_batch(self.iter_num)
            self.iter_num += 1

        for n in self.metric_names_with_average:
            self.metrics[n].epoch_finished(epoch_number)
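The loop delegates the actual optimization step to train_1_batch, which these examples do not show; a plausible sketch under assumed attribute names (loss_function is a guess for the criterion).

    def train_1_batch(self, iter_num: int, batch) -> float:
        inputs, targets = batch
        inputs = inputs.to(self.device)
        targets = targets.to(self.device)

        self.optimizer.zero_grad()
        outputs = self.model(inputs)
        loss = self.loss_function(outputs, targets)  # assumed criterion
        loss.backward()
        self.optimizer.step()
        return loss.item()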
Example No. 4
def main():
    cfg = create_cfg()
    # set_seed(cfg.system.seed)
    change_multiprocess_strategy()

    train_db = make_db(cfg, train=True)

    if cfg.training.overfit:
        test_db = train_db
    else:
        test_db = make_db(cfg, train=False)
    model = make_model(
        cfg,
        num_classes=train_db.num_classes,
    )
    loss_weights = make_loss_weights(num_classes=train_db.num_classes,
                                     weights=cfg.loss.class_weight)
    train_evaluator, val_evaluator = make_evaluators(cfg, train_db, test_db,
                                                     model)
    experiment = make_experiment(
        cfg,
        train_db,
        model,
        loss_weights,
        val_evaluator=val_evaluator,
        train_evaluator=train_evaluator,
    )

    if not cfg.training.only_test:
        if cfg.training.pretrained and cfg.training.resume:
            raise ValueError("training.pretrained and training.resume"
                             " flags cannot be True at the same time")
        elif cfg.training.pretrained:
            experiment.init_from_pretrain()
        elif cfg.training.resume:
            experiment.resume()
        experiment.train()
    else:
        experiment.load_model_for_test()

    final_evaluator = make_evaluator_final(cfg, test_db, model)
    final_eval_result = final_evaluator.evaluate()

    print_with_time("Final Evaluation Result ...")
    print(final_eval_result)
    if not cfg.training.only_test:
        print_with_time("Saving final model ...")
        experiment.save()
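main() reads a nested config; the cfg.dump() call in the train() example suggests a yacs-style CfgNode. A sketch of the fields referenced above, with every default value assumed:

from yacs.config import CfgNode as CN

cfg = CN()
cfg.system = CN()
cfg.system.seed = 42
cfg.training = CN()
cfg.training.overfit = False
cfg.training.only_test = False
cfg.training.pretrained = False
cfg.training.resume = False
cfg.training.num_epochs = 100
cfg.training.save_every = 10
cfg.loss = CN()
cfg.loss.class_weight = []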
Example No. 5
    def train(self):
        self._mark_the_run()
        num_epochs = self.cfg.training.num_epochs
        print_with_time(self.cfg.dump())
        print_with_time("Training for run number: {:d}".format(
            self.run_number))
        epoch_range = range(0, num_epochs)
        train_dataloader = create_train_dataloader(self.cfg, self.dataset)
        self.model.to(self.device)
        self.loss_weights = self.loss_weights.to(self.device)

        self.on_start_training()
        for epoch_num in epoch_range:
            self.epoch_number = epoch_num

            # resetting metrics
            for n, m in self.metrics.items():
                m.reset_values()

            # callback
            self.on_start_epoch(epoch_num)

            # train for 1 epoch
            # with torch.autograd.set_detect_anomaly(True):
            self.train_1_epoch(epoch_num, train_dataloader)

            # save
            if (epoch_num + 1) % self.cfg.training.save_every == 0:
                self.save()

            # end of epoch evaluations
            if self.train_evaluator is not None:
                train_eval_result = self.train_evaluator.evaluate()
                print_with_time("Evaluation result on train set ...")
                print(train_eval_result)
                self.update_epoch_metrics_train_eval(train_eval_result,
                                                     epoch_num)
            val_eval_result = self.val_evaluator.evaluate()
            print_with_time("Evaluation result on test set ...")
            print(val_eval_result)
            self.update_epoch_metrics_val_eval(val_eval_result, epoch_num)

            if self.scheduler is not None:
                # plateau scheduler
                if self.scheduler_type_plateau:
                    self.scheduler.step(
                        metrics=self._prepare_plateau_scheduler_input(
                            val_eval_result),
                        epoch=epoch_num,
                    )
                # step scheduler
                else:
                    self.scheduler.step()

            # callback
            self.on_end_epoch(epoch_num)
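The two scheduler branches exist because PyTorch's ReduceLROnPlateau expects the monitored metric in step(), while step-style schedulers advance on their own; a minimal runnable sketch of the distinction (model, optimizer, and hyperparameters are placeholders):

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# plateau scheduler: step() consumes the monitored metric
plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", patience=3)
plateau.step(0.42)  # e.g. a validation loss

# step scheduler: step() takes no metric
step = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)
step.step()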
Example No. 6
    def evaluate(self) -> Dict[str, EvalResult]:
        print_with_time("Evaluating ...")
        self.reset()
        self.model.to(self.device)
        self.model.eval()

        with torch.no_grad():
            for batch in tqdm(self.dataloader):
                self.eval_1_batch(batch)

        result = {"All": self.compute_metrics()}

        if self.ignore_classes:
            result["W/O Ignored Classes"] = self.compute_metrics(
                self.ignore_classes)

        return result
Example No. 7
    def save(self):
        epoch_folder = self.run_folder / str(self.epoch_number + 1)
        epoch_folder.mkdir(exist_ok=True, parents=True)

        model_file = epoch_folder / self.model_filename
        optimizer_file = epoch_folder / self.optimizer_filename
        scheduler_file = epoch_folder / self.scheduler_filename

        print_with_time("Saving model ...")
        torch.save(self.model.cpu().state_dict(), model_file)
        print_with_time("Saving Optimizer ...")
        torch.save(self.optimizer.state_dict(), optimizer_file)
        if self.scheduler is not None:
            print_with_time("Saving Scheduler ...")
            torch.save(self.scheduler, scheduler_file)
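Saving the scheduler object whole works, but pickling it couples the checkpoint to the class definition; serializing its state_dict is the more portable pattern. A sketch reusing the names above:

torch.save(self.scheduler.state_dict(), scheduler_file)
# later, after re-creating the same scheduler type:
self.scheduler.load_state_dict(torch.load(scheduler_file))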
Example No. 8
    def load_scheduler(self):
        epoch_folder = self.run_folder / str(self.epoch_number)
        scheduler_file = epoch_folder / self.scheduler_filename
        print_with_time("Loading Scheduler: {}".format(scheduler_file))
        # save() stores the scheduler object whole, so load it back whole
        self.scheduler = torch.load(scheduler_file)
Example No. 9
    def resume(self):
        print_with_time("Resuming the experiment...")
        # TODO
        raise NotImplementedError("I am lazy!")
Example No. 10
    def init_from_pretrain(self):
        print_with_time("Initializing from pretrained weights...")
        model_file = self.cfg.training.pretrained_weight
        self.load_model(model_file, self.cfg.training.skip_modules)
Example No. 11
    def on_start_epoch(self, epoch_num: int):
        print_with_time("Epoch {}, LR: {}".format(epoch_num + 1,
                                                  self.current_lr()))
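current_lr() is not among these examples; a common implementation, assumed here, reads the rate off the optimizer's first parameter group.

    def current_lr(self) -> float:
        # assumes all param groups share one learning rate
        return self.optimizer.param_groups[0]["lr"]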