def build_best_trackers(self):
    self.best_trackers = {}
    for name in ["loss"] + self.metrics_names:
        self.best_trackers[name] = stat_tracker.BestPerf(
            best_perf=None
            if not hasattr(self.conf, "best_perf")
            else self.conf.best_perf,
            larger_is_better=True if "loss" not in name else False,
        )
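# NOTE: `stat_tracker.BestPerf` is only used through its constructor in this
# file (`best_perf`, `larger_is_better`). The class below is a minimal,
# hypothetical sketch of what such a tracker could look like, written for
# illustration only; the `update` method and `is_best` flag are assumptions,
# not the repo's actual API.
class _BestPerfSketch(object):
    def __init__(self, best_perf=None, larger_is_better=True):
        self.best_perf = best_perf
        self.larger_is_better = larger_is_better
        self.is_best = False

    def update(self, perf):
        # mark the new value as the best if it improves on the stored one
        # (larger is better for accuracy-like metrics, smaller for losses).
        improved = (
            self.best_perf is None
            or (self.larger_is_better and perf > self.best_perf)
            or (not self.larger_is_better and perf < self.best_perf)
        )
        if improved:
            self.best_perf = perf
        self.is_best = improved
        return improved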
def main(conf):
    try:
        init_distributed_world(conf, backend=conf.backend)
        conf.distributed = True and conf.n_mpi_process > 1
    except AttributeError as e:
        print(f"failed to init the distributed world: {e}.")
        conf.distributed = False

    # init the config.
    init_config(conf)

    # define the timer for different operations.
    # if we choose the `train_fast` mode, then we will not track the time.
    conf.timer = Timer(
        verbosity_level=1 if conf.track_time and not conf.train_fast else 0,
        log_fn=conf.logger.log_metric,
        on_cuda=conf.on_cuda,
    )

    # create dataset.
    data_loader = create_dataset.define_dataset(conf, force_shuffle=True)

    # create model.
    model = create_model.define_model(conf, data_loader=data_loader)

    # define the optimizer.
    optimizer = create_optimizer.define_optimizer(conf, model)

    # define the lr scheduler.
    scheduler = create_scheduler.Scheduler(conf, optimizer)

    # add model with data-parallel wrapper.
    if conf.graph.on_cuda:
        if conf.n_sub_process > 1:
            model = torch.nn.DataParallel(model, device_ids=conf.graph.device)

    # (optional) reload checkpoint.
    try:
        checkpoint.maybe_resume_from_checkpoint(conf, model, optimizer, scheduler)
    except RuntimeError as e:
        conf.logger.log(f"Resume Error: {e}")
        conf.resumed = False

    # train and evaluate model.
    if "rnn_lm" in conf.arch:
        from pcode.distributed_running_nlp import train_and_validate

        # safety check.
        assert (
            conf.n_sub_process == 1
        ), "our current data-parallel wrapper does not support RNN."

        # define the criterion and metrics.
        criterion = nn.CrossEntropyLoss(reduction="mean")
        criterion = criterion.cuda() if conf.graph.on_cuda else criterion
        metrics = create_metrics.Metrics(
            model.module if "DataParallel" == model.__class__.__name__ else model,
            task="language_modeling",
        )

        # define the best_perf tracker, either empty or from the checkpoint.
        best_tracker = stat_tracker.BestPerf(
            best_perf=None if "best_perf" not in conf else conf.best_perf,
            larger_is_better=False,
        )
        scheduler.set_best_tracker(best_tracker)

        # get train_and_validate_func.
        train_and_validate_fn = train_and_validate
    else:
        from pcode.distributed_running_cv import train_and_validate

        # define the criterion and metrics.
        criterion = nn.CrossEntropyLoss(reduction="mean")
        criterion = criterion.cuda() if conf.graph.on_cuda else criterion
        metrics = create_metrics.Metrics(
            model.module if "DataParallel" == model.__class__.__name__ else model,
            task="classification",
        )

        # define the best_perf tracker, either empty or from the checkpoint.
        best_tracker = stat_tracker.BestPerf(
            best_perf=None if "best_perf" not in conf else conf.best_perf,
            larger_is_better=True,
        )
        scheduler.set_best_tracker(best_tracker)

        # get train_and_validate_func.
        train_and_validate_fn = train_and_validate

    # save arguments to disk.
    checkpoint.save_arguments(conf)

    # start training.
    train_and_validate_fn(
        conf,
        model=model,
        criterion=criterion,
        scheduler=scheduler,
        optimizer=optimizer,
        metrics=metrics,
        data_loader=data_loader,
    )
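if __name__ == "__main__":
    # Hedged entry-point sketch: `main(conf)` expects a fully populated `conf`
    # namespace (backend, arch, graph, logger, ...). The `parameters.get_args`
    # import below is an assumption about where that argument parser lives;
    # adapt it to the repo's actual argument-parsing module.
    from parameters import get_args

    conf = get_args()
    main(conf)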