def main(
    cfg,
    output_dir,
    runner=None,
    eval_only=False,
    # NOTE: always enable resume when running on cluster
    resume=True,
):
    setup_after_launch(cfg, output_dir, runner)

    model = runner.build_model(cfg)
    logger.info("Model:\n{}".format(model))

    if eval_only:
        checkpointer = runner.build_checkpointer(cfg, model, save_dir=output_dir)
        # checkpointer.resume_or_load() would skip additional checkpointables
        # (e.g. EMA states), which may not be desired
        if resume and checkpointer.has_checkpoint():
            checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume)
        else:
            checkpoint = checkpointer.load(cfg.MODEL.WEIGHTS)
        train_iter = checkpoint.get("iteration", None)
        model.eval()
        metrics = runner.do_test(cfg, model, train_iter=train_iter)
        print_metrics_table(metrics)
        return {
            "accuracy": metrics,
            "model_configs": {},
            "metrics": metrics,
        }

    model = create_ddp_model(
        model,
        fp16_compression=cfg.MODEL.DDP_FP16_GRAD_COMPRESS,
        device_ids=None if cfg.MODEL.DEVICE == "cpu" else [comm.get_local_rank()],
        broadcast_buffers=False,
        find_unused_parameters=cfg.MODEL.DDP_FIND_UNUSED_PARAMETERS,
    )

    trained_cfgs = runner.do_train(cfg, model, resume=resume)
    metrics = runner.do_test(cfg, model)
    print_metrics_table(metrics)

    # dump config files for trained models
    trained_model_configs = dump_trained_model_configs(cfg.OUTPUT_DIR, trained_cfgs)
    return {
        # for e2e_workflow
        "accuracy": metrics,
        # for unit_workflow
        "model_configs": trained_model_configs,
        "metrics": metrics,
    }
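# --- Hypothetical usage sketch (not part of the original file) ---
# A minimal illustration of how this train/eval entry point might be driven
# directly. The runner class, config accessor, and config path below are
# assumptions about the surrounding d2go-style setup; the real binary normally
# reaches main() through its CLI launcher rather than a direct call.
from d2go.runner import GeneralizedRCNNRunner  # assumed runner implementation

runner = GeneralizedRCNNRunner()
cfg = runner.get_default_cfg()  # assumed d2go-style default-config accessor
cfg.merge_from_file("configs/my_model.yaml")  # hypothetical config file
cfg.OUTPUT_DIR = "./output"

result = main(cfg, output_dir=cfg.OUTPUT_DIR, runner=runner, eval_only=False)
print(result["accuracy"])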
def main(
    cfg,
    output_dir,
    runner,
    # binary specific optional arguments
    predictor_types: typing.List[str],
    device: str = "cpu",
    compare_accuracy: bool = False,
    skip_if_fail: bool = False,
):
    if compare_accuracy:
        raise NotImplementedError(
            "compare_accuracy functionality isn't currently supported."
        )
        # NOTE: dict for metrics of all exported models (and original pytorch model)
        # ret["accuracy_comparison"] = accuracy_comparison

    cfg = copy.deepcopy(cfg)
    setup_after_launch(cfg, output_dir, runner)

    with temp_defrost(cfg):
        cfg.merge_from_list(["MODEL.DEVICE", device])
    model = runner.build_model(cfg, eval_only=True)

    # NOTE: the train dataset is used to avoid leakage, since the data might be used
    # for running calibration for quantization. test_loader is used to make sure it
    # follows the inference behaviour (augmentation will not be applied).
    datasets = list(cfg.DATASETS.TRAIN)
    data_loader = runner.build_detection_test_loader(cfg, datasets)

    logger.info("Running the pytorch model and printing FLOPS ...")
    first_batch = next(iter(data_loader))
    input_args = (first_batch,)
    flops_utils.print_model_flops(model, input_args)

    predictor_paths: typing.Dict[str, str] = {}
    for typ in predictor_types:
        # convert_and_export_predictor might alter the model, copy before calling it
        pytorch_model = copy.deepcopy(model)
        try:
            predictor_path = convert_and_export_predictor(
                cfg,
                pytorch_model,
                typ,
                output_dir,
                data_loader,
            )
            logger.info(f"Predictor type {typ} has been exported to {predictor_path}")
            predictor_paths[typ] = predictor_path
        except Exception as e:
            logger.exception(f"Export {typ} predictor failed: {e}")
            if not skip_if_fail:
                raise e

    ret = {"predictor_paths": predictor_paths, "accuracy_comparison": {}}
    return ret
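# --- Hypothetical usage sketch (not part of the original file) ---
# Exporting the model to one or more predictor formats, assuming cfg and runner
# are built as in the earlier sketch. The "torchscript" predictor type string is
# an assumption; valid values depend on the exporters registered in the code base.
result = main(
    cfg,
    output_dir="./export_output",
    runner=runner,
    predictor_types=["torchscript"],  # assumed registered predictor type
    device="cpu",
    skip_if_fail=True,  # continue with the remaining types if one export fails
)
print(result["predictor_paths"])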
def main(
    cfg: CfgNode,
    output_dir: str,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching training with the lightning trainer.

    Args:
        cfg: D2go config node
        output_dir: output directory used to set up the run
        task_cls: lightning task class used to build the training task
        eval_only: True if run evaluation only.
        num_machines: Number of nodes used for distributed training
        num_processes: Number of processes on each node.
    """
    # FIXME: make comm.get_world_size() work properly.
    setup_after_launch(cfg, output_dir, _scale_world_size=False)
    auto_scale_world_size(cfg, new_world_size=num_machines * num_processes)

    task = task_cls.from_config(cfg, eval_only)
    trainer_params = get_trainer_params(cfg, num_machines, num_processes)

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=trainer_params["logger"].log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
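# --- Hypothetical usage sketch (not part of the original file) ---
# Calling the lightning entry point on a single machine with a single process,
# assuming a D2Go CfgNode has already been loaded elsewhere. GeneralizedRCNNTask
# is the default task class from the signature above.
train_output = main(
    cfg,
    output_dir=cfg.OUTPUT_DIR,
    task_cls=GeneralizedRCNNTask,
    eval_only=False,
    num_machines=1,
    num_processes=1,
)
print(train_output.accuracy)
print(train_output.tensorboard_log_dir)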
def main(
    cfg,
    output_dir,
    runner,
    # binary specific optional arguments
    predictor_path,
    num_threads=None,
    caffe2_engine=None,
    caffe2_logging_print_net_summary=0,
):
    torch.backends.quantized.engine = cfg.QUANTIZATION.BACKEND
    print("run with quantized engine: ", torch.backends.quantized.engine)

    setup_after_launch(cfg, output_dir, runner)
    caffe2_global_init(caffe2_logging_print_net_summary, num_threads)

    predictor = create_predictor(predictor_path)
    metrics = runner.do_test(cfg, predictor)
    print_metrics_table(metrics)

    return {
        "accuracy": metrics,
        "metrics": metrics,
    }
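# --- Hypothetical usage sketch (not part of the original file) ---
# Evaluating a previously exported predictor, assuming cfg and runner are built
# as in the earlier sketch. The predictor_path value is a placeholder for a
# directory produced by convert_and_export_predictor().
result = main(
    cfg,
    output_dir="./eval_output",
    runner=runner,
    predictor_path="./export_output/torchscript",  # placeholder path
    num_threads=4,
)
print(result["metrics"])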
def setup(self, stage: str):
    setup_after_launch(self.cfg, self.cfg.OUTPUT_DIR, runner=None)
def main(
    cfg,
    output_dir,
    runner=None,
    is_train=True,
):
    setup_after_launch(cfg, output_dir, runner)

    if is_train:
        data_loader = runner.build_detection_train_loader(cfg)
    else:
        assert len(cfg.DATASETS.TEST) > 0, cfg.DATASETS.TEST
        data_loader = runner.build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])

    TOTAL_BENCHMARK_TIME = (
        100 if get_launch_environment() == "local" else 600
    )  # benchmark for 100 seconds locally, 10 minutes otherwise
    LOGGING_METER_WINDOW_SIZE = 20
    LOGGING_METER_TIME_INTERVAL = 5
    WARMUP_ITERS = 5

    # initialize
    time_per_iter = HistoryBuffer(max_length=10000)
    total_time = 0

    start = time.time()
    for no, batch in enumerate(data_loader):
        data_time = time.time() - start
        time_per_iter.update(data_time)
        total_time += data_time

        if no == 0:
            logger.info("Show the first batch as example:\n{}".format(batch))

        # Assume batch size is constant
        batch_size = cfg.SOLVER.IMS_PER_BATCH // comm.get_world_size()
        assert len(batch) == batch_size

        median = time_per_iter.median(window_size=LOGGING_METER_WINDOW_SIZE)
        avg = time_per_iter.avg(window_size=LOGGING_METER_WINDOW_SIZE)
        log_every_n_seconds(
            logging.INFO,
            "iter: {};"
            " recent per-iter seconds: {:.4f} (avg) {:.4f} (median);"
            " recent per-image seconds: {:.4f} (avg) {:.4f} (median).".format(
                no,
                avg,
                median,
                avg / batch_size,
                median / batch_size,
            ),
            n=LOGGING_METER_TIME_INTERVAL,
        )

        # Synchronize between processes and exit once all processes have run for long
        # enough. This mimics loss.backward(); the logged time doesn't include the
        # time spent on synchronization.
        finished = comm.all_gather(total_time >= TOTAL_BENCHMARK_TIME)
        if all(finished):
            logger.info("Benchmarking finished after {} seconds".format(total_time))
            break

        start = time.time()

    dataset_name = ":".join(cfg.DATASETS.TRAIN) if is_train else cfg.DATASETS.TEST[0]
    # drop the warmup iterations (guarding against runs shorter than the warmup)
    time_per_iter = [x[0] for x in time_per_iter.values()]
    time_per_iter = time_per_iter[
        min(WARMUP_ITERS, max(len(time_per_iter) - WARMUP_ITERS, 0)) :
    ]

    results = {
        "environment": {
            "num_workers": cfg.DATALOADER.NUM_WORKERS,
            "world_size": comm.get_world_size(),
            "processes_per_machine": get_num_processes_per_machine(),
        },
        "main_processes_stats": {
            "batch_size_per_process": batch_size,
            "per_iter_avg": np.average(time_per_iter),
            "per_iter_p1": np.percentile(time_per_iter, 1, interpolation="nearest"),
            "per_iter_p10": np.percentile(time_per_iter, 10, interpolation="nearest"),
            "per_iter_p50": np.percentile(time_per_iter, 50, interpolation="nearest"),
            "per_iter_p90": np.percentile(time_per_iter, 90, interpolation="nearest"),
            "per_iter_p99": np.percentile(time_per_iter, 99, interpolation="nearest"),
            "per_image_avg": np.average(time_per_iter) / batch_size,
            "per_image_p1": np.percentile(time_per_iter, 1, interpolation="nearest")
            / batch_size,
            "per_image_p10": np.percentile(time_per_iter, 10, interpolation="nearest")
            / batch_size,
            "per_image_p50": np.percentile(time_per_iter, 50, interpolation="nearest")
            / batch_size,
            "per_image_p90": np.percentile(time_per_iter, 90, interpolation="nearest")
            / batch_size,
            "per_image_p99": np.percentile(time_per_iter, 99, interpolation="nearest")
            / batch_size,
        },
        "data_processes_stats": {},  # TODO: add worker stats
    }

    # Metrics follow the hierarchy of: name -> dataset -> task -> metrics -> number
    metrics = {"_name_": {dataset_name: results}}
    print_metrics_table(metrics)

    return {
        "accuracy": metrics,
        "metrics": metrics,
    }
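# --- Hypothetical usage sketch (not part of the original file) ---
# Benchmarking the training data loader, assuming cfg and runner are built as in
# the earlier sketch. The returned dict mirrors the
# name -> dataset -> task -> metrics hierarchy noted in the code above.
result = main(cfg, output_dir="./benchmark_output", runner=runner, is_train=True)
for dataset_name, stats in result["metrics"]["_name_"].items():
    print(dataset_name, stats["main_processes_stats"]["per_iter_avg"])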