Example #1
def main(args, config, build_model):
    config.merge_from_list(args.opts)
    cfg = default_setup(config, args)
    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop or subclassing the runner.
    """
    runner = runner_decrator(RUNNERS.get(cfg.TRAINER.NAME))(cfg, build_model)
    runner.resume_or_load(resume=args.resume)

    extra_hooks = []
    if args.clearml:
        from cvpods.engine.clearml import ClearMLHook
        if comm.is_main_process():
            extra_hooks.append(ClearMLHook())
    if cfg.TEST.AUG.ENABLED:
        extra_hooks.append(
            hooks.EvalHook(0, lambda: runner.test_with_TTA(cfg, runner.model)))
    if extra_hooks:
        runner.register_hooks(extra_hooks)

    logger.info("Running with full config:\n{}".format(cfg))
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.diff(base_config)))

    runner.train()

    if comm.is_main_process() and cfg.MODEL.AS_PRETRAIN:
        # convert last ckpt to pretrain format
        convert_to_pretrained_model(input=os.path.join(cfg.OUTPUT_DIR,
                                                       "model_final.pth"),
                                    save_path=os.path.join(
                                        cfg.OUTPUT_DIR,
                                        "model_final_pretrain_weight.pkl"))
Example #2
    def build_hooks(self):
        """
        Build a list of default hooks, including timing, evaluation,
        checkpointing, lr scheduling, precise BN, writing events.

        Returns:
            list[HookBase]:
        """
        cfg = self.cfg
        # cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN

        ret = [
            hooks.OptimizationHook(
                accumulate_grad_steps=cfg.SOLVER.BATCH_SUBDIVISIONS,
                grad_clipper=None,
                mixed_precision=cfg.TRAINER.FP16.ENABLED),
            hooks.LRScheduler(self.optimizer, self.scheduler),
            hooks.IterationTimer(),
            hooks.PreciseBN(
                # Run at the same freq as (but before) evaluation.
                cfg.TEST.EVAL_PERIOD,
                self.model,
                # Build a new data loader to not affect training
                self.build_train_loader(cfg),
                cfg.TEST.PRECISE_BN.NUM_ITER,
            ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
            else None,
        ]

        # Do PreciseBN before the checkpointer, because it updates the model and
        # the result needs to be saved by the checkpointer.
        # This is not always the best: if checkpointing has a different frequency,
        # some checkpoints may have more precise statistics than others.
        if comm.is_main_process():
            ret.append(
                hooks.PeriodicCheckpointer(self.checkpointer,
                                           cfg.SOLVER.CHECKPOINT_PERIOD,
                                           max_iter=self.max_iter,
                                           max_epoch=self.max_epoch))

        def test_and_save_results():
            self._last_eval_results = self.test(self.cfg, self.model)
            return self._last_eval_results

        # Do evaluation after checkpointer, because then if it fails,
        # we can use the saved checkpoint to debug.
        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

        if comm.is_main_process():
            # Here the default print/log frequency of each writer is used.
            # run writers in the end, so that evaluation metrics are written
            ret.append(
                hooks.PeriodicWriter(self.build_writers(),
                                     period=self.cfg.GLOBAL.LOG_INTERVAL))
            # Put `PeriodicDumpLog` after writers so that can dump all the files,
            # including the files generated by writers

        return ret
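Note that `ret` can contain a `None` entry when PreciseBN is disabled. A defensive registration sketch for the consuming side (assuming `register_hooks` does not already filter, which detectron2's implementation does):

hooks_list = self.build_hooks()
self.register_hooks([h for h in hooks_list if h is not None])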
Example #3
def stage_main(args, cfg, build):
    cfg.merge_from_list(args.opts)
    cfg, logger = default_setup(cfg, args)
    model_build_func = build
    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop or subclassing the trainer.
    """
    trainer = Trainer(cfg, model_build_func)
    trainer.resume_or_load(resume=args.resume)

    if args.eval_only:
        DefaultCheckpointer(trainer.model,
                            save_dir=cfg.OUTPUT_DIR,
                            resume=args.resume).resume_or_load(
                                cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, trainer.model)
        if comm.is_main_process():
            verify_results(cfg, res)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, trainer.model))
        return res

    # check whether the workspace has enough storage space;
    # assume that a single dumped model is about 700 MB
    file_sys = os.statvfs(cfg.OUTPUT_DIR)
    free_space_Gb = (file_sys.f_bfree * file_sys.f_frsize) / 2**30
    eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER //
                     cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
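    # e.g. MAX_ITER = 90000, CHECKPOINT_PERIOD = 5000 -> 18 checkpoints,
    # 18 * 700 MB / 1024 ~= 12.3 GB required (illustrative numbers)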
    if eval_space_Gb > free_space_Gb:
        logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
                       f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")

    if cfg.TEST.AUG.ENABLED:
        trainer.register_hooks([
            hooks.EvalHook(0,
                           lambda: trainer.test_with_TTA(cfg, trainer.model))
        ])

    trainer.train()

    if comm.is_main_process() and cfg.MODEL.AS_PRETRAIN:
        # convert last ckpt to pretrain format
        convert_to_pretrained_model(input=os.path.join(cfg.OUTPUT_DIR,
                                                       "model_final.pth"),
                                    save_path=os.path.join(
                                        cfg.OUTPUT_DIR,
                                        "model_final_pretrain_weight.pkl"))
Example #4
    def build_hooks(self):
        """
        Build a list of default hooks, including timing, evaluation,
        checkpointing, lr scheduling, precise BN, writing events.

        Returns:
            list[HookBase]:
        """
        cfg = self.cfg
        cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN

        ret = [
            hooks.IterationTimer(),
            hooks.LRScheduler(self.optimizer, self.scheduler),
            hooks.PreciseBN(
                # Run at the same freq as (but before) evaluation.
                cfg.TEST.EVAL_PERIOD,
                self.model,
                # Build a new data loader to not affect training
                self.build_train_loader(cfg),
                cfg.TEST.PRECISE_BN.NUM_ITER,
            ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
            else None,
        ]

        # Do PreciseBN before the checkpointer, because it updates the model and
        # the result needs to be saved by the checkpointer.
        # This is not always the best: if checkpointing has a different frequency,
        # some checkpoints may have more precise statistics than others.
        if comm.is_main_process():
            ret.append(
                hooks.PeriodicCheckpointer(self.checkpointer,
                                           cfg.SOLVER.CHECKPOINT_PERIOD))

        def test_and_save_results():
            self._last_eval_results = self.test(self.cfg, self.model)
            return self._last_eval_results

        # Do evaluation after checkpointer, because then if it fails,
        # we can use the saved checkpoint to debug.
        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

        if comm.is_main_process():
            # run writers in the end, so that evaluation metrics are written
            ret.append(
                hooks.PeriodicWriter(self.build_writers(),
                                     period=self.cfg.GLOBAL.LOG_INTERVAL))
        return ret
Example #5
    def _write_metrics(self, metrics_dict: dict):
        """
        Args:
            metrics_dict (dict): dict of scalar metrics
        """
        metrics_dict = {
            k: v.detach().cpu().item()
            if isinstance(v, torch.Tensor) else float(v)
            for k, v in metrics_dict.items()
        }
        # gather metrics among all workers for logging
        # This assumes we do DDP-style training, which is currently the only
        # supported method in cvpods.
        all_metrics_dict = comm.gather(metrics_dict)

        if comm.is_main_process():
            if "data_time" in all_metrics_dict[0]:
                # data_time among workers can have high variance. The actual latency
                # caused by data_time is the maximum among workers.
                data_time = np.max(
                    [x.pop("data_time") for x in all_metrics_dict])
                self.storage.put_scalar("data_time", data_time)

            # average the rest metrics
            metrics_dict = {
                k: np.mean([x[k] for x in all_metrics_dict])
                for k in all_metrics_dict[0].keys()
            }
            total_losses_reduced = sum(loss
                                       for key, loss in metrics_dict.items()
                                       if "loss" in key)

            self.storage.put_scalar("total_loss", total_losses_reduced)
            if len(metrics_dict) > 1:
                self.storage.put_scalars(**metrics_dict)
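To make the reduction above concrete, a tiny self-contained sketch (plain NumPy, no cvpods; the values are made up): `data_time` takes the max across workers, every other metric the mean.

import numpy as np

# pretend comm.gather() returned one metrics dict per worker
all_metrics_dict = [
    {"data_time": 0.04, "loss_cls": 0.81, "loss_box_reg": 0.35},
    {"data_time": 0.11, "loss_cls": 0.79, "loss_box_reg": 0.33},
]
data_time = np.max([x.pop("data_time") for x in all_metrics_dict])   # 0.11
metrics = {k: np.mean([x[k] for x in all_metrics_dict])
           for k in all_metrics_dict[0]}                             # per-key means
total_loss = sum(v for k, v in metrics.items() if "loss" in k)       # 0.80 + 0.34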
Example #6
    def __init__(self,
                 model,
                 save_dir="",
                 resume=False,
                 *,
                 save_to_disk=None,
                 **checkpointables):
        """
        Args:
            model (nn.Module): model.
            save_dir (str): a directory to save and find checkpoints.
            resume (bool): whether to resume from the latest checkpoint or start
                from scratch.
            save_to_disk (bool): if True, save checkpoints to disk; otherwise
                disable saving for this checkpointer.
            checkpointables (object): any checkpointable objects, i.e., objects
                that have the `state_dict()` and `load_state_dict()` methods. For
                example, it can be used like
                `Checkpointer(model, "dir", optimizer=optimizer)`.
        """
        is_main_process = comm.is_main_process()
        super().__init__(
            model,
            save_dir,
            resume,
            save_to_disk=is_main_process
            if save_to_disk is None else save_to_disk,
            **checkpointables,
        )
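Typical construction, mirroring the docstring above (a sketch; `model`, `cfg`, `optimizer`, and `scheduler` are assumed to exist):

checkpointer = DefaultCheckpointer(
    model,
    save_dir=cfg.OUTPUT_DIR,
    optimizer=optimizer,    # anything with state_dict()/load_state_dict()
    scheduler=scheduler,
)
checkpointer.save("model_final")  # only the main process writes to disk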
Example #7
    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            self._predictions = comm.gather(self._predictions, dst=0)
            self._predictions = list(itertools.chain(*self._predictions))

            if not comm.is_main_process():
                return {}

        if len(self._predictions) == 0:
            logger.warning(
                "[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            ensure_dir(self._output_dir)
            file_path = os.path.join(self._output_dir,
                                     "instances_predictions.pth")
            with megfile.smart_open(file_path, "wb") as f:
                torch.save(self._predictions, f)

        self._results = OrderedDict()
        if "instances" in self._predictions[0]:
            self._eval_predictions(set(self._tasks))

        if self._dump:
            _dump_to_markdown(self._dump_infos)

        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)
Example #8
def main(args):
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)
    if args.debug:
        batches = int(cfg.SOLVER.IMS_PER_DEVICE * args.num_gpus)
        if cfg.SOLVER.IMS_PER_BATCH != batches:
            cfg.SOLVER.IMS_PER_BATCH = batches
            logger.warning(
                "SOLVER.IMS_PER_BATCH is changed to {}".format(batches))

    valid_files = get_valid_files(args, cfg, logger)
    # evaluate each matched checkpoint file in turn
    for current_file in valid_files:
        cfg.MODEL.WEIGHTS = current_file
        model = build_model(cfg)

        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        if cfg.TEST.AUG.ENABLED:
            res = Trainer.test_with_TTA(cfg, model)
        else:
            res = Trainer.test(cfg, model)

        if comm.is_main_process():
            verify_results(cfg, res)
Example #9
    def evaluate_files(self):
        """
        Only evaluate files without inference
        """
        if self._distributed:
            comm.synchronize()

            if not comm.is_main_process():
                return

        del self._predictions

        if self._output_dir:
            file_path = os.path.join(self._output_dir,
                                     "instances_predictions.pth")
            self._predictions = torch.load(file_path)
            logger.info("Read predictions from {}".format(file_path))
        else:
            logger.warning(
                "Stored predictions is None, you need to run the inference_on_dataset"
            )
            raise NotImplementedError

        self._results = OrderedDict()
        if "proposals" in self._predictions[0]:
            self._eval_box_proposals()
        if "instances" in self._predictions[0]:
            self._eval_predictions(set(self._tasks))

        if self._dump:
            _dump_to_markdown(self._dump_infos)

        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)
Example #10
    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            self._predictions = comm.gather(self._predictions, dst=0)
            self._predictions = list(itertools.chain(*self._predictions))

            self._targets = comm.gather(self._targets, dst=0)
            self._targets = list(itertools.chain(*self._targets))

            if not comm.is_main_process():
                return {}

        if len(self._predictions) == 0:
            self._logger.warning("[ClassificationEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(self._predictions, f)

        self._results = OrderedDict()
        assert len(self._predictions) == len(self._targets)
        if self._predictions[0] is not None:
            self._eval_classification_accuracy()

        if self._dump:
            _dump_to_markdown(self._dump_infos)

        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)
Example #11
    def evaluate(self):
        """
        Returns:
            dict: has a key "segm", whose value is a dict of "AP", "AP50", and "AP75".
        """
        all_predictions = comm.gather(self._predictions, dst=0)
        if not comm.is_main_process():
            return
        predictions = defaultdict(list)
        for predictions_per_rank in all_predictions:
            for clsid, lines in predictions_per_rank.items():
                predictions[clsid].extend(lines)
        del all_predictions

        self._logger.info(
            "Evaluating {} using {} metric. "
            "Note that results do not use the official Matlab API.".format(
                self._dataset_name, 2007 if self._is_2007 else 2012))

        with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
            res_file_template = os.path.join(dirname, "{}.txt")

            aps = defaultdict(list)  # iou -> ap per class
            for cls_id, cls_name in enumerate(self._class_names):
                lines = predictions.get(cls_id, [""])

                with open(res_file_template.format(cls_name), "w") as f:
                    f.write("\n".join(lines))

                for thresh in range(50, 100, 5):
                    rec, prec, ap = voc_eval(
                        res_file_template,
                        self._anno_file_template,
                        self._image_set_path,
                        cls_name,
                        ovthresh=thresh / 100.0,
                        use_07_metric=self._is_2007,
                    )
                    aps[thresh].append(ap * 100)

        ret = OrderedDict()
        mAP = {iou: np.mean(x) for iou, x in aps.items()}
        ret["bbox"] = {
            "AP": np.mean(list(mAP.values())),
            "AP50": mAP[50],
            "AP75": mAP[75]
        }

        small_table = create_small_table(ret["bbox"])
        self._logger.info("Evaluation results for bbox: \n" + small_table)

        if self._dump:
            dump_info_one_task = {
                "task": "bbox",
                "tables": [small_table],
            }
            _dump_to_markdown([dump_info_one_task])
        return ret
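The final reduction averages per-class APs within each IoU threshold, then averages across thresholds. A toy illustration with made-up numbers (the real code sweeps thresholds 50 to 95):

import numpy as np
from collections import OrderedDict

aps = {50: [88.0, 72.0], 75: [61.0, 49.0]}         # iou -> per-class AP
mAP = {iou: np.mean(x) for iou, x in aps.items()}  # {50: 80.0, 75: 55.0}
ret = OrderedDict()
ret["bbox"] = {"AP": np.mean(list(mAP.values())),  # 67.5
               "AP50": mAP[50], "AP75": mAP[75]}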
Example #12
    @classmethod
    def test(cls, cfg, model, evaluators=None, output_folder=None):
        """
        Args:
            cfg (config dict):
            model (nn.Module):
            evaluators (list[DatasetEvaluator] or None): if None, will call
                :meth:`build_evaluator`. Otherwise, must have the same length as
                ``cfg.DATASETS.TEST``.

        Returns:
            dict: a dict of result metrics
        """
        if isinstance(evaluators, DatasetEvaluator):
            evaluators = [evaluators]
        if evaluators is not None:
            assert len(
                cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
                    len(cfg.DATASETS.TEST), len(evaluators))

        results = OrderedDict()
        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
            data_loader = cls.build_test_loader(cfg)
            # When evaluators are passed in as arguments,
            # implicitly assume that evaluators can be created before data_loader.
            if evaluators is not None:
                evaluator = evaluators[idx]
            else:
                try:
                    evaluator = cls.build_evaluator(
                        cfg,
                        dataset_name,
                        data_loader.dataset,
                        output_folder=output_folder)
                except NotImplementedError:
                    logger.warning(
                        "No evaluator found. Use `DefaultRunner.test(evaluators=)`, "
                        "or implement its `build_evaluator` method.")
                    results[dataset_name] = {}
                    continue
            if cfg.TEST.ON_FILES:
                results_i = inference_on_files(evaluator)
            else:
                results_i = inference_on_dataset(model, data_loader, evaluator)
            results[dataset_name] = results_i
            if comm.is_main_process():
                assert isinstance(
                    results_i, dict
                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
                    results_i)
                logger.info("Evaluation results for {} in csv format:".format(
                    dataset_name))
                print_csv_format(results_i)

        if len(results) == 1:
            results = list(results.values())[0]
        return results
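Call-site sketch (assuming a built model and a config whose `DATASETS.TEST` is populated; `MyEvaluator` is hypothetical):

evaluators = [MyEvaluator(name) for name in cfg.DATASETS.TEST]  # optional
results = DefaultRunner.test(cfg, model, evaluators=evaluators)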
Example #13
    def evaluate(self):
        comm.synchronize()

        self._predictions = comm.gather(self._predictions)
        self._predictions = list(itertools.chain(*self._predictions))
        if not comm.is_main_process():
            return

        # gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
        gt_json = self._metadata.panoptic_json
        gt_folder = self._metadata.panoptic_root

        with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
            logger.info(
                "Writing all panoptic predictions to {} ...".format(pred_dir))
            for p in self._predictions:
                with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
                    f.write(p.pop("png_string"))

            with open(gt_json, "r") as f:
                json_data = json.load(f)
            json_data["annotations"] = self._predictions
            with megfile.smart_open(self._predictions_json, "w") as f:
                f.write(json.dumps(json_data))

            from panopticapi.evaluation import pq_compute

            with contextlib.redirect_stdout(io.StringIO()):
                pq_res = pq_compute(
                    gt_json,
                    self._predictions_json,
                    gt_folder=gt_folder,
                    pred_folder=pred_dir,
                )

        res = {}
        res["PQ"] = 100 * pq_res["All"]["pq"]
        res["SQ"] = 100 * pq_res["All"]["sq"]
        res["RQ"] = 100 * pq_res["All"]["rq"]
        res["PQ_th"] = 100 * pq_res["Things"]["pq"]
        res["SQ_th"] = 100 * pq_res["Things"]["sq"]
        res["RQ_th"] = 100 * pq_res["Things"]["rq"]
        res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
        res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
        res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]

        results = OrderedDict({"panoptic_seg": res})
        table = _print_panoptic_results(pq_res)

        if self._dump:
            dump_info_one_task = {
                "task": "panoptic_seg",
                "tables": [table],
            }
            _dump_to_markdown([dump_info_one_task])

        return results
Example #14
def main(args):
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)
    if args.debug:
        batches = int(cfg.SOLVER.IMS_PER_BATCH / 8 * args.num_gpus)
        if cfg.SOLVER.IMS_PER_BATCH != batches:
            cfg.SOLVER.IMS_PER_BATCH = batches
            logger.warning(
                "SOLVER.IMS_PER_BATCH is changed to {}".format(batches))

    if "MODEL.WEIGHTS" in args.opts:
        if cfg.MODEL.WEIGHTS.endswith(".pth") and not PathManager.exists(
                cfg.MODEL.WEIGHTS):
            ckpt_name = cfg.MODEL.WEIGHTS.split("/")[-1]
            model_prefix = cfg.OUTPUT_DIR.split("cvpods_playground")[1][1:]
            remote_file_path = os.path.join(cfg.OSS.DUMP_PREFIX, model_prefix,
                                            ckpt_name)
            logger.warning(
                f"The specified ckpt file ({cfg.MODEL.WEIGHTS}) was not found locally,"
                f" try to load the corresponding dump file on OSS ({remote_file_path})."
            )
            cfg.MODEL.WEIGHTS = remote_file_path
        valid_files = [cfg.MODEL.WEIGHTS]
    else:
        list_of_files = glob.glob(os.path.join(cfg.OUTPUT_DIR, '*.pth'))

        assert list_of_files, "No checkpoint file found in {}.".format(
            cfg.OUTPUT_DIR)
        list_of_files.sort(key=os.path.getctime)
        latest_file = list_of_files[-1]
        if not args.end_iter:
            valid_files = [latest_file]
        else:
            files = [f for f in list_of_files if str(f) <= str(latest_file)]
            valid_files = []
            for f in files:
                try:
                    model_iter = int(re.split(r'(model_|\.pth)', f)[-3])
                except Exception:
                    logger.warning("remove {}".format(f))
                    continue
                if args.start_iter <= model_iter <= args.end_iter:
                    valid_files.append(f)
            assert valid_files, "No .pth files satisfy your requirement"

    # evaluate each matched checkpoint file in turn
    for current_file in valid_files:
        cfg.MODEL.WEIGHTS = current_file
        model = build_model(cfg)

        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
Example #15
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the cvpods logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (BaseConfig): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    # setup_logger(output_dir, distributed_rank=rank, name="cvpods")
    logger = setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            PathManager.open(args.config_file, "r").read()))

    adjust_config(cfg)
    logger.info("Running with full config:\n{}".format(cfg))
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.diff(base_config)))
    # if comm.is_main_process() and output_dir:
    #     # Note: some of our scripts may expect the existence of
    #     # config.yaml in output directory
    #     path = os.path.join(output_dir, "config.yaml")
    #     with PathManager.open(path, "w") as f:
    #         f.write(cfg.dump())
    #     logger.info("Full config saved to {}".format(os.path.abspath(path)))

    # make sure each worker has a different, yet deterministic seed if specified

    seed = seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)
    # save seed to config for dump
    cfg.SEED = seed
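    # e.g. cfg.SEED = 42 on two workers -> rank 0 seeds with 42, rank 1 with 43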

    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
    # typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK

    return cfg, logger
Example #16
    def evaluate(self):
        results = OrderedDict()
        for evaluator in self._evaluators:
            result = evaluator.evaluate()
            if comm.is_main_process() and result is not None:
                for k, v in result.items():
                    assert (
                        k not in results
                    ), "Different evaluators produce results with the same key {}".format(k)
                    results[k] = v
        return results
Example #17
    def train(self):
        """
        Run training.

        Returns:
            OrderedDict of results, if evaluation is enabled. Otherwise None.
        """
        super().train(self.start_iter, self.max_iter, self.max_epoch)

        if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
            assert hasattr(self, "_last_eval_results"
                           ), "No evaluation results obtained during training!"
            verify_results(self.cfg, self._last_eval_results)
            return self._last_eval_results
Example #18
def do_test(cfg, model):
    results = OrderedDict()
    for dataset_name in cfg.DATASETS.TEST:
        data_loader = build_test_loader(cfg, dataset_name)
        evaluator = get_evaluator(
            cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
        )
        results_i = inference_on_dataset(model, data_loader, evaluator)
        results[dataset_name] = results_i
        if comm.is_main_process():
            logger.info("Evaluation results for {} in csv format:".format(dataset_name))
            print_csv_format(results_i)
    if len(results) == 1:
        results = list(results.values())[0]
    return results
Example #19
    def _write_metrics(
        self,
        loss_dict: Dict[str, torch.Tensor],
        data_time: float,
        prefix: str = "",
    ):
        """
        Args:
            loss_dict (dict): dict of scalar losses
            data_time (float): time taken by the dataloader iteration
        """
        device = next(iter(loss_dict.values())).device

        # Use a new stream so these ops don't wait for DDP or backward
        with torch.cuda.stream(torch.cuda.Stream() if device.type ==
                               "cuda" else None):
            metrics_dict = {
                k: v.detach().cpu().item()
                for k, v in loss_dict.items()
            }
            metrics_dict["data_time"] = data_time

            # Gather metrics among all workers for logging
            # This assumes we do DDP-style training, which is currently the only
            # supported method in cvpods.
            all_metrics_dict = comm.gather(metrics_dict)

        if comm.is_main_process():
            storage = get_event_storage()

            # data_time among workers can have high variance. The actual latency
            # caused by data_time is the maximum among workers.
            data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
            storage.put_scalar("data_time", data_time)

            # average the rest metrics
            metrics_dict = {
                k: np.mean([x[k] for x in all_metrics_dict])
                for k in all_metrics_dict[0].keys()
            }
            total_losses_reduced = sum(loss
                                       for key, loss in metrics_dict.items()
                                       if "loss" in key)
            storage.put_scalar("{}total_loss".format(prefix),
                               total_losses_reduced)
            if len(metrics_dict) > 1:
                storage.put_scalars(**metrics_dict)
Example #20
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the cvpods logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (BaseConfig): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        ensure_dir(output_dir)

    rank = comm.get_rank()
    # setup_logger(output_dir, distributed_rank=rank, name="cvpods")
    setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            megfile.smart_open(args.config_file, "r").read()))

    adjust_config(cfg)

    # make sure each worker has a different, yet deterministic seed if specified
    seed = seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)
    # save seed to config for dump
    cfg.SEED = seed

    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
    # typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK

    return cfg
Example #21
    def train(self):
        """
        Run training.

        Returns:
            OrderedDict of results, if evaluation is enabled. Otherwise None.
        """
        if self.max_epoch is None:
            logger.info("Starting training from iteration {}".format(
                self.start_iter))
        else:
            logger.info("Starting training from epoch {}".format(
                self.start_epoch))

        super().train(self.start_iter, self.start_epoch, self.max_iter)

        if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
            assert hasattr(self, "_last_eval_results"
                           ), "No evaluation results obtained during training!"
            verify_results(self.cfg, self._last_eval_results)
            return self._last_eval_results
Example #22
    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            self._predictions = comm.gather(self._predictions, dst=0)
            self._predictions = list(itertools.chain(*self._predictions))

            if not comm.is_main_process():
                return {}

        if len(self._predictions) == 0:
            self._logger.warning(
                "[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir,
                                     "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(self._predictions, f)

        self._results = OrderedDict()
        if "proposals" in self._predictions[0]:
            self._eval_box_proposals()
        if "instances" in self._predictions[0]:
            self._eval_predictions(set(self._tasks))

        if self._dump:
            extra_infos = {
                "title": os.path.basename(os.getcwd()),
                "seed": self.cfg.SEED,
            }
            _dump_to_markdown(extra_infos, self._dump_infos)

        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)
Example #23
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DefaultCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

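            # skip the first five iterations (warm-up) before letting writers emit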
            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
Example #24
    def evaluate(self):
        """
        Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

        * Mean intersection-over-union averaged across classes (mIoU)
        * Frequency Weighted IoU (fwIoU)
        * Mean pixel accuracy averaged across classes (mACC)
        * Pixel Accuracy (pACC)
        """
        if self._distributed:
            comm.synchronize()
            conf_matrix_list = comm.all_gather(self._conf_matrix)
            self._predictions = comm.all_gather(self._predictions)
            self._predictions = list(itertools.chain(*self._predictions))
            if not comm.is_main_process():
                return

            self._conf_matrix = np.zeros_like(self._conf_matrix)
            for conf_matrix in conf_matrix_list:
                self._conf_matrix += conf_matrix

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir,
                                     "sem_seg_predictions.json")
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(self._predictions))

        acc = np.zeros(self._num_classes, dtype=np.float64)
        iou = np.zeros(self._num_classes, dtype=np.float64)
        tp = self._conf_matrix.diagonal()[:-1].astype(np.float64)
        pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float64)
        class_weights = pos_gt / np.sum(pos_gt)
        pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float64)
        acc_valid = pos_gt > 0
        acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
        iou_valid = (pos_gt + pos_pred) > 0
        union = pos_gt + pos_pred - tp
        iou[iou_valid] = tp[iou_valid] / union[iou_valid]
        macc = np.sum(acc) / np.sum(acc_valid)
        miou = np.sum(iou) / np.sum(iou_valid)
        fiou = np.sum(iou * class_weights)
        pacc = np.sum(tp) / np.sum(pos_gt)

        res = {}
        res["mIoU"] = 100 * miou
        res["fwIoU"] = 100 * fiou
        res["mACC"] = 100 * macc
        res["pACC"] = 100 * pacc

        if self._output_dir:
            file_path = os.path.join(self._output_dir,
                                     "sem_seg_evaluation.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(res, f)
        results = OrderedDict({"sem_seg": res})

        small_table = create_small_table(res)
        self._logger.info("Evaluation results for sem_seg: \n" + small_table)

        if self._dump:
            dump_info_one_task = {
                "task": "sem_seg",
                "tables": [small_table],
            }
            _dump_to_markdown([dump_info_one_task])

        return results
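A toy check of the confusion-matrix arithmetic above (the extra last row/column holds the ignore label, hence the [:-1, :-1] slices). With two classes and made-up counts:

import numpy as np

conf = np.array([[8., 2., 0.],   # rows: predictions, cols: ground truth
                 [1., 6., 0.],   # last row/col: ignore label
                 [0., 0., 0.]])
tp = conf.diagonal()[:-1]                 # [8, 6]
pos_gt = conf[:-1, :-1].sum(axis=0)       # [9, 8]
pos_pred = conf[:-1, :-1].sum(axis=1)     # [10, 7]
iou = tp / (pos_gt + pos_pred - tp)       # [8/11, 6/9]
print(100 * iou.mean())                   # mIoU ~= 69.7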
Example #25
    def preprocess_image(self, batched_inputs, training):
        """
        Normalize, pad and batch the input images.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        bs = len(images)
        images = [self.normalizer(x) for x in images]

        images = ImageList.from_tensors(images,
                                        size_divisibility=0,
                                        pad_ref_long=True)

        # sync image size for all gpus
        comm.synchronize()
        if training and self.iter % self.change_iter == 0:
            if self.iter < self.max_iter - 20000:
                meg = torch.LongTensor(1).to(self.device)
                comm.synchronize()
                if comm.is_main_process():
                    size = np.random.choice(self.multi_size)
                    meg.fill_(size)

                if comm.get_world_size() > 1:
                    comm.synchronize()
                    dist.broadcast(meg, 0)
                self.size = meg.item()

                comm.synchronize()
            else:
                self.size = 608

        if training:

            # resize image inputs
            modes = ['bilinear', 'nearest', 'bicubic', 'area']
            mode = modes[random.randrange(4)]
            if mode == 'bilinear' or mode == 'bicubic':
                images.tensor = F.interpolate(images.tensor,
                                              size=[self.size, self.size],
                                              mode=mode,
                                              align_corners=False)
            else:
                images.tensor = F.interpolate(images.tensor,
                                              size=[self.size, self.size],
                                              mode=mode)

            if "instances" in batched_inputs[0]:
                gt_instances = [
                    x["instances"].to(self.device) for x in batched_inputs
                ]
            elif "targets" in batched_inputs[0]:
                log_first_n(
                    logging.WARN,
                    "'targets' in the model inputs is now renamed to 'instances'!",
                    n=10)
                gt_instances = [
                    x["targets"].to(self.device) for x in batched_inputs
                ]
            else:
                gt_instances = None

            targets = [
                torch.cat([
                    instance.gt_classes.float().unsqueeze(-1),
                    instance.gt_boxes.tensor
                ],
                          dim=-1) for instance in gt_instances
            ]
            labels = torch.zeros((bs, 100, 5))
            for i, target in enumerate(targets):
                labels[i][:target.shape[0]] = target
            labels[:, :, 1:] = labels[:, :, 1:] / 512. * self.size
        else:
            labels = None

        self.iter += 1
        return images, labels
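The multi-scale size negotiation above is a common pattern: the main process samples a value and `dist.broadcast` replicates it to every rank. A minimal standalone sketch (assumes `torch.distributed` is already initialized; the candidate sizes are illustrative):

import numpy as np
import torch
import torch.distributed as dist

size_tensor = torch.zeros(1, dtype=torch.long)
if dist.get_rank() == 0:
    size_tensor.fill_(int(np.random.choice([416, 512, 608])))
dist.broadcast(size_tensor, src=0)  # all ranks now agree on the size
size = size_tensor.item()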
Example #26
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the cvpods logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (BaseConfig): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    # setup_logger(output_dir, distributed_rank=rank, name="cvpods")
    logger = setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            PathManager.open(args.config_file, "r").read()))

    logger.info("Running with full config:\n{}".format(cfg))
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.show_diff(base_config)))
    # if comm.is_main_process() and output_dir:
    #     # Note: some of our scripts may expect the existence of
    #     # config.yaml in output directory
    #     path = os.path.join(output_dir, "config.yaml")
    #     with PathManager.open(path, "w") as f:
    #         f.write(cfg.dump())
    #     logger.info("Full config saved to {}".format(os.path.abspath(path)))

    # make sure each worker has a different, yet deterministic seed if specified
    seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)

    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
    # typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK

    # dynamic adjust batch_size, steps according to world size
    base_world_size = int(cfg.SOLVER.IMS_PER_BATCH / cfg.SOLVER.IMS_PER_DEVICE)
    world_size = comm.get_world_size()
    ratio = world_size / base_world_size
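    # e.g. a config tuned for 8 GPUs (IMS_PER_BATCH=16, IMS_PER_DEVICE=2) run on
    # 16 GPUs gives ratio = 2: the batch doubles, MAX_ITER halves, BASE_LR doubles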

    cfg.SOLVER.IMS_PER_BATCH = int(ratio * cfg.SOLVER.IMS_PER_BATCH)
    cfg.SOLVER.LR_SCHEDULER.MAX_ITER = int(cfg.SOLVER.LR_SCHEDULER.MAX_ITER /
                                           ratio)

    # Divided by scale ratio when using iterations rather than epochs
    if cfg.SOLVER.LR_SCHEDULER.MAX_EPOCH is None:
        cfg.SOLVER.LR_SCHEDULER.STEPS = list(
            (int(step / ratio) for step in cfg.SOLVER.LR_SCHEDULER.STEPS))
        cfg.SOLVER.CHECKPOINT_PERIOD = int(cfg.SOLVER.CHECKPOINT_PERIOD /
                                           ratio)
        cfg.TEST.EVAL_PERIOD = int(cfg.TEST.EVAL_PERIOD / ratio)

    cfg.SOLVER.OPTIMIZER.BASE_LR = ratio * cfg.SOLVER.OPTIMIZER.BASE_LR

    assert cfg.SOLVER.IMS_PER_BATCH / cfg.SOLVER.IMS_PER_DEVICE == world_size

    return cfg, logger
Example #27
    def build_hooks(self):
        """
        Build a list of default hooks, including timing, evaluation,
        checkpointing, lr scheduling, precise BN, writing events.

        Returns:
            list[HookBase]:
        """
        cfg = self.cfg
        # cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN

        ret = [
            hooks.OptimizationHook(
                accumulate_grad_steps=cfg.SOLVER.BATCH_SUBDIVISIONS,
                grad_clipper=None,
                mixed_precision=cfg.TRAINER.FP16.ENABLED),
            hooks.LRScheduler(self.optimizer, self.scheduler),
            hooks.IterationTimer(),
            hooks.PreciseBN(
                # Run at the same freq as (but before) evaluation.
                cfg.TEST.EVAL_PERIOD,
                self.model,
                # Build a new data loader to not affect training
                self.build_train_loader(cfg),
                cfg.TEST.PRECISE_BN.NUM_ITER,
            ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
            else None,
        ]

        # Do PreciseBN before the checkpointer, because it updates the model and
        # the result needs to be saved by the checkpointer.
        # This is not always the best: if checkpointing has a different frequency,
        # some checkpoints may have more precise statistics than others.
        if comm.is_main_process():
            ret.append(
                hooks.PeriodicCheckpointer(self.checkpointer,
                                           cfg.SOLVER.CHECKPOINT_PERIOD,
                                           max_iter=self.max_iter,
                                           max_epoch=self.max_epoch))

        def test_and_save_results():
            self._last_eval_results = self.test(self.cfg, self.model)
            return self._last_eval_results

        def save_best_model():
            key = cfg.TEST.SORT_BY
            assert hasattr(
                self,
                '_last_eval_results'), "Must run after test_and_save_results()"
            max_value = 0.0 if self._max_eval_results is None else flatten_results_dict(
                self._max_eval_results)[key]
            cur_value = flatten_results_dict(self._last_eval_results)[key]
            if cur_value >= max_value:
                self._max_eval_results = self._last_eval_results
                """ start save checkpoint
                """
                self.checkpointer.save("model_best")
            return None

        # Do evaluation after checkpointer, because then if it fails,
        # we can use the saved checkpoint to debug.
        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
        if cfg.TEST.get('SORT_BY'):
            # keep a "model_best" checkpoint for the highest value of cfg.TEST.SORT_BY
            self._max_eval_results = None
            ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, save_best_model))

        if comm.is_main_process():
            # Here the default print/log frequency of each writer is used.
            # run writers in the end, so that evaluation metrics are written
            ret.append(
                hooks.PeriodicWriter(self.build_writers(),
                                     period=self.window_size))
            ret.append(
                hooks.PeriodicWriter(self.build_everystep_writers(), period=1))
        return ret