Example #1
def extract_archive_file(archive_fn: str, im_dir: str):
    if not PathManager.exists(im_dir) or not PathManager.ls(im_dir):
        # Dataset is not deployed. Deploy it.
        archive_fns = archive_fn
        # A dataset may be composed of several tgz files, or only one.
        # If only one, wrap it in a list so the code below handles both cases uniformly.
        if not isinstance(archive_fns, list):
            archive_fns = [archive_fns]
        logger.info("Extracting datasets {} to local machine at {}".format(
            archive_fns, im_dir))
        if not PathManager.exists(im_dir):
            PathManager.mkdirs(im_dir)

        for archive_fn in archive_fns:
            # Extract the tgz file directly into the target directory,
            # without precopy.
            # Note that the tgz file contains a root directory that
            # we do not want, hence the strip-components=1
            commandUnpack = ("tar -mxzf {src_file} -C {tgt_dir} "
                             "--strip-components=1").format(
                                 src_file=archive_fn, tgt_dir=im_dir)

            assert not subprocess.call(
                shlex.split(commandUnpack)), "Failed to unpack"
            logger.info("Extracted {}".format(archive_fn))
Example #2
def convert_to_coco_json(dataset_name, output_file, allow_cached=True):
    """
    Converts dataset into COCO format and saves it to a json file.
    dataset_name must be registered in DatasetCatalog and in detectron2's standard format.

    Args:
        dataset_name:
            reference from the config file to the catalogs;
            must be registered in DatasetCatalog and in detectron2's standard format
        output_file: path of the json file to save to
        allow_cached: if the json file is already present, skip the conversion
    """

    # TODO: The dataset or the conversion script *may* change,
    # a checksum would be useful for validating the cached data

    PathManager.mkdirs(os.path.dirname(output_file))
    with file_lock(output_file):
        if PathManager.exists(output_file) and allow_cached:
            logger.warning(
                f"Using previously cached COCO format annotations at '{output_file}'. "
                "You need to clear the cache file if your dataset has been modified."
            )
        else:
            logger.info(f"Converting annotations of dataset '{dataset_name}' to COCO format ...)")
            coco_dict = convert_to_coco_dict(dataset_name)

            logger.info(f"Caching COCO format annotations at '{output_file}' ...")
            tmp_file = output_file + ".tmp"
            with PathManager.open(tmp_file, "w") as f:
                json.dump(coco_dict, f)
            shutil.move(tmp_file, output_file)
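A hedged usage example, assuming a dataset named "my_dataset_train" is already registered in DatasetCatalog in detectron2's standard format; both the name and the output path below are placeholders.

convert_to_coco_json(
    "my_dataset_train",                 # placeholder dataset name
    "/tmp/my_dataset_train_coco.json",  # placeholder output path
    allow_cached=True,                  # reuse an existing cached json if present
)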
Example #3
    def save_protobuf(self, output_dir):
        """
        Save the model in caffe2's protobuf format.
        It saves the following files:

            * "model.pb": definition of the graph. Can be visualized with
              tools like `netron <https://github.com/lutzroeder/netron>`_.
            * "model_init.pb": model parameters
            * "model.pbtxt": human-readable definition of the graph. Not
              needed for deployment.

        Args:
            output_dir (str): the output directory to save protobuf files.
        """
        logger = logging.getLogger(__name__)
        logger.info("Saving model to {} ...".format(output_dir))
        if not PathManager.exists(output_dir):
            PathManager.mkdirs(output_dir)

        with PathManager.open(os.path.join(output_dir, "model.pb"), "wb") as f:
            f.write(self._predict_net.SerializeToString())
        with PathManager.open(os.path.join(output_dir, "model.pbtxt"),
                              "w") as f:
            f.write(str(self._predict_net))
        with PathManager.open(os.path.join(output_dir, "model_init.pb"),
                              "wb") as f:
            f.write(self._init_net.SerializeToString())
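A usage sketch; `exporter` stands in for an instance of the exporting class that defines save_protobuf (the instance itself is not shown in the snippet above).

# `exporter` is a hypothetical instance of the class shown above.
exporter.save_protobuf("/tmp/caffe2_export")
# The directory then contains model.pb, model_init.pb, and model.pbtxt.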
Example #4
 def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
     self.gt_json = ann_file
     self.gt_folder = ann_folder
     if utils.is_main_process():
         if not PathManager.exists(output_dir):
             PathManager.mkdirs(output_dir)
     self.output_dir = output_dir
     self.predictions = []
Example #5
    def _get_test_image(self):
        try:
            file_name = DatasetCatalog.get("coco_2017_train")[0]["file_name"]
            if not PathManager.exists(file_name):
                raise FileNotFoundError()
        except IOError:
            # for public CI to run
            file_name = "http://images.cocodataset.org/train2017/000000000009.jpg"

        with PathManager.open(file_name, "rb") as f:
            buf = f.read()
        img = cv2.imdecode(np.frombuffer(buf, dtype=np.uint8), cv2.IMREAD_COLOR)
        assert img is not None, file_name
        return torch.from_numpy(img.transpose(2, 0, 1))
Example #6
 def test_read_sem_seg(self):
     cityscapes_dir = MetadataCatalog.get("cityscapes_fine_sem_seg_val").gt_dir
     sem_seg_gt_path = os.path.join(
         cityscapes_dir, "frankfurt", "frankfurt_000001_083852_gtFine_labelIds.png"
     )
     if not PathManager.exists(sem_seg_gt_path):
         raise unittest.SkipTest(
             "Semantic segmentation ground truth {} not found.".format(sem_seg_gt_path)
         )
     sem_seg = detection_utils.read_image(sem_seg_gt_path, "L")
     self.assertEqual(sem_seg.ndim, 3)
     self.assertEqual(sem_seg.shape[2], 1)
     self.assertEqual(sem_seg.dtype, np.uint8)
     self.assertEqual(sem_seg.max(), 32)
     self.assertEqual(sem_seg.min(), 1)
Example #7
 def __init__(self, root, split, transforms=None):
     super(ADE20KParsing, self).__init__(root)
     # assert exists and prepare dataset automatically
     assert PathManager.exists(root), "Please setup the dataset"
     self.images, self.masks = _get_ade20k_pairs(root, split)
     assert len(self.images) == len(self.masks)
     if len(self.images) == 0:
         raise RuntimeError("Found 0 images in subfolders of: " + root + "\n")
     self._transforms = transforms
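A construction sketch, assuming the ADE20K data is laid out under `root` in the structure `_get_ade20k_pairs` expects; the path and split name below are illustrative.

dataset = ADE20KParsing(root="/datasets/ADEChallengeData2016", split="val")
print(len(dataset.images), "image/mask pairs found")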
Example #8
    def save_protobuf(self, output_dir):
        """
        Save the model in caffe2's protobuf format.

        Args:
            output_dir (str): the output directory to save protobuf files.
        """
        logger = logging.getLogger(__name__)
        logger.info("Saving model to {} ...".format(output_dir))
        if not PathManager.exists(output_dir):
            PathManager.mkdirs(output_dir)

        with PathManager.open(os.path.join(output_dir, "model.pb"), "wb") as f:
            f.write(self._predict_net.SerializeToString())
        with PathManager.open(os.path.join(output_dir, "model.pbtxt"), "w") as f:
            f.write(str(self._predict_net))
        with PathManager.open(os.path.join(output_dir, "model_init.pb"), "wb") as f:
            f.write(self._init_net.SerializeToString())
Example #9
def get_sample_coco_image(tensor=True):
    """
    Args:
        tensor (bool): if True, returns a 3xHxW tensor;
            otherwise, returns an HxWx3 numpy array.

    Returns:
        an image, in BGR color.
    """
    try:
        file_name = DatasetCatalog.get("coco_2017_val_100")[0]["file_name"]
        if not PathManager.exists(file_name):
            raise FileNotFoundError()
    except IOError:
        # for public CI to run
        file_name = "http://images.cocodataset.org/train2017/000000000009.jpg"
    ret = read_image(file_name, format="BGR")
    if tensor:
        ret = torch.from_numpy(np.ascontiguousarray(ret.transpose(2, 0, 1)))
    return ret
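Usage is straightforward; when the local COCO dataset is unavailable, the function falls back to downloading a single image over HTTP.

img_tensor = get_sample_coco_image()             # 3xHxW torch tensor, BGR
img_array = get_sample_coco_image(tensor=False)  # HxWx3 numpy array, BGR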
Example #10
def main(
    cfg: CfgNode,
    output_dir: str,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training with lightning trainer
    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_processes: Number of processes on each node.
        eval_only: True to run evaluation only.
    """
    # FIXME: make comm.get_world_size() work properly.
    setup_after_launch(cfg, output_dir, _scale_world_size=False)
    auto_scale_world_size(cfg, new_world_size=num_machines * num_processes)

    task = task_cls.from_config(cfg, eval_only)
    trainer_params = get_trainer_params(cfg, num_machines, num_processes)

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=trainer_params["logger"].log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
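A minimal launch sketch, assuming `cfg` is an already-populated D2Go config node; the output directory below is a placeholder.

# cfg is assumed to be loaded and populated elsewhere.
result = main(cfg, output_dir="/tmp/d2go_run", eval_only=False,
              num_machines=1, num_processes=1)
print(result.output_dir, result.accuracy)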
Example #11
def main(
    cfg: CfgNode,
    output_dir: Optional[str] = None,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_gpus: int = 0,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training with lightning trainer
    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_gpus: Number of GPUs to train on each node
        num_processes: Number of processes on each node.
            NOTE: Automatically set to the number of GPUs when using DDP.
            Set a value greater than 1 to mimic distributed training on CPUs.
        eval_only: True to run evaluation only.
    """
    assert (
        num_processes == 1 or num_gpus == 0
    ), "Only set num_processes > 1 when training on CPUs"

    maybe_override_output_dir(cfg, output_dir)

    task = task_cls.from_config(cfg, eval_only)
    tb_logger = TensorBoardLogger(save_dir=cfg.OUTPUT_DIR)

    trainer_params = {
        # training loop is bounded by max steps, use a large max_epochs to make
        # sure max_steps is met first
        "max_epochs": 10**8,
        "max_steps": cfg.SOLVER.MAX_ITER,
        "val_check_interval": cfg.TEST.EVAL_PERIOD
        if cfg.TEST.EVAL_PERIOD > 0
        else cfg.SOLVER.MAX_ITER,
        "num_nodes": num_machines,
        "gpus": num_gpus,
        "num_processes": num_processes,
        "accelerator": get_accelerator(cfg.MODEL.DEVICE),
        "callbacks": _get_trainer_callbacks(cfg),
        "logger": tb_logger,
        "num_sanity_val_steps": 0,
        "progress_bar_refresh_rate": 10,
    }

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=tb_logger.log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
Example #12
def fetch_checkpoints_till_final(checkpoint_dir):
    """
    A generator that yields all checkpoint paths under the given directory; it keeps
    refreshing until model_final is found.
    """

    MIN_SLEEP_INTERVAL = 1.0  # in seconds
    MAX_SLEEP_INTERVAL = 60.0  # in seconds
    sleep_interval = MIN_SLEEP_INTERVAL

    finished_checkpoints = set()

    def _add_and_log(path):
        finished_checkpoints.add(path)
        logger.info("Found checkpoint: {}".format(path))
        return path

    def _log_and_sleep(sleep_interval):
        logger.info(
            "Sleep {} seconds while waiting for model_final.pth".format(sleep_interval)
        )
        time.sleep(sleep_interval)
        return min(sleep_interval * 2, MAX_SLEEP_INTERVAL)

    def _get_lightning_checkpoints(path: str):
        return [
            os.path.join(path, x)
            for x in PathManager.ls(path)
            if x.endswith(ModelCheckpoint.FILE_EXTENSION)
            and not x.startswith(ModelCheckpoint.CHECKPOINT_NAME_LAST)
        ]

    while True:
        if not PathManager.exists(checkpoint_dir):
            sleep_interval = _log_and_sleep(sleep_interval)
            continue

        checkpoint_paths = DetectionCheckpointer(
            None, save_dir=checkpoint_dir
        ).get_all_checkpoint_files()
        checkpoint_paths.extend(_get_lightning_checkpoints(checkpoint_dir))

        final_model_path = None
        periodic_checkpoints = []

        for path in sorted(checkpoint_paths):
            if path.endswith("model_final.pth") or path.endswith("model_final.ckpt"):
                final_model_path = path
                continue

            if path.endswith(ModelCheckpoint.FILE_EXTENSION):
                # Lightning checkpoint
                model_iter = int(
                    re.findall(
                        r"(?<=step=)\d+(?={})".format(ModelCheckpoint.FILE_EXTENSION),
                        path,
                    )[0]
                )
            else:
                model_iter = int(re.findall(r"(?<=model_)\d+(?=\.pth)", path)[0])
            periodic_checkpoints.append((path, model_iter))

        periodic_checkpoints = [
            pc for pc in periodic_checkpoints if pc[0] not in finished_checkpoints
        ]
        periodic_checkpoints = sorted(periodic_checkpoints, key=lambda x: x[1])
        for pc in periodic_checkpoints:
            yield _add_and_log(pc[0])
            sleep_interval = MIN_SLEEP_INTERVAL

        if final_model_path is None:
            sleep_interval = _log_and_sleep(sleep_interval)
        else:
            yield _add_and_log(final_model_path)
            break
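Because this is a generator, it is typically consumed in a loop that processes each checkpoint as soon as it appears and stops once model_final is yielded; `evaluate_checkpoint` below is a hypothetical callback and the directory is a placeholder.

for ckpt_path in fetch_checkpoints_till_final("/tmp/output/checkpoints"):
    # evaluate_checkpoint is a placeholder for per-checkpoint processing.
    evaluate_checkpoint(ckpt_path)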