Exemplo n.º 1
0
    def _find_shared_fs_path(self) -> pathlib.Path:
        """Locate this checkpoint's directory on shared_fs checkpoint storage.

        Two candidate directories are derived from the experiment's
        ``checkpoint_storage`` configuration: one from ``host_path`` and the
        optional ``storage_path`` alone, and one that additionally passes
        ``constants.SHARED_FS_CONTAINER_PATH`` to ``shared._full_storage_path``.
        Each candidate is suffixed with the checkpoint UUID, and the first one
        that exists on the local file system is returned.

        This function assumes the host path of the shared fs exists.

        Returns:
            The first candidate path that exists.

        Raises:
            FileNotFoundError: If none of the candidate paths exists.
        """
        storage_conf = self.experiment_config["checkpoint_storage"]
        host_path = storage_conf["host_path"]
        storage_path = storage_conf.get("storage_path")

        # Resolve both storage roots up front so that any validation error
        # raised by the helper surfaces before the file system is probed.
        storage_roots = (
            shared._full_storage_path(host_path, storage_path),
            shared._full_storage_path(
                host_path, storage_path, constants.SHARED_FS_CONTAINER_PATH
            ),
        )
        candidates = [pathlib.Path(root, self.uuid) for root in storage_roots]

        located = next((cand for cand in candidates if cand.exists()), None)
        if located is not None:
            return located

        raise FileNotFoundError(
            "Checkpoint {} not found in {}. This error could be caused by not having "
            "the same shared file system mounted on the local machine as the experiment "
            "checkpoint storage configuration.".format(self.uuid, candidates))
Exemplo n.º 2
0
def build(env: det.EnvContext,
          checkpoint_config: Dict[str, Any],
          container_path: Optional[str] = None) -> base.TensorboardManager:
    """
    Construct the tensorboard manager selected by the `type` key of the
    storage configuration dictionary.

    Supported `type` values are "shared_fs", "gcs", "s3" and "hdfs".

    container_path, if set, is forwarded to `_full_storage_path` together with
    the configured host_path and storage_path when building the
    SharedFSTensorboardManager; it is ignored by every other backend.

    Raises a `TypeError` if `type` is missing or empty, is not a string, or
    does not name one of the supported backends.
    """
    type_name = checkpoint_config.get("type")

    # Validate the selector before touching any other configuration.
    if not type_name:
        raise TypeError("Missing 'type' parameter of storage configuration")
    if not isinstance(type_name, str):
        raise TypeError(
            "`type` parameter of storage configuration must be a string")

    # Both paths are resolved before dispatching, so these helpers run even
    # when the type turns out to be unknown.
    base_path = get_base_path(checkpoint_config, manager=True)
    sync_path = get_sync_path(env)

    if type_name == "shared_fs":
        shared_root = _full_storage_path(
            checkpoint_config["host_path"],
            checkpoint_config.get("storage_path"),
            container_path,
        )
        return shared.SharedFSTensorboardManager(shared_root, base_path,
                                                 sync_path)

    if type_name == "gcs":
        bucket = checkpoint_config["bucket"]
        return gcs.GCSTensorboardManager(bucket, base_path, sync_path)

    if type_name == "s3":
        # Only the bucket is mandatory; the remaining settings fall back to
        # None when absent from the configuration.
        bucket = checkpoint_config["bucket"]
        access_key = checkpoint_config.get("access_key")
        secret_key = checkpoint_config.get("secret_key")
        endpoint_url = checkpoint_config.get("endpoint_url")
        return s3.S3TensorboardManager(bucket, access_key, secret_key,
                                       endpoint_url, base_path, sync_path)

    if type_name == "hdfs":
        # `hdfs_url` and `hdfs_path` are required keys; `user` is optional.
        return hdfs.HDFSTensorboardManager(
            checkpoint_config["hdfs_url"],
            checkpoint_config["hdfs_path"],
            checkpoint_config.get("user"),
            base_path,
            sync_path,
        )

    raise TypeError(f"Unknown storage type: {type_name}")
Exemplo n.º 3
0
    def _find_shared_fs_path(self) -> pathlib.Path:
        """Return the existing shared_fs directory that holds this checkpoint.

        Two storage roots are computed from ``self.storage_config``: one from
        ``host_path`` and the optional ``storage_path`` alone, and one that
        additionally passes ``constants.SHARED_FS_CONTAINER_PATH`` to
        ``shared._full_storage_path``. The checkpoint UUID is appended to each
        root in turn, and the first resulting path that exists is returned.

        Raises:
            FileNotFoundError: If neither candidate path exists.
        """
        host_path = self.storage_config["host_path"]
        storage_path = self.storage_config.get("storage_path")

        # Both roots are resolved before any existence check is made.
        host_root = shared._full_storage_path(host_path, storage_path)
        container_root = shared._full_storage_path(
            host_path, storage_path, constants.SHARED_FS_CONTAINER_PATH
        )

        for root in (host_root, container_root):
            candidate = pathlib.Path(root, self.uuid)
            if candidate.exists():
                return candidate

        raise FileNotFoundError("Checkpoint {} not found".format(self.uuid))
Exemplo n.º 4
0
def test_full_storage_path() -> None:
    """Check `shared._full_storage_path` against accepted and rejected inputs."""
    # A host_path that is not absolute must be refused.
    with pytest.raises(check.CheckFailedError, match="`host_path` must be an absolute path"):
        shared._full_storage_path("host_path")

    # Each entry is (positional args, keyword args, expected result). The
    # positional/keyword split is deliberate so both calling styles are covered.
    accepted = [
        (("/host_path",), {}, "/host_path"),
        (("/host_path",), {"container_path": "cpath"}, "cpath"),
        (("/host_path", "storage_path"), {}, "/host_path/storage_path"),
        (("/host_path", "storage_path"), {"container_path": "cpath"}, "cpath/storage_path"),
        (("/host_path",), {"storage_path": "/host_path/storage_path"}, "/host_path/storage_path"),
        (("/host_path", "/host_path/storage_path", "cpath"), {}, "cpath/storage_path"),
    ]
    for args, kwargs, expected in accepted:
        assert shared._full_storage_path(*args, **kwargs) == expected

    # storage_path values that must fail the subdirectory check.
    for rejected in ("/storage_path", "/host_path/../test", "../test"):
        with pytest.raises(check.CheckFailedError, match="must be a subdirectory"):
            shared._full_storage_path("/host_path", storage_path=rejected)