Example #1
def test_illegal_type() -> None:
    checkpoint_config = {"type": 4}
    with pytest.raises(TypeError, match="must be a string"):
        env = test_util.get_dummy_env()
        tensorboard.build(
            env.det_cluster_id, env.det_experiment_id, env.det_trial_id, checkpoint_config
        )
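The snippets below reference module-level fixtures (HOST_PATH, STORAGE_PATH, BASE_PATH, default_conf) that live in the original test modules but are not shown here. A minimal stand-in, inferred from the assertions in the later examples (the bucket name from Example #13, the storage path from Example #16), might look like:

import pathlib

# Hypothetical stand-ins for the fixtures these snippets assume.
HOST_PATH = pathlib.Path("/tmp/host_path")
STORAGE_PATH = HOST_PATH.joinpath("test_storage_path")
BASE_PATH = pathlib.Path("/tmp/base_path")
default_conf = {
    "type": "s3",
    "bucket": "s3_bucket",
}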
Example #2
def test_unknown_type() -> None:
    checkpoint_config = {
        "type": "unknown",
        "host_path": HOST_PATH,
    }
    with pytest.raises(TypeError, match="Unknown storage type: unknown"):
        tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
Example #3
def test_s3_build_missing_param() -> None:
    conf = copy.deepcopy(default_conf)
    del conf["bucket"]

    env = test_util.get_dummy_env()
    with pytest.raises(KeyError):
        tensorboard.build(env.det_cluster_id, env.det_experiment_id,
                          env.det_trial_id, conf)
Example #4
def test_invalid_prefix(monkeypatch: monkeypatch.MonkeyPatch) -> None:
    env = test_util.get_dummy_env()
    conf = copy.deepcopy(default_conf)
    conf["prefix"] = "my/invalid/../prefix"

    with pytest.raises(ValueError):
        tensorboard.build(env.det_cluster_id, env.det_experiment_id,
                          env.det_trial_id, conf)
Example #5
def test_unknown_type() -> None:
    checkpoint_config = {
        "type": "unknown",
        "host_path": HOST_PATH,
    }
    with pytest.raises(TypeError, match="Unknown storage type: unknown"):
        env = test_util.get_dummy_env()
        tensorboard.build(env.det_cluster_id, env.det_experiment_id,
                          env.det_trial_id, checkpoint_config)
Example #6
def test_s3_build(prefix: Optional[str]) -> None:
    env = test_util.get_dummy_env()
    conf = copy.deepcopy(default_conf)
    conf["prefix"] = prefix
    manager = tensorboard.build(env.det_cluster_id, env.det_experiment_id,
                                env.det_trial_id, conf)
    assert isinstance(manager, tensorboard.S3TensorboardManager)
Example #7
def test_getting_manager_instance(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {"type": "shared_fs", "host_path": HOST_PATH}
    env = test_util.get_dummy_env()
    manager = tensorboard.build(
        env.det_cluster_id, env.det_experiment_id, env.det_trial_id, checkpoint_config
    )
    assert isinstance(manager, tensorboard.SharedFSTensorboardManager)
Example #8
def prepare_tensorboard(
    env: det.EnvContext,
    container_path: Optional[str] = None,
) -> Tuple[tensorboard.TensorboardManager, tensorboard.BatchMetricWriter]:
    tensorboard_mgr = tensorboard.build(
        env.det_cluster_id,
        env.det_experiment_id,
        env.det_trial_id,
        env.experiment_config["checkpoint_storage"],
        container_path,
    )
    try:
        from determined.tensorboard.metric_writers import tensorflow

        writer: tensorboard.MetricWriter = tensorflow.TFWriter()

    except ModuleNotFoundError:
        logging.warning("Tensorflow writer not found")
        from determined.tensorboard.metric_writers import pytorch

        writer = pytorch.TorchWriter()

    return (
        tensorboard_mgr,
        tensorboard.BatchMetricWriter(writer),
    )
Example #9
def test_getting_manager_instance(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "host_path": HOST_PATH,
        "container_path": tmp_path
    }
    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
    assert isinstance(manager, SharedFSTensorboardManager)
Example #10
def test_s3_faulty_lifecycle(monkeypatch: monkeypatch.MonkeyPatch) -> None:
    monkeypatch.setattr("boto3.client", s3.s3_faulty_client)
    env = test_util.get_dummy_env()
    manager = tensorboard.build(env.det_cluster_id, env.det_experiment_id,
                                env.det_trial_id, default_conf)

    with pytest.raises(exceptions.S3UploadFailedError):
        manager.sync()
Example #11
def test_setting_optional_variable(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "base_path": "test_value",
        "host_path": HOST_PATH,
    }
    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
    assert isinstance(manager, tensorboard.SharedFSTensorboardManager)
    assert manager.base_path == pathlib.Path("test_value/tensorboard")
Example #12
def test_setting_storage_path(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "host_path": str(HOST_PATH),
        "storage_path": str(STORAGE_PATH),
    }
    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
    assert isinstance(manager, tensorboard.SharedFSTensorboardManager)
    assert manager.storage_path == STORAGE_PATH
Example #13
def test_s3_lifecycle(monkeypatch: monkeypatch.MonkeyPatch) -> None:
    monkeypatch.setattr("boto3.client", s3.s3_client)
    manager = tensorboard.build(test_util.get_dummy_env(), default_conf)
    assert isinstance(manager, tensorboard.S3TensorboardManager)

    manager.sync()
    expected = (
        "s3_bucket",
        "uuid-123/tensorboard/experiment/1/trial/1/events.out.tfevents.example",
    )
    assert expected in manager.client.objects
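The S3 examples monkeypatch boto3.client with test doubles (s3.s3_client, s3.s3_faulty_client) whose implementation is not shown. A minimal sketch of stubs that would satisfy the assertions above and the failure injected in Example #10, assuming the manager exposes the client as manager.client:

from typing import Any, Set, Tuple

from boto3 import exceptions

class _FakeS3Client:
    # Records uploads as (bucket, key) tuples instead of talking to S3.
    def __init__(self) -> None:
        self.objects: Set[Tuple[str, str]] = set()

    def upload_file(self, filename: str, bucket: str, key: str) -> None:
        self.objects.add((bucket, key))

class _FaultyS3Client(_FakeS3Client):
    # Simulates an upload failure for the error-path test.
    def upload_file(self, filename: str, bucket: str, key: str) -> None:
        raise exceptions.S3UploadFailedError("injected upload failure")

def s3_client(service_name: str, *args: Any, **kwargs: Any) -> _FakeS3Client:
    # Drop-in replacement for boto3.client("s3", ...).
    assert service_name == "s3"
    return _FakeS3Client()

def s3_faulty_client(service_name: str, *args: Any, **kwargs: Any) -> _FakeS3Client:
    assert service_name == "s3"
    return _FaultyS3Client()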
Example #14
def test_list_nonexistent_directory(tmp_path: pathlib.Path) -> None:
    base_path = "/non-existent-directory"
    checkpoint_config = {
        "type": "shared_fs",
        "base_path": base_path,
        "host_path": HOST_PATH,
        "container_path": tmp_path,
    }

    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
    assert not pathlib.Path(base_path).exists()
    assert manager.list_tfevents() == []
Example #15
def test_list_directory(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "base_path": BASE_PATH,
        "host_path": HOST_PATH,
        "container_path": tmp_path,
    }
    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)

    full_event_path = BASE_PATH.joinpath("tensorboard",
                                         "events.out.tfevents.example")

    assert set(manager.list_tfevents()) == {full_event_path}
Example #16
def test_build_with_container_path(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "host_path": str(HOST_PATH),
        "storage_path": str(STORAGE_PATH),
    }
    env = test_util.get_dummy_env()
    manager = tensorboard.build(
        env.det_cluster_id,
        env.det_experiment_id,
        env.det_trial_id,
        checkpoint_config,
        container_path=str(tmp_path),
    )
    assert isinstance(manager, tensorboard.SharedFSTensorboardManager)
    assert manager.storage_path == tmp_path.joinpath("test_storage_path")
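The container_path remapping asserted above can be illustrated with plain pathlib arithmetic. This is a hypothetical sketch using the stand-in fixture values from the top of this section, not the manager's actual implementation:

import pathlib

host_path = pathlib.Path("/tmp/host_path")              # HOST_PATH stand-in
storage_path = host_path.joinpath("test_storage_path")  # STORAGE_PATH stand-in
container_path = pathlib.Path("/tmp/pytest-tmp")        # the tmp_path fixture

# Re-root the host-side storage path under the container mount point.
remapped = container_path.joinpath(storage_path.relative_to(host_path))
assert remapped == container_path.joinpath("test_storage_path")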
Example #17
def prepare_tensorboard(
    env: det.EnvContext,
) -> Tuple[tensorboard.TensorboardManager, tensorboard.BatchMetricWriter]:
    tensorboard_mgr = tensorboard.build(env, env.experiment_config["checkpoint_storage"])
    try:
        from determined.tensorboard.metric_writers import pytorch

        writer: tensorboard.MetricWriter = pytorch.TorchWriter()

    except ImportError:
        print("PYTORCH WRITER NOT FOUND")
        from determined.tensorboard.metric_writers import tensorflow

        writer = tensorflow.TFWriter()

    return (
        tensorboard_mgr,
        tensorboard.BatchMetricWriter(writer, env.experiment_config.batches_per_step()),
    )
Example #18
def test_s3_lifecycle(monkeypatch: monkeypatch.MonkeyPatch,
                      prefix: Optional[str]) -> None:
    monkeypatch.setattr("boto3.client", s3.s3_client)
    env = test_util.get_dummy_env()
    conf = copy.deepcopy(default_conf)
    conf["prefix"] = prefix
    manager = tensorboard.build(env.det_cluster_id, env.det_experiment_id,
                                env.det_trial_id, conf)
    assert isinstance(manager, tensorboard.S3TensorboardManager)

    tfevents_path = "uuid-123/tensorboard/experiment/1/trial/1/events.out.tfevents.example"

    manager.sync()
    if prefix is not None:
        tfevents_path = os.path.join(
            os.path.normpath(prefix).lstrip("/"), tfevents_path)

    expected = (
        "s3_bucket",
        tfevents_path,
    )
    assert expected in manager.client.objects
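The expected key mirrors how the test itself normalizes the prefix: os.path.normpath collapses redundant separators and lstrip("/") drops a leading slash before the prefix is joined onto the S3 key. A quick stdlib-only illustration:

import os

for raw, expected in [
    ("my/prefix", "my/prefix"),
    ("/my/prefix/", "my/prefix"),  # leading slash stripped, trailing slash collapsed
    ("my//prefix", "my/prefix"),   # duplicate separators collapsed
]:
    assert os.path.normpath(raw).lstrip("/") == expected

# A prefix containing ".." collapses to a different path entirely, which is
# presumably why Example #4 expects build() to reject "my/invalid/../prefix".
assert os.path.normpath("my/invalid/../prefix") == "my/prefix"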
Example #19
def test_missing_type() -> None:
    with pytest.raises(TypeError, match="Missing 'type' parameter"):
        tensorboard.build(test_util.get_dummy_env(), {})
Example #20
def test_s3_build_missing_param() -> None:
    conf = copy.deepcopy(default_conf)
    del conf["bucket"]

    with pytest.raises(KeyError):
        tensorboard.build(test_util.get_dummy_env(), conf)
Example #21
def test_missing_type() -> None:
    with pytest.raises(TypeError, match="Missing 'type' parameter"):
        env = test_util.get_dummy_env()
        tensorboard.build(env.det_cluster_id, env.det_experiment_id, env.det_trial_id, {})
Example #22
def test_s3_build() -> None:
    manager = tensorboard.build(test_util.get_dummy_env(), default_conf)
    assert isinstance(manager, tensorboard.S3TensorboardManager)
Example #23
def test_illegal_type() -> None:
    checkpoint_config = {"type": 4}
    with pytest.raises(TypeError, match="must be a string"):
        tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
Example #24
def main(argv: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Determined checkpoint GC")

    parser.add_argument(
        "--version",
        action="version",
        version="Determined checkpoint GC, version {}".format(det.__version__),
    )
    parser.add_argument("--experiment-id", help="The experiment ID to run the GC job for")
    parser.add_argument(
        "--log-level",
        default=os.getenv("DET_LOG_LEVEL", "INFO"),
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Set the logging level",
    )
    parser.add_argument(
        "--storage-config",
        type=json_file_arg,
        default=os.getenv("DET_STORAGE_CONFIG", {}),
        help="Storage config (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete",
        type=json_file_arg,
        default=os.getenv("DET_DELETE", []),
        help="Checkpoints to delete (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete-tensorboards",
        action="store_true",
        default=os.getenv("DET_DELETE_TENSORBOARDS", False),
        help="Delete Tensorboards from storage",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=("DET_DRY_RUN" in os.environ),
        help="Do not actually delete any checkpoints from storage",
    )

    args = parser.parse_args(argv)

    logging.basicConfig(
        level=args.log_level, format="%(asctime)s:%(module)s:%(levelname)s: %(message)s"
    )

    logging.info("Determined checkpoint GC, version {}".format(det.__version__))

    storage_config = args.storage_config
    logging.info("Using checkpoint storage: {}".format(storage_config))

    manager = storage.build(storage_config, container_path=constants.SHARED_FS_CONTAINER_PATH)

    storage_ids = [c["uuid"] for c in args.delete["checkpoints"]]

    delete_checkpoints(manager, storage_ids, dry_run=args.dry_run)

    if args.delete_tensorboards:
        tb_manager = tensorboard.build(
            os.environ["DET_CLUSTER_ID"],
            args.experiment_id,
            None,
            storage_config,
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        delete_tensorboards(tb_manager, dry_run=args.dry_run)
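main relies on a json_file_arg argparse type that is defined elsewhere in the original module. Assuming it simply parses the JSON file named by the argument, a minimal sketch could be (note that argparse also runs string defaults through type=, so DET_STORAGE_CONFIG and DET_DELETE may name files too):

import json
from typing import Any

def json_file_arg(val: str) -> Any:
    # Hypothetical: treat the argument as a path to a JSON file and return
    # the parsed contents.
    with open(val) as f:
        return json.load(f)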
Example #25
def init(
    *,
    distributed: Optional[core.DistributedContext] = None,
    # TODO: figure out a better way to deal with checkpointing in the local training case.
    storage_manager: Optional[storage.StorageManager] = None,
    preempt_mode: core.PreemptMode = core.PreemptMode.WorkersAskChief,
    tensorboard_mode: core.TensorboardMode = core.TensorboardMode.AUTO,
) -> Context:
    """
    ``core.init()`` builds a :class:`core.Context <determined.core.Context>` for use with the Core
    API.

    Always use ``with core.init() as context`` instead of instantiating a ``core.Context`` directly.
    Certain components of the Core API may be configured by passing arguments to ``core.init()``.
    The only argument that is required is a ``DistributedContext``, and even that is only
    required for multi-slot tasks.

    All of your training must occur within the ``with core.init() as core_context`` block, as
    there are resources necessary for training which start in the ``core.Context``'s ``__enter__``
    method and must be cleaned up in its ``__exit__()`` method.

    Arguments:
        distributed (``core.DistributedContext``, optional): Passing a ``DistributedContext`` is
            required for multi-slot training, but unnecessary for single-slot training.  Defaults to
            ``None``.
        preempt_mode (``core.PreemptMode``, optional): Configure the calling pattern for the
            ``core_context.preempt.should_preempt()`` method.  See
            :class:`~determined.core.PreemptMode` for more detail.  Defaults to ``WorkersAskChief``.
        storage_manager: Internal use only.
        tensorboard_mode (``core.TensorboardMode``, optional): Define how Tensorboard
            metrics and profiling data are retained. See
            :class:`~determined.core.TensorboardMode` for more detail. Defaults to ``AUTO``.
    """
    info = det.get_cluster_info()
    if info is None:
        return _dummy_init(distributed=distributed, storage_manager=storage_manager)

    # We are on the cluster.
    cert = certs.default_load(info.master_url)
    session = Session(info.master_url, None, None, cert, max_retries=get_max_retries_config())

    if distributed is None:
        if len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError("you must provide a valid DistributedContext for a multi-slot task")

    distributed = distributed or core.DummyDistributedContext()

    preempt = core.PreemptContext(session, info.allocation_id, distributed, preempt_mode)

    # At present, we only support tensorboards in Trial tasks.
    tbd_writer = None

    train = None
    searcher = None

    if info.task_type == "TRIAL":
        # Prepare the tensorboard hooks.
        tensorboard_manager = tensorboard.build(
            info.cluster_id,
            str(info.trial.experiment_id),
            str(info.trial.trial_id),
            info.trial._config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        if tensorboard_mode == core.TensorboardMode.AUTO:
            tbd_writer = tensorboard.get_metric_writer()

        train = core.TrainContext(
            session,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.trial.experiment_id,
            distributed,
            tensorboard_mode,
            tensorboard_manager,
            tbd_writer,
        )
        units = core._parse_searcher_units(info.trial._config)
        searcher = core.SearcherContext(
            session,
            distributed,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.allocation_id,
            units,
        )

        if storage_manager is None:
            storage_manager = storage.build(
                info.trial._config["checkpoint_storage"],
                container_path=constants.SHARED_FS_CONTAINER_PATH,
            )

        checkpoint = core.CheckpointContext(
            distributed,
            storage_manager,
            session,
            info.task_id,
            info.allocation_id,
            tensorboard_mode,
            tensorboard_manager,
        )

    else:
        # TODO: support checkpointing for non-trial tasks.
        if storage_manager is None:
            base_path = appdirs.user_data_dir("determined")
            logger.info("no storage_manager provided; storing checkpoints in {base_path}")
            storage_manager = storage.SharedFSStorageManager(base_path)
        checkpoint = core.DummyCheckpointContext(distributed, storage_manager)

    _install_stacktrace_on_sigusr1()

    return Context(
        distributed=distributed,
        checkpoint=checkpoint,
        preempt=preempt,
        train=train,
        searcher=searcher,
    )
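Per the docstring, core.init() is meant to be used as a context manager so that __enter__/__exit__ can manage the training resources. A minimal usage sketch, with the training loop itself elided:

from determined import core

if __name__ == "__main__":
    # All training happens inside the with-block; resources started in
    # Context.__enter__ are cleaned up in Context.__exit__.
    with core.init() as core_context:
        for _ in range(100):
            ...  # one training step (elided)
            if core_context.preempt.should_preempt():
                break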