Example #1
def test_unexpected_params() -> None:
    config = {
        "type": "noop",
        "base_path": "test",
        "require": "value",
        "optional": "test"
    }
    with pytest.raises(TypeError,
                       match="unexpected keyword argument "
                       "'require'"):
        storage.build(config)
Example #2
def to_delete(request: Any, config: Dict[str, Any]) -> List[Dict[str, Any]]:
    manager = storage.build(config["checkpoint_storage"])
    metadata = [manager.store(StorableFixture()) for _ in range(request.param)]

    host_path = config["checkpoint_storage"]["host_path"]
    assert len(os.listdir(host_path)) == request.param
    return [simplejson.loads(util.json_encode(m)) for m in metadata]
Example #3
def download(master: str, trial_id: int, step_id: int, output_dir: str) -> None:
    q = api.GraphQLQuery(master)

    step = q.op.steps_by_pk(trial_id=trial_id, id=step_id)
    step.checkpoint.labels()
    step.checkpoint.resources()
    step.checkpoint.uuid()
    step.trial.experiment.config(path="checkpoint_storage")
    step.trial.experiment_id()

    resp = q.send()

    step = resp.steps_by_pk
    if not step:
        raise ValueError("Trial {} step {} not found".format(trial_id, step_id))

    if not step.checkpoint:
        raise ValueError("Trial {} step {} has no checkpoint".format(trial_id, step_id))

    storage_config = step.trial.experiment.config
    manager = storage.build(storage_config)
    if not (
        isinstance(manager, storage.S3StorageManager)
        or isinstance(manager, storage.GCSStorageManager)
    ):
        raise AssertionError(
            "Downloading from S3 or GCS requires the experiment to be configured with "
            "S3 or GCS checkpointing, {} found instead".format(storage_config["type"])
        )
    metadata = storage.StorageMetadata.from_json(step.checkpoint.__to_json_value__())
    manager.download(metadata, output_dir)
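For reference, a hedged sketch of invoking the download() helper above; the master address, trial/step IDs, and output directory are illustrative placeholders, not values taken from any example.

download(
    master="http://localhost:8080",
    trial_id=1,
    step_id=100,
    output_dir="./trial-1-step-100",
)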
Example #4
def build_and_run_training_pipeline(env: det.EnvContext) -> None:

    # Create the socket manager. The socket manager will connect to the master and read messages
    # until it receives the rendezvous_info.
    #
    # TODO(ryan): Pull profiler hooks out of SocketManager and into their own layer.
    with layers.SocketManager(env) as socket_mgr:

        # Create the storage manager. This is used to download the initial checkpoint here in
        # build_training_pipeline and also used by the workload manager to create and store
        # checkpoints during training.
        storage_mgr = storage.build(env.experiment_config["checkpoint_storage"])

        [tensorboard_mgr, tensorboard_writer] = load.prepare_tensorboard(env)

        # Create the workload manager. The workload manager will receive workloads from the
        # socket_mgr, and augment them with some additional arguments. Additionally, the
        # workload manager is responsible for some generic workload hooks for things like timing
        # workloads, preparing checkpoints, and uploading completed checkpoints.  Finally, the
        # workload manager does some sanity checks on response messages that originate from the
        # trial.
        #
        # TODO(ryan): Refactor WorkloadManager into separate layers that do each separate task.
        workload_mgr = layers.build_workload_manager(
            env,
            iter(socket_mgr),
            socket_mgr.get_rendezvous_info(),
            storage_mgr,
            tensorboard_mgr,
            tensorboard_writer,
        )

        hvd_config = horovod.HorovodContext.from_configs(
            env.experiment_config, socket_mgr.get_rendezvous_info(), env.hparams
        )
        logging.info(f"Horovod config: {hvd_config.__dict__}.")

        # Load the checkpoint, if necessary. Any possible sinks to this pipeline will need access
        # to this checkpoint.
        with maybe_load_checkpoint(storage_mgr, env.latest_checkpoint) as load_path:

            # Horovod distributed training is done inside subprocesses.
            if hvd_config.use:
                subproc = layers.SubprocessLauncher(
                    env, iter(workload_mgr), load_path, socket_mgr.get_rendezvous_info(), hvd_config
                )
                subproc.run()
            else:
                if env.experiment_config.debug_enabled():
                    faulthandler.dump_traceback_later(30, repeat=True)

                controller = load.prepare_controller(
                    env,
                    iter(workload_mgr),
                    load_path,
                    socket_mgr.get_rendezvous_info(),
                    hvd_config,
                )
                controller.run()
Example #5
def test_setting_optional_variable() -> None:
    config = {
        "type": "noop",
        "base_path": "test",
        "required": "value",
        "optional": "test"
    }
    manager = storage.build(config)
    assert isinstance(manager, NoopStorageManager)
    assert manager.required == "value"
    assert manager.optional == "test"
Example #6
def main(argv: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Determined checkpoint GC")

    parser.add_argument(
        "--version",
        action="version",
        version="Determined checkpoint GC, version {}".format(det.__version__),
    )
    parser.add_argument(
        "--log-level",
        default=os.getenv("DET_LOG_LEVEL", "INFO"),
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Set the logging level",
    )
    parser.add_argument(
        "--experiment-config",
        type=json_file_arg,
        default=os.getenv("DET_EXPERIMENT_CONFIG", {}),
        help="Experiment config (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete",
        type=json_file_arg,
        default=os.getenv("DET_DELETE", []),
        help="Checkpoints to delete (JSON-formatted file)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=("DET_DRY_RUN" in os.environ),
        help="Do not actually delete any checkpoints from storage",
    )

    args = parser.parse_args(argv)

    logging.basicConfig(
        level=args.log_level,
        format="%(asctime)s:%(module)s:%(levelname)s: %(message)s")

    logging.info("Determined checkpoint GC, version {}".format(
        det.__version__))

    storage_config = args.experiment_config["checkpoint_storage"]
    logging.info("Using checkpoint storage: {}".format(storage_config))

    manager = storage.build(storage_config,
                            container_path=constants.SHARED_FS_CONTAINER_PATH)

    delete_checkpoints(manager,
                       args.delete["checkpoints"],
                       dry_run=args.dry_run)
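A hedged sketch of driving the GC entry point above directly. The file paths are placeholders; per the argument definitions and the delete_checkpoints() call above, --experiment-config should point to a JSON file containing a "checkpoint_storage" section and --delete to a JSON file with a top-level "checkpoints" list.

main([
    "--log-level", "DEBUG",
    "--experiment-config", "/tmp/experiment_config.json",
    "--delete", "/tmp/checkpoints_to_delete.json",
    "--dry-run",
])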
Example #7
    def download(self, path: Optional[str] = None) -> str:
        """
        Download checkpoint from the checkpoint storage location locally.

        Arguments:
            path (string, optional): Top level directory to place the
                checkpoint under. If this parameter is not set, the checkpoint will
                be downloaded to ``checkpoints/<checkpoint_uuid>`` relative to the
                current working directory.
        """
        if path is not None:
            local_ckpt_dir = pathlib.Path(path)
        else:
            local_ckpt_dir = pathlib.Path("checkpoints", self.uuid)

        # If the target directory doesn't already appear to contain a
        # checkpoint, attempt to fetch one.

        # We used MLflow's MLmodel checkpoint format in the past for
        # serializing pytorch models. We now use our own format that contains a
        # metadata.json file. We are checking for checkpoint existence by
        # looking for both checkpoint formats in the output directory.
        potential_metadata_paths = [
            local_ckpt_dir.joinpath(f) for f in ["metadata.json", "MLmodel"]
        ]
        if not any(p.exists() for p in potential_metadata_paths):
            if self.storage_config["type"] == "shared_fs":
                src_ckpt_dir = self._find_shared_fs_path()
                shutil.copytree(str(src_ckpt_dir), str(local_ckpt_dir))
            else:
                local_ckpt_dir.mkdir(parents=True, exist_ok=True)
                manager = storage.build(self.storage_config)
                if not isinstance(
                    manager, (storage.S3StorageManager, storage.GCSStorageManager)
                ):
                    raise AssertionError(
                        "Downloading from S3 or GCS requires the experiment to be configured with "
                        "S3 or GCS checkpointing, {} found instead".format(
                            self.storage_config["type"]
                        )
                    )

                metadata = storage.StorageMetadata.from_json(
                    {"uuid": self.uuid, "resources": self.resources}
                )
                manager.download(metadata, str(local_ckpt_dir))

        return str(local_ckpt_dir)
Example #8
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {
                "COMPLETED": {8, 9, 10},
                "DELETED": {1, 2, 3, 4, 5, 6, 7}
            },
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {
                "COMPLETED": {1, 2, 3, 9, 10},
                "DELETED": {4, 5, 6, 7, 8}
            },
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name,
                                                  conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(experiment_id, "COMPLETED")

        # Checkpoints are not marked as deleted until gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            checkpoints = sorted(
                (step["checkpoint"] for step in trials[0]["steps"]),
                key=operator.itemgetter("step_id"),
            )
            assert len(checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for checkpoint in checkpoints:
                by_state.setdefault(checkpoint["state"],
                                    set()).add(checkpoint["step_id"])

            if by_state == result:
                all_checkpoints.append((config, checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                if checkpoint_config["type"] == "shared_fs":
                    deleted_exception = check.CheckFailedError
                elif checkpoint_config["type"] == "s3":
                    deleted_exception = botocore.exceptions.ClientError
                else:
                    raise NotImplementedError(
                        f'unsupported storage type {checkpoint_config["type"]}'
                    )

                storage_manager = storage.build(checkpoint_config,
                                                container_path=None)
                for checkpoint in checkpoints:
                    metadata = storage.StorageMetadata.from_json(checkpoint)
                    if checkpoint["state"] == "COMPLETED":
                        with storage_manager.restore_path(metadata):
                            pass
                    elif checkpoint["state"] == "DELETED":
                        try:
                            with storage_manager.restore_path(metadata):
                                raise AssertionError("checkpoint not deleted")
                        except deleted_exception:
                            pass
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
Example #9
def test_unknown_type() -> None:
    config = {"type": "unknown"}
    with pytest.raises(TypeError, match="Unknown storage type: unknown"):
        storage.build(config)
Example #10
def test_build_with_container_path() -> None:
    config = {"type": "shared_fs", "host_path": "/host_path", "storage_path": "storage_path"}
    manager = storage.build(config)
    assert manager._base_path == "/host_path/storage_path"
    manager = storage.build(config, container_path="/container_path")
    assert manager._base_path == "/container_path/storage_path"
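The test above encodes a simple path-resolution rule: storage_path is appended to host_path unless a container_path is supplied, in which case container_path replaces host_path as the root. A hypothetical helper (resolve_base_path is not part of the library) restating that rule:

import os
from typing import Optional

def resolve_base_path(config: dict, container_path: Optional[str] = None) -> str:
    # Mirrors the assertions in test_build_with_container_path above.
    root = container_path if container_path is not None else config["host_path"]
    return os.path.join(root, config["storage_path"])

assert resolve_base_path(
    {"type": "shared_fs", "host_path": "/host_path", "storage_path": "storage_path"}
) == "/host_path/storage_path"
assert resolve_base_path(
    {"type": "shared_fs", "host_path": "/host_path", "storage_path": "storage_path"},
    container_path="/container_path",
) == "/container_path/storage_path"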
Example #11
def test_illegal_type() -> None:
    config = {"type": 4}
    with pytest.raises(CheckFailedError, match="must be a string"):
        storage.build(config)
Example #12
def test_missing_type() -> None:
    with pytest.raises(CheckFailedError, match="Missing 'type' parameter"):
        storage.build({})
Example #13
    def download(self, path: Optional[str] = None) -> str:
        """
        Download checkpoint to local storage.

        Arguments:
            path (string, optional): Top level directory to place the
                checkpoint under. If this parameter is not set, the checkpoint will
                be downloaded to ``checkpoints/<checkpoint_uuid>`` relative to the
                current working directory.
        """
        if path is not None:
            local_ckpt_dir = pathlib.Path(path)
        else:
            local_ckpt_dir = pathlib.Path("checkpoints", self.uuid)

        # Backward compatibility: we used MLflow's MLmodel checkpoint format for
        # serializing pytorch models. We now use our own format that contains a
        # metadata.json file. We are checking for checkpoint existence by
        # looking for both checkpoint formats in the output directory.
        potential_metadata_paths = [
            local_ckpt_dir.joinpath(f) for f in ["metadata.json", "MLmodel"]
        ]
        if not any(p.exists() for p in potential_metadata_paths):
            # If the target directory doesn't already appear to contain a
            # checkpoint, attempt to fetch one.
            if self.experiment_config["checkpoint_storage"]["type"] == "shared_fs":
                src_ckpt_dir = self._find_shared_fs_path()
                shutil.copytree(str(src_ckpt_dir), str(local_ckpt_dir))
            else:
                local_ckpt_dir.mkdir(parents=True, exist_ok=True)
                manager = storage.build(
                    self.experiment_config["checkpoint_storage"],
                    container_path=None,
                )
                if not isinstance(
                    manager, (storage.S3StorageManager, storage.GCSStorageManager)
                ):
                    raise AssertionError(
                        "Downloading from S3 or GCS requires the experiment to be configured with "
                        "S3 or GCS checkpointing, {} found instead".format(
                            self.experiment_config["checkpoint_storage"]["type"]
                        )
                    )

                metadata = storage.StorageMetadata.from_json(
                    {"uuid": self.uuid, "resources": self.resources}
                )
                manager.download(metadata, str(local_ckpt_dir))

        if not local_ckpt_dir.joinpath("metadata.json").exists():
            with open(local_ckpt_dir.joinpath("metadata.json"), "w") as f:
                json.dump(
                    {
                        "determined_version": self.determined_version,
                        "framework": self.framework,
                        "format": self.format,
                        "experiment_id": self.experiment_id,
                        "trial_id": self.trial_id,
                        "hparams": self.hparams,
                        "experiment_config": self.experiment_config,
                        "metadata": self.metadata,
                    },
                    f,
                    indent=2,
                )

        return str(local_ckpt_dir)
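Because the method above ends by writing metadata.json whenever it is missing, a caller can rely on that file existing in the returned directory. A minimal usage sketch; the checkpoint argument stands for any object exposing the download() method shown above:

import json
import pathlib

def summarize_checkpoint(checkpoint) -> dict:
    # download() guarantees a metadata.json in the directory it returns.
    ckpt_dir = pathlib.Path(checkpoint.download())
    with ckpt_dir.joinpath("metadata.json").open() as f:
        # Keys written above include determined_version, framework, format,
        # experiment_id, trial_id, hparams, experiment_config, and metadata.
        return json.load(f)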
Example #14
def test_missing_required_variable() -> None:
    config = {"type": "noop", "base_path": "test"}
    with pytest.raises(TypeError,
                       match="missing 1 required positional "
                       "argument: 'required'"):
        storage.build(config)
Example #15
def test_getting_manager_instance() -> None:
    config = {"type": "noop", "base_path": "test", "required": "value"}
    manager = storage.build(config)
    assert isinstance(manager, NoopStorageManager)
    assert manager.required == "value"
    assert manager.optional == "default"
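The noop tests (Examples 1, 5, 14, and 15) imply a test-only storage class whose constructor receives every config key except "type" as a keyword argument. A sketch of what such a fixture might look like; the real NoopStorageManager in the test suite, and the mechanism that registers it with storage.build, may differ:

class NoopStorageManager:
    # Signature inferred from the tests above: "base_path" and "required" are
    # mandatory, "optional" defaults to "default", and an unknown key such as
    # "require" produces the TypeError matched in Example 1.
    def __init__(self, base_path: str, required: str, optional: str = "default") -> None:
        self.base_path = base_path
        self.required = required
        self.optional = optional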
Example #16
def list(args: Namespace) -> None:
    q = api.GraphQLQuery(args.master)
    q.op.experiments_by_pk(id=args.experiment_id).config(path="checkpoint_storage")

    order_by = [
        gql.checkpoints_order_by(
            validation=gql.validations_order_by(
                metric_values=gql.validation_metrics_order_by(signed=gql.order_by.asc)
            )
        )
    ]

    limit = None
    if args.best is not None:
        if args.best < 0:
            raise AssertionError("--best must be a non-negative integer")
        limit = args.best

    checkpoints = q.op.checkpoints(
        where=gql.checkpoints_bool_exp(
            step=gql.steps_bool_exp(
                trial=gql.trials_bool_exp(
                    experiment_id=gql.Int_comparison_exp(_eq=args.experiment_id)
                )
            )
        ),
        order_by=order_by,
        limit=limit,
    )
    checkpoints.end_time()
    checkpoints.labels()
    checkpoints.resources()
    checkpoints.start_time()
    checkpoints.state()
    checkpoints.step_id()
    checkpoints.trial_id()
    checkpoints.uuid()

    checkpoints.step.validation.metric_values.raw()

    resp = q.send()
    config = resp.experiments_by_pk.config

    headers = ["Trial ID", "Step ID", "State", "Validation Metric", "UUID", "Resources", "Size"]
    values = [
        [
            c.trial_id,
            c.step_id,
            c.state,
            c.step.validation.metric_values.raw
            if c.step.validation and c.step.validation.metric_values
            else None,
            c.uuid,
            render.format_resources(c.resources),
            render.format_resource_sizes(c.resources),
        ]
        for c in resp.checkpoints
    ]

    render.tabulate_or_csv(headers, values, args.csv)

    if args.download_dir is not None:
        manager = storage.build(config)
        if not (
            isinstance(manager, storage.S3StorageManager)
            or isinstance(manager, storage.GCSStorageManager)
        ):
            print(
                "Downloading from S3 or GCS requires the experiment to be configured with "
                "S3 or GCS checkpointing, {} found instead".format(config["type"])
            )
            sys.exit(1)

        for checkpoint in resp.checkpoints:
            metadata = storage.StorageMetadata.from_json(checkpoint.__to_json_value__())
            ckpt_dir = args.download_dir.joinpath(
                "exp-{}-trial-{}-step-{}".format(
                    args.experiment_id, checkpoint.trial_id, checkpoint.step_id
                )
            )
            print("Downloading checkpoint {} to {}".format(checkpoint.uuid, ckpt_dir))
            manager.download(metadata, ckpt_dir)
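A hedged example of calling the CLI handler above without going through argparse; the attribute names are exactly those the function reads, the values are placeholders, and download_dir must be a pathlib.Path (it is used with .joinpath()) or None to skip downloading.

import pathlib
from argparse import Namespace

list(Namespace(
    master="http://localhost:8080",
    experiment_id=42,
    best=None,       # or a non-negative int to keep only the N best checkpoints
    csv=False,
    download_dir=pathlib.Path("./checkpoints"),
))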