예제 #1
0
    def _init_paths(self) -> None:
        """
        Set up the model directory used to initialize an Estimator.

        Every training process gets its own temporary directory. When no
        load path is configured the directory is left empty. Otherwise
        the RunHook callbacks are notified of the load path, the
        checkpoint is copied into the new directory, and the paths in
        the CheckpointState metadata file are updated to that location.
        """
        # The rank is appended to the directory name so that horovod
        # processes don't overwrite each other.
        if self.hvd_config.use:
            rank_suffix = str(hvd.local_rank())
        else:
            rank_suffix = str(0)

        if self.load_path is None:
            # Nothing to restore: a fresh, empty directory is enough.
            fresh_dir = tempfile.mkdtemp(suffix=rank_suffix)
            self.estimator_dir = pathlib.Path(fresh_dir)
            logging.debug(f"Estimator directory set to {self.estimator_dir}.")
            return

        # Only RunHook instances are told about the checkpoint being loaded.
        for hook in self.train_hooks:
            if not isinstance(hook, estimator.RunHook):
                continue
            hook.on_checkpoint_load(str(self.load_path))

        restored_dir = tempfile.mkdtemp(suffix=rank_suffix)
        self.estimator_dir = pathlib.Path(restored_dir)

        # mkdtemp creates the directory, so it is removed again here before
        # copytree is asked to create it as the copy destination.
        if self.estimator_dir.exists():
            shutil.rmtree(str(self.estimator_dir))

        logging.debug(
            f"Copying from {self.load_path} to {self.estimator_dir}.")
        shutil.copytree(str(self.load_path), str(self.estimator_dir))

        # Calibrate the CheckpointState metadata file to the new location.
        estimator._update_checkpoint_path_in_state_file(self.estimator_dir)
        logging.debug(f"Load path set to {self.estimator_dir}.")
예제 #2
0
    def _copy_latest_checkpoint(self, checkpoint_path: pathlib.Path) -> None:
        """
        Copy the directory holding the estimator's latest checkpoint to
        ``checkpoint_path`` and update the CheckpointState metadata file
        to the new location.

        Args:
            checkpoint_path: Destination of the copy. It may be missing
                or an existing empty directory; an empty directory is
                removed first so that the copy can recreate it.

        Raises:
            FileExistsError: Raised by ``shutil.copytree`` if
                ``checkpoint_path`` exists and is not an empty directory.
        """
        checkpoint_dir = os.path.dirname(
            self.estimator_trial_controller.estimator.latest_checkpoint())

        # shutil.copytree refuses to copy into an existing directory, even an
        # empty one, so an empty placeholder is dropped first. Anything else
        # already at the destination (non-empty directory, file, symlink) is
        # left alone and copytree still raises exactly as it did before.
        if (
            checkpoint_path.is_dir()
            and not checkpoint_path.is_symlink()
            and not any(checkpoint_path.iterdir())
        ):
            checkpoint_path.rmdir()
        shutil.copytree(checkpoint_dir, str(checkpoint_path))

        # Calibrate the CheckpointState metadata file to the new location.
        estimator._update_checkpoint_path_in_state_file(checkpoint_path)
예제 #3
0
    def _copy_latest_checkpoint(self, checkpoint_path: pathlib.Path) -> None:
        """
        Replace the directory at ``checkpoint_path`` with a copy of the
        directory holding the estimator's latest checkpoint, then update
        the CheckpointState metadata file to the new location.

        ``checkpoint_path`` is removed with ``rmdir()`` before the copy.
        """
        trial_estimator = self.estimator_trial_controller.estimator
        source_dir = os.path.dirname(trial_estimator.latest_checkpoint())

        # shutil.copytree doesn't like to copy into a directory, even an empty
        # one, so the destination directory is removed before copying.
        checkpoint_path.rmdir()
        destination = str(checkpoint_path)
        shutil.copytree(source_dir, destination)

        # Calibrate the CheckpointState metadata file to the new location.
        estimator._update_checkpoint_path_in_state_file(checkpoint_path)
예제 #4
0
    def _init_paths(self) -> None:
        """
        Set up the model directory used to initialize an Estimator.

        Every training process gets its own temporary directory. When
        there is no latest checkpoint the directory is left empty.
        Otherwise the checkpoint is restored, the RunHook callbacks are
        notified, the checkpoint is copied into the new directory, the
        paths in the CheckpointState metadata file are updated to that
        location, and saved WorkloadSequencer state is loaded if present.
        """
        # The local rank is appended to the directory name so that horovod
        # processes don't overwrite each other.
        rank_suffix = str(self.context.distributed.local_rank)

        if self.env.latest_checkpoint is None:
            # Nothing to restore: a fresh, empty directory is enough.
            fresh_dir = tempfile.mkdtemp(suffix=rank_suffix)
            self.estimator_dir = pathlib.Path(fresh_dir)
            logging.debug(f"Estimator directory set to {self.estimator_dir}.")
            return

        logging.info(
            f"Restoring trial from checkpoint {self.env.latest_checkpoint}")
        restore_ctx = self.context._core.checkpoint.restore_path(
            self.env.latest_checkpoint)
        with restore_ctx as load_path:
            # Only RunHook instances are told about the checkpoint being
            # loaded.
            for hook in self.train_hooks:
                if not isinstance(hook, estimator.RunHook):
                    continue
                hook.on_checkpoint_load(str(load_path))

            restored_dir = tempfile.mkdtemp(suffix=rank_suffix)
            self.estimator_dir = pathlib.Path(restored_dir)

            # mkdtemp creates the directory, so it is removed again here
            # before copytree is asked to create it as the copy destination.
            if self.estimator_dir.exists():
                shutil.rmtree(str(self.estimator_dir))

            logging.debug(f"Copying from {load_path} to {self.estimator_dir}.")
            shutil.copytree(str(load_path), str(self.estimator_dir))

            # Calibrate the CheckpointState metadata file to the new location.
            estimator._update_checkpoint_path_in_state_file(self.estimator_dir)
            logging.debug(f"Load path set to {self.estimator_dir}.")

            # Load WorkloadSequencer state, if a sequencer is in use and the
            # checkpoint carries a saved state file.
            sequencer_state_path = load_path / "workload_sequencer.pkl"
            if self.wlsq is not None:
                if sequencer_state_path.exists():
                    # NOTE(review): pickle.load can execute arbitrary code;
                    # this presumably relies on checkpoints being trusted.
                    with sequencer_state_path.open("rb") as state_file:
                        saved_state = pickle.load(state_file)
                        self.wlsq.load_state(saved_state)