def write_to_disk(path: Path):
    # Get or create checkpoint dir.
    path.parent.mkdir(parents=True, exist_ok=True)
    # Write checkpoint to disk.
    with path.open("wb") as f:
        cloudpickle.dump(checkpoint, f)
        logger.debug(f"Checkpoint successfully written to: {path}")
def _to_directory(self, path: str) -> None:
    if self._data_dict or self._obj_ref:
        # This is an object ref or dict
        data_dict = self.to_dict()
        if _FS_CHECKPOINT_KEY in data_dict:
            # This used to be a true fs checkpoint, so restore
            _unpack(data_dict[_FS_CHECKPOINT_KEY], path)
        else:
            # This is a dict checkpoint. Dump data into checkpoint.pkl
            checkpoint_data_path = os.path.join(path, _DICT_CHECKPOINT_FILE_NAME)
            with open(checkpoint_data_path, "wb") as f:
                pickle.dump(data_dict, f)
    else:
        # This is either a local fs, remote node fs, or external fs
        local_path = self._local_path
        external_path = _get_external_path(self._uri)
        if local_path:
            if local_path != path:
                # If this exists on the local path, just copy over
                if path and os.path.exists(path):
                    shutil.rmtree(path)
                shutil.copytree(local_path, path)
        elif external_path:
            # If this exists on external storage (e.g. cloud), download
            download_from_uri(uri=external_path, local_path=path, filelock=False)
        else:
            raise RuntimeError(
                f"No valid location found for checkpoint {self}: {self._uri}"
            )
def process_checkpoint(checkpoint, parent_dir, trainable_state):
    saved_as_dict = False
    if isinstance(checkpoint, string_types):
        if not checkpoint.startswith(parent_dir):
            raise ValueError(
                "The returned checkpoint path must be within the "
                "given checkpoint dir {}: {}".format(parent_dir, checkpoint))
        checkpoint_path = checkpoint
        if os.path.isdir(checkpoint_path):
            # Add trailing slash to prevent tune metadata from
            # being written outside the directory.
            checkpoint_path = os.path.join(checkpoint_path, "")
    elif isinstance(checkpoint, dict):
        saved_as_dict = True
        checkpoint_path = os.path.join(parent_dir, "checkpoint")
        with open(checkpoint_path, "wb") as f:
            pickle.dump(checkpoint, f)
    else:
        raise ValueError("Returned unexpected type {}. "
                         "Expected str or dict.".format(type(checkpoint)))
    with open(checkpoint_path + ".tune_metadata", "wb") as f:
        trainable_state["saved_as_dict"] = saved_as_dict
        pickle.dump(trainable_state, f)
    return checkpoint_path
def _postprocess_checkpoint(self, checkpoint_path: str):
    preprocessor = self._merged_config.get("preprocessor", None)
    if not checkpoint_path or preprocessor is None:
        return
    with open(os.path.join(checkpoint_path, PREPROCESSOR_KEY), "wb") as f:
        cpickle.dump(preprocessor, f)
def __init__(
    self,
    restore_path: Optional[str] = None,
    trainable: Optional[
        Union[str, Callable, Type[Trainable], BaseTrainer]
    ] = None,
    param_space: Optional[Dict[str, Any]] = None,
    tune_config: Optional[TuneConfig] = None,
    run_config: Optional[RunConfig] = None,
    _tuner_kwargs: Optional[Dict] = None,
):
    # Restored from Tuner checkpoint.
    if restore_path:
        trainable_ckpt = os.path.join(restore_path, _TRAINABLE_PKL)
        with open(trainable_ckpt, "rb") as fp:
            trainable = pickle.load(fp)

        tuner_ckpt = os.path.join(restore_path, _TUNER_PKL)
        with open(tuner_ckpt, "rb") as fp:
            tuner = pickle.load(fp)
            self.__dict__.update(tuner.__dict__)

        self._is_restored = True
        self._trainable = trainable
        self._experiment_checkpoint_dir = restore_path
        return

    # Start fresh.
    if not trainable:
        raise TuneError("You need to provide a trainable to tune.")

    # If no run config was passed to Tuner directly, use the one from the
    # Trainer, if available.
    if not run_config and isinstance(trainable, BaseTrainer):
        run_config = trainable.run_config

    self._is_restored = False
    self._trainable = trainable
    self._tune_config = tune_config or TuneConfig()
    self._run_config = run_config or RunConfig()
    self._tuner_kwargs = copy.deepcopy(_tuner_kwargs) or {}
    self._experiment_checkpoint_dir = self._setup_create_experiment_checkpoint_dir(
        self._run_config
    )

    # Not used for restored Tuner.
    self._param_space = param_space or {}

    # This needs to happen before `tune.run()` is kicked in.
    # This is because currently tune does not exit gracefully if
    # run in ray client mode - if a crash happens, it just exits immediately
    # without allowing for checkpointing the tuner and trainable.
    # Thus this has to happen before tune.run() so that we can have something
    # to restore from.
    tuner_ckpt = os.path.join(self._experiment_checkpoint_dir, _TUNER_PKL)
    with open(tuner_ckpt, "wb") as fp:
        pickle.dump(self, fp)

    trainable_ckpt = os.path.join(self._experiment_checkpoint_dir, _TRAINABLE_PKL)
    with open(trainable_ckpt, "wb") as fp:
        pickle.dump(self._trainable, fp)
def save_preprocessor_to_dir(
    preprocessor: "Preprocessor",
    parent_dir: Union[os.PathLike, str],
) -> None:
    """Save a preprocessor to a file in ``parent_dir``."""
    parent_dir = Path(parent_dir)
    with open(parent_dir.joinpath(PREPROCESSOR_KEY), "wb") as f:
        cpickle.dump(preprocessor, f)
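A minimal usage sketch for `save_preprocessor_to_dir` above. The function simply pickles whatever object it is given under the `PREPROCESSOR_KEY` file name, so a trivial picklable stand-in (hypothetical, not a real Ray preprocessor) is enough to illustrate it; `PREPROCESSOR_KEY` is assumed to be importable from the same module.

import os
import tempfile


class _StandInPreprocessor:
    """Hypothetical placeholder standing in for a fitted Preprocessor."""

    fitted = True


with tempfile.TemporaryDirectory() as tmpdir:
    save_preprocessor_to_dir(_StandInPreprocessor(), tmpdir)
    # The preprocessor is pickled to <tmpdir>/<PREPROCESSOR_KEY>.
    assert os.path.exists(os.path.join(tmpdir, PREPROCESSOR_KEY))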
def update_config(self, config):
    self.config = config
    config_out = os.path.join(self.logdir, "params.json")
    with open(config_out, "w") as f:
        json.dump(self.config, f, cls=_SafeFallbackEncoder)
    config_pkl = os.path.join(self.logdir, "params.pkl")
    with open(config_pkl, "wb") as f:
        cloudpickle.dump(self.config, f)
def update_config(self, config: Dict):
    self.config = config
    config_out = os.path.join(self.logdir, EXPR_PARAM_FILE)
    with open(config_out, "w") as f:
        json.dump(
            self.config, f, indent=2, sort_keys=True, cls=SafeFallbackEncoder
        )
    config_pkl = os.path.join(self.logdir, EXPR_PARAM_PICKLE_FILE)
    with open(config_pkl, "wb") as f:
        cloudpickle.dump(self.config, f)
def save(self, checkpoint_path: str):
    if self._random_state_seed is not None:
        numpy_random_state = np.random.get_state()
    else:
        numpy_random_state = None
    save_object = self.__dict__
    save_object["_random_state_seed_to_set"] = numpy_random_state
    with open(checkpoint_path, "wb") as outputFile:
        cloudpickle.dump(save_object, outputFile)
def save_checkpoint(self, tmp_checkpoint_dir: str = ""): checkpoint_path = super().save_checkpoint() parent_dir = TrainableUtil.find_checkpoint_dir(checkpoint_path) preprocessor = self._merged_config.get("preprocessor", None) if parent_dir and preprocessor: with open(os.path.join(parent_dir, PREPROCESSOR_KEY), "wb") as f: cpickle.dump(preprocessor, f) return checkpoint_path
def write_checkpoint(self, checkpoint: Dict):
    self.add_tune_checkpoint_id(checkpoint)
    # If inside a Tune Trainable, then checkpoint with Tune.
    with tune.checkpoint_dir(step=self._latest_checkpoint_id) as checkpoint_dir:
        path = Path(checkpoint_dir)
        # Use a standard file name so that we know which file to load
        # the checkpoint from.
        file_path = path.joinpath(TUNE_CHECKPOINT_FILE_NAME)
        with file_path.open("wb") as f:
            cloudpickle.dump(checkpoint, f)
def _atomic_save(state: Dict, checkpoint_dir: str, file_name: str):
    """Atomically saves the object to the checkpoint directory.

    This is automatically used by tune.run during a Tune job.
    """
    tmp_search_ckpt_path = os.path.join(checkpoint_dir, ".tmp_search_generator_ckpt")
    with open(tmp_search_ckpt_path, "wb") as f:
        cloudpickle.dump(state, f)
    os.rename(tmp_search_ckpt_path, os.path.join(checkpoint_dir, file_name))
def test_predict_no_preprocessor():
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)

        checkpoint = Checkpoint.from_directory(tmpdir)
        predictor = SklearnPredictor.from_checkpoint(checkpoint)

        data_batch = np.array([[1, 2], [3, 4], [5, 6]])
        predictions = predictor.predict(data_batch)

        assert len(predictions) == 3
def __exit__(self, type, value, traceback):
    if self._shelf:
        # Close the shelf file, and store the number of episodes for ease
        self._shelf["num_episodes"] = self._num_episodes
        self._shelf.close()
    elif self._outfile and not self._use_shelve:
        # Dump everything as one big pickle:
        cloudpickle.dump(self._rollouts, open(self._outfile, "wb"))
    if self._update_file:
        # Remove the temp progress file:
        self._get_tmp_progress_filename().unlink()
        self._update_file = None
def update_config(self, config):
    self.config = config
    config_out = os.path.join(self.logdir, "params.json")
    with open(config_out, "w") as f:
        json.dump(
            self.config,
            f,
            indent=2,
            sort_keys=True,
            cls=tune_logger._SafeFallbackEncoder,
        )
    config_pkl = os.path.join(self.logdir, "params.pkl")
    with open(config_pkl, "wb") as f:
        cloudpickle.dump(self.config, f)
def write_checkpoint(self, checkpoint: Dict):
    # Store the checkpoint_id in the file so that the Tune trial can be
    # resumed after failure or cancellation.
    checkpoint[TUNE_CHECKPOINT_ID] = self._latest_checkpoint_id
    # If inside a Tune Trainable, then checkpoint with Tune.
    with tune.checkpoint_dir(step=self._latest_checkpoint_id) as checkpoint_dir:
        path = Path(checkpoint_dir)
        # Use a standard file name so that we know which file to load
        # the checkpoint from.
        file_path = path.joinpath(TUNE_CHECKPOINT_FILE_NAME)
        with file_path.open("wb") as f:
            cloudpickle.dump(checkpoint, f)
def _init(self):
    config_out = os.path.join(self.logdir, "params.json")
    with open(config_out, "w") as f:
        json.dump(
            self.config, f, indent=2, sort_keys=True, cls=_SafeFallbackEncoder
        )
    config_pkl = os.path.join(self.logdir, "params.pkl")
    with open(config_pkl, "wb") as f:
        cloudpickle.dump(self.config, f)
    local_file = os.path.join(self.logdir, "result.json")
    self.local_out = open(local_file, "a")
def write_checkpoint(self, checkpoint: Dict):
    """Writes checkpoint to disk."""
    if self._checkpoint_strategy.num_to_keep == 0:
        # Checkpoints should not be persisted to disk.
        return

    # TODO(matt): Implement additional checkpoint strategy functionality.
    # Get or create checkpoint dir.
    self.latest_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    # Write checkpoint to disk.
    with self.latest_checkpoint_path.open("wb") as f:
        cloudpickle.dump(checkpoint, f)
        logger.debug(
            f"Checkpoint successfully written to: {self.latest_checkpoint_path}"
        )
def _init(self):
    config_out = os.path.join(self.logdir, "params.json")
    with open(config_out, "w") as f:
        json.dump(
            self.config, f, indent=2, sort_keys=True, cls=_SafeFallbackEncoder
        )
    config_pkl = os.path.join(self.logdir, "params.pkl")
    with open(config_pkl, "wb") as f:
        cloudpickle.dump(self.config, f)
    local_file = os.path.join(self.logdir, "result.json")
    self.local_out = open(local_file, "a")
def test_batch_prediction_with_set_cpus(ray_start_4_cpus):
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)

        checkpoint = Checkpoint.from_directory(tmpdir)
        batch_predictor = BatchPredictor.from_checkpoint(checkpoint, SklearnPredictor)

        test_dataset = ray.data.from_pandas(
            pd.DataFrame(dummy_data, columns=["A", "B"])
        )
        batch_predictor.predict(
            test_dataset, num_cpus_per_worker=2, num_estimator_cpus=2
        )
def write_error_log(self, exc: Optional[Union[TuneError, RayTaskError]] = None):
    if exc and self.logdir:
        self.num_failures += 1
        self.error_file = os.path.join(self.logdir, "error.txt")
        if exc and isinstance(exc, RayTaskError):
            # Piping through the actual error to result grid.
            self.pickled_error_file = os.path.join(self.logdir, "error.pkl")
            with open(self.pickled_error_file, "wb") as f:
                cloudpickle.dump(exc, f)
        with open(self.error_file, "a+") as f:
            f.write(
                "Failure # {} (occurred at {})\n".format(
                    self.num_failures, date_str()
                )
            )
            f.write(str(exc) + "\n")
    self.invalidate_json_state()
def to_directory(self, path: Optional[str] = None) -> str:
    """Write checkpoint data to directory.

    Args:
        path (str): Target directory to restore data in.

    Returns:
        str: Directory containing checkpoint data.
    """
    path = path if path is not None else _temporary_checkpoint_dir()

    os.makedirs(path, exist_ok=True)
    # Drop marker
    open(os.path.join(path, ".is_checkpoint"), "a").close()

    if self._data_dict or self._obj_ref:
        # This is an object ref or dict
        data_dict = self.to_dict()
        if _FS_CHECKPOINT_KEY in data_dict:
            # This used to be a true fs checkpoint, so restore
            _unpack(data_dict[_FS_CHECKPOINT_KEY], path)
        else:
            # This is a dict checkpoint. Dump data into checkpoint.pkl
            checkpoint_data_path = os.path.join(path, _DICT_CHECKPOINT_FILE_NAME)
            with open(checkpoint_data_path, "wb") as f:
                pickle.dump(data_dict, f)
    else:
        # This is either a local fs, remote node fs, or external fs
        local_path = self._local_path
        external_path = _get_external_path(self._uri)
        if local_path:
            if local_path != path:
                # If this exists on the local path, just copy over
                if path and os.path.exists(path):
                    shutil.rmtree(path)
                shutil.copytree(local_path, path)
        elif external_path:
            # If this exists on external storage (e.g. cloud), download
            download_from_bucket(bucket=external_path, local_path=path)
        else:
            raise RuntimeError(
                f"No valid location found for checkpoint {self}: {self._uri}"
            )

    return path
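A short usage sketch for `to_directory`, assuming the Ray AIR `Checkpoint` API referenced throughout these snippets (`Checkpoint.from_dict` and `Checkpoint.from_directory`); the import path and the payload dict are illustrative and may differ between Ray versions.

import os
import tempfile

from ray.air.checkpoint import Checkpoint  # assumed import path

checkpoint = Checkpoint.from_dict({"model_weights": [0.1, 0.2, 0.3]})
with tempfile.TemporaryDirectory() as tmpdir:
    out_dir = checkpoint.to_directory(tmpdir)
    # A dict checkpoint is dumped to a pickle file inside the directory,
    # next to the ".is_checkpoint" marker dropped by to_directory.
    print(sorted(os.listdir(out_dir)))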
def save_checkpoint(self, checkpoint_dir: str):
    checkpoint_path = super(AIRRLTrainer, self).save_checkpoint(checkpoint_dir)

    trainer_class_path = os.path.join(checkpoint_dir, RL_TRAINER_CLASS_FILE)
    with open(trainer_class_path, "wb") as fp:
        cpickle.dump(self.__class__, fp)

    config_path = os.path.join(checkpoint_dir, RL_CONFIG_FILE)
    with open(config_path, "wb") as fp:
        cpickle.dump(self.config, fp)

    preprocessor = self._merged_config.get("preprocessor", None)
    if preprocessor:
        save_preprocessor_to_dir(preprocessor, checkpoint_dir)

    return checkpoint_path
def _to_directory(self, path: str) -> None:
    if self._data_dict or self._obj_ref:
        # This is an object ref or dict
        data_dict = self.to_dict()

        if _FS_CHECKPOINT_KEY in data_dict:
            for key in data_dict.keys():
                if key == _FS_CHECKPOINT_KEY:
                    continue
                metadata_path = os.path.join(
                    path, f"{key}{_METADATA_CHECKPOINT_SUFFIX}"
                )
                with open(metadata_path, "wb") as f:
                    pickle.dump(data_dict[key], f)

            # This used to be a true fs checkpoint, so restore
            _unpack(data_dict[_FS_CHECKPOINT_KEY], path)
        else:
            # This is a dict checkpoint.
            # First, restore any additional files
            additional_files = data_dict.pop(
                _DICT_CHECKPOINT_ADDITIONAL_FILE_KEY, {}
            )
            for file, content in additional_files.items():
                _unpack(stream=content, path=os.path.join(path, file))

            # Then dump data into checkpoint.pkl
            checkpoint_data_path = os.path.join(path, _DICT_CHECKPOINT_FILE_NAME)
            with open(checkpoint_data_path, "wb") as f:
                pickle.dump(data_dict, f)
    else:
        # This is either a local fs, remote node fs, or external fs
        local_path = self._local_path
        external_path = _get_external_path(self._uri)
        if local_path:
            if local_path != path:
                # If this exists on the local path, just copy over
                if path and os.path.exists(path):
                    shutil.rmtree(path)
                shutil.copytree(local_path, path)
        elif external_path:
            # If this exists on external storage (e.g. cloud), download
            download_from_uri(uri=external_path, local_path=path, filelock=False)
        else:
            raise RuntimeError(
                f"No valid location found for checkpoint {self}: {self._uri}"
            )
def _save(self, checkpoint_dir):
    """Creates a checkpoint in ``checkpoint_dir``, creating a pickle file.

    Args:
        checkpoint_dir (str): file path to store pickle checkpoint.

    Returns:
        path (str): file path to the pickled checkpoint file.
    """
    path = os.path.join(checkpoint_dir, "checkpoint")
    try:
        with open(path, "wb") as f:
            cpickle.dump(self.estimator_list, f)
    except Exception:
        warnings.warn("Unable to save estimator.", category=RuntimeWarning)
    return path
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = SklearnPredictor(estimator=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = SklearnPredictor.from_checkpoint(checkpoint)

    assert np.allclose(
        checkpoint_predictor.estimator.feature_importances_,
        predictor.estimator.feature_importances_,
    )
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
def atomic_save(state: Dict, checkpoint_dir: str, file_name: str, tmp_file_name: str):
    """Atomically saves the state object to the checkpoint directory.

    This is automatically used by tune.run during a Tune job.

    Args:
        state (dict): Object state to be serialized.
        checkpoint_dir (str): Directory location for the checkpoint.
        file_name (str): Final name of file.
        tmp_file_name (str): Temporary name of file.
    """
    import ray.cloudpickle as cloudpickle

    tmp_search_ckpt_path = os.path.join(checkpoint_dir, tmp_file_name)
    with open(tmp_search_ckpt_path, "wb") as f:
        cloudpickle.dump(state, f)

    os.replace(tmp_search_ckpt_path, os.path.join(checkpoint_dir, file_name))
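A small usage sketch for `atomic_save`, based only on the signature and docstring above; the state dict and file names are arbitrary examples. Since the state is written with cloudpickle, it can be read back with the standard `pickle` module.

import os
import pickle
import tempfile

with tempfile.TemporaryDirectory() as checkpoint_dir:
    atomic_save(
        state={"iteration": 10, "rng_seed": 42},
        checkpoint_dir=checkpoint_dir,
        file_name="searcher-state.pkl",
        tmp_file_name=".tmp_searcher_state",
    )
    # The final file only appears after the temporary file has been fully
    # written and renamed, so readers never observe a partial checkpoint.
    with open(os.path.join(checkpoint_dir, "searcher-state.pkl"), "rb") as f:
        print(pickle.load(f))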
def process_checkpoint(
    checkpoint: Union[Dict, str], parent_dir: str, trainable_state: Dict
) -> str:
    """Creates checkpoint file structure and writes metadata under `parent_dir`.

    The file structure could either look like:

    - checkpoint_00000 (returned path)
    -- .is_checkpoint
    -- .tune_metadata
    -- xxx.pkl (or whatever user specifies in their Trainable)

    Or,

    - checkpoint_00000
    -- .is_checkpoint
    -- checkpoint (returned path)
    -- checkpoint.tune_metadata
    """
    saved_as_dict = False
    if isinstance(checkpoint, string_types):
        if not checkpoint.startswith(parent_dir):
            raise ValueError(
                "The returned checkpoint path must be within the "
                "given checkpoint dir {}: {}".format(parent_dir, checkpoint)
            )
        checkpoint_path = checkpoint
        if os.path.isdir(checkpoint_path):
            # Add trailing slash to prevent tune metadata from
            # being written outside the directory.
            checkpoint_path = os.path.join(checkpoint_path, "")
    elif isinstance(checkpoint, dict):
        saved_as_dict = True
        checkpoint_path = os.path.join(parent_dir, "checkpoint")
        with open(checkpoint_path, "wb") as f:
            pickle.dump(checkpoint, f)
    else:
        raise ValueError(
            "Returned unexpected type {}. "
            "Expected str or dict.".format(type(checkpoint))
        )

    with open(checkpoint_path + ".tune_metadata", "wb") as f:
        trainable_state["saved_as_dict"] = saved_as_dict
        pickle.dump(trainable_state, f)
    return checkpoint_path
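A hedged sketch of the dict branch of `process_checkpoint` above: the checkpoint and `trainable_state` contents are made up for illustration, and the module-level imports (`pickle`, `string_types`) are assumed to be in place.

import os
import tempfile

with tempfile.TemporaryDirectory() as parent_dir:
    result_path = process_checkpoint(
        checkpoint={"weights": [1.0, 2.0]},  # dict checkpoints are pickled
        parent_dir=parent_dir,
        trainable_state={"iteration": 3},
    )
    # Returns <parent_dir>/checkpoint and writes checkpoint.tune_metadata
    # alongside it with "saved_as_dict" set to True.
    print(result_path, sorted(os.listdir(parent_dir)))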
def from_estimator(
    cls,
    estimator: BaseEstimator,
    *,
    path: os.PathLike,
    preprocessor: Optional["Preprocessor"] = None,
) -> "SklearnCheckpoint":
    """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores an sklearn
    ``Estimator``.

    Args:
        estimator: The ``Estimator`` to store in the checkpoint.
        path: The directory where the checkpoint will be stored.
        preprocessor: A fitted preprocessor to be applied before inference.

    Returns:
        An :py:class:`SklearnCheckpoint` containing the specified ``Estimator``.

    Examples:
        >>> from ray.train.sklearn import SklearnCheckpoint
        >>> from sklearn.ensemble import RandomForestClassifier
        >>>
        >>> estimator = RandomForestClassifier()
        >>> checkpoint = SklearnCheckpoint.from_estimator(estimator, path=".")

        You can use a :py:class:`SklearnCheckpoint` to create an
        :py:class:`~ray.train.sklearn.SklearnPredictor` and perform inference.

        >>> from ray.train.sklearn import SklearnPredictor
        >>>
        >>> predictor = SklearnPredictor.from_checkpoint(checkpoint)
    """
    with open(os.path.join(path, MODEL_KEY), "wb") as f:
        cpickle.dump(estimator, f)

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = cls.from_directory(path)
    return checkpoint
def _save(self, checkpoint_dir):
    """Creates a checkpoint in ``checkpoint_dir``, creating a pickle file.

    Args:
        checkpoint_dir (str): file path to store pickle checkpoint.

    Returns:
        path (str): file path to the pickled checkpoint file.
    """
    path = os.path.join(checkpoint_dir, "checkpoint")
    with open(path, "wb") as f:
        try:
            cpickle.dump(self.estimator, f)
            self.pickled = True
        except PicklingError:
            self.pickled = False
            warnings.warn(
                "{} could not be pickled. "
                "Restoring estimators may run into issues.".format(self.estimator)
            )
    return path
def to_air_checkpoint(
    path: str,
    estimator: BaseEstimator,
    preprocessor: Optional["Preprocessor"] = None,
) -> Checkpoint:
    """Convert a pretrained model to an AIR checkpoint for serving or inference.

    Args:
        path: The directory path where model and preprocessor steps are stored to.
        estimator: A pretrained model.
        preprocessor: A fitted preprocessor. The preprocessing logic will
            be applied to serve/inference.

    Returns:
        A Ray AIR checkpoint.
    """
    with open(os.path.join(path, MODEL_KEY), "wb") as f:
        cpickle.dump(estimator, f)

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = Checkpoint.from_directory(path)
    return checkpoint
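A usage sketch for `to_air_checkpoint`, assuming a fitted scikit-learn estimator; the toy training data is invented for illustration, and the resulting checkpoint is only valid while the backing directory exists.

import tempfile

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.array([[0, 0], [1, 1], [0, 1], [1, 0]])
y = np.array([0, 1, 0, 1])
estimator = RandomForestClassifier(n_estimators=5).fit(X, y)

with tempfile.TemporaryDirectory() as tmpdir:
    checkpoint = to_air_checkpoint(path=tmpdir, estimator=estimator)
    # The directory now holds the pickled model under MODEL_KEY and can be
    # handed to SklearnPredictor.from_checkpoint, as in the tests above.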