def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    model, preprocessor = load_checkpoint(resume_from)
    assert get_num_trees(model) == 10
    assert preprocessor.is_same
    assert preprocessor.fitted_
def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(model) == 5

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    # Resume training from the moved checkpoint for another 5 rounds.
    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(model) == 10
def testLocalCheckpointSerde(self):
    # Local checkpoints are converted to bytes on serialization. Currently
    # this is a pickled dict, so we compare with a dict checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    with source_checkpoint.as_directory() as tmpdir:
        checkpoint = Checkpoint.from_directory(tmpdir)
        self._testCheckpointSerde(
            checkpoint, *source_checkpoint.get_internal_representation()
        )
def _trial_to_result(self, trial: Trial) -> Result:
    result = Result(
        checkpoint=Checkpoint.from_directory(trial.checkpoint.value)
        if trial.checkpoint.value
        else None,
        metrics=trial.last_result,
        error=self._populate_exception(trial),
    )
    return result
def _prepare_fs_checkpoint(self) -> Checkpoint:
    # Create checkpoint from fs
    checkpoint = Checkpoint.from_directory(self.checkpoint_dir)

    self.assertIsInstance(checkpoint, Checkpoint)
    self.assertIsInstance(checkpoint._local_path, str)
    self.assertEqual(checkpoint._local_path, self.checkpoint_dir)

    return checkpoint
def testLocalCheckpointSerde(self):
    # Local checkpoints are converted to bytes on serialization. Currently
    # this is a pickled dict, so we compare with a dict checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    tmpdir = source_checkpoint.to_directory()
    self.addCleanup(shutil.rmtree, tmpdir)
    checkpoint = Checkpoint.from_directory(tmpdir)
    self._testCheckpointSerde(
        checkpoint, *source_checkpoint.get_internal_representation()
    )
def get_best_checkpoint(
    self, trial: Trial, metric: Optional[str] = None, mode: Optional[str] = None
) -> Optional[Checkpoint]:
    """Gets the best persistent checkpoint of the provided trial.

    Any checkpoints with an associated metric value of ``nan`` will be
    filtered out.

    Args:
        trial: The log directory of a trial, or a trial instance.
        metric: Key of trial info to return, e.g. "mean_accuracy".
            "training_iteration" is used by default if no value was
            passed to ``self.default_metric``.
        mode: One of [min, max]. Defaults to ``self.default_mode``.

    Returns:
        :class:`Checkpoint <ray.ml.Checkpoint>` object.
    """
    metric = metric or self.default_metric or TRAINING_ITERATION
    mode = self._validate_mode(mode)

    checkpoint_paths = self.get_trial_checkpoints_paths(trial, metric)

    # Filter out nan. Sorting nan values leads to undefined behavior.
    checkpoint_paths = [
        (path, metric) for path, metric in checkpoint_paths if not is_nan(metric)
    ]

    if not checkpoint_paths:
        logger.error(f"No checkpoints have been found for trial {trial}.")
        return None

    a = -1 if mode == "max" else 1
    best_path_metrics = sorted(checkpoint_paths, key=lambda x: a * x[1])

    best_path, best_metric = best_path_metrics[0]
    cloud_path = self._parse_cloud_path(best_path)

    if self._legacy_checkpoint:
        return TrialCheckpoint(local_path=best_path, cloud_path=cloud_path)

    if cloud_path:
        # Prefer cloud path over local path for downstream processing
        return Checkpoint.from_uri(cloud_path)
    elif os.path.exists(best_path):
        return Checkpoint.from_directory(best_path)
    else:
        logger.error(
            f"No checkpoint locations for {trial} available on "
            f"this node. To avoid this, you should enable checkpoint "
            f"synchronization with the `sync_config` argument in Ray Tune. "
            f"The checkpoint may be available on a different node - "
            f"please check this location on worker nodes: {best_path}"
        )
        return None
def train_func(config, checkpoint_dir=None):
    # config already contains merged values.
    # Instantiate new Trainer in Trainable.
    trainer = trainer_cls(**config)

    if checkpoint_dir:
        trainer.resume_from_checkpoint = Checkpoint.from_directory(checkpoint_dir)

    trainer.setup()
    trainer.preprocess_datasets()
    trainer.training_loop()
def test_predict_no_preprocessor(): with tempfile.TemporaryDirectory() as tmpdir: with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f: cpickle.dump(model, f) checkpoint = Checkpoint.from_directory(tmpdir) predictor = SklearnPredictor.from_checkpoint(checkpoint) data_batch = np.array([[1, 2], [3, 4], [5, 6]]) predictions = predictor.predict(data_batch) assert len(predictions) == 3
def _trial_to_result(self, trial: Trial) -> Result:
    if trial.checkpoint.value:
        checkpoint_dir = TrainableUtil.find_checkpoint_dir(trial.checkpoint.value)
        checkpoint = Checkpoint.from_directory(checkpoint_dir)
    else:
        checkpoint = None

    result = Result(
        checkpoint=checkpoint,
        metrics=trial.last_result.copy(),
        error=self._populate_exception(trial),
    )
    return result
def test_dict_checkpoint_fs(self):
    """Test conversion from dict to FS checkpoint and back."""
    checkpoint = self._prepare_dict_checkpoint()

    # Convert into fs checkpoint
    path = checkpoint.to_directory()
    self.assertIsInstance(path, str)

    # Create from path
    checkpoint = Checkpoint.from_directory(path)
    self.assertTrue(checkpoint._local_path)

    self._assert_dict_checkpoint(checkpoint)
def test_batch_prediction_with_set_cpus(ray_start_4_cpus):
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)

        checkpoint = Checkpoint.from_directory(tmpdir)
        batch_predictor = BatchPredictor.from_checkpoint(checkpoint, SklearnPredictor)

        test_dataset = ray.data.from_pandas(
            pd.DataFrame(dummy_data, columns=["A", "B"])
        )
        batch_predictor.predict(
            test_dataset, num_cpus_per_worker=2, num_estimator_cpus=2
        )
def create_checkpoint(
    preprocessor: Optional[Preprocessor] = None, config: Optional[dict] = None
) -> Checkpoint:
    rl_trainer = RLTrainer(
        algorithm=_DummyTrainer,
        config=config or {},
        preprocessor=preprocessor,
    )
    rl_trainable_cls = rl_trainer.as_trainable()
    rl_trainable = rl_trainable_cls()

    with tempfile.TemporaryDirectory() as checkpoint_dir:
        checkpoint_file = rl_trainable.save(checkpoint_dir)
        checkpoint_path = TrainableUtil.find_checkpoint_dir(checkpoint_file)
        checkpoint_data = Checkpoint.from_directory(checkpoint_path).to_dict()

    return Checkpoint.from_dict(checkpoint_data)
def test_predict_no_preprocessor():
    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the
        # Trainers. The reason for saving the model to disk instead
        # of directly to the dict as bytes is that all callbacks
        # follow save-to-disk logic. GBDT models are small
        # enough that IO should not be an issue.
        model.save_model(os.path.join(tmpdir, MODEL_KEY))

        checkpoint = Checkpoint.from_directory(tmpdir)
        predictor = LightGBMPredictor.from_checkpoint(checkpoint)

    data_batch = np.array([[1, 2], [3, 4], [5, 6]])
    predictions = predictor.predict(data_batch)

    assert len(predictions) == 3
def _maybe_save_to_cloud(self, checkpoint_dir: str):
    # Derived classes like the FunctionRunner might call this
    if self.uses_cloud_checkpointing:
        if self.storage_client:
            # Keep for backwards compatibility, remove after deprecation
            self.storage_client.sync_up(
                checkpoint_dir, self._storage_path(checkpoint_dir)
            )
            self.storage_client.wait_or_retry()
            return

        checkpoint = Checkpoint.from_directory(checkpoint_dir)
        retry_fn(
            lambda: checkpoint.to_uri(self._storage_path(checkpoint_dir)),
            subprocess.CalledProcessError,
            num_retries=3,
            sleep_time=1,
        )
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = SklearnPredictor(estimator=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = SklearnPredictor.from_checkpoint(checkpoint)

    assert np.allclose(
        checkpoint_predictor.estimator.feature_importances_,
        predictor.estimator.feature_importances_,
    )
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = LightGBMPredictor(model=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the
        # Trainers. The reason for saving the model to disk instead
        # of directly to the dict as bytes is that all callbacks
        # follow save-to-disk logic. GBDT models are small
        # enough that IO should not be an issue.
        model.save_model(os.path.join(tmpdir, MODEL_KEY))
        checkpoint = Checkpoint.from_dict({PREPROCESSOR_KEY: preprocessor})
        checkpoint.to_directory(path=tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = LightGBMPredictor.from_checkpoint(checkpoint)

    assert get_num_trees(checkpoint_predictor.model) == get_num_trees(predictor.model)
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir): train_dataset = ray.data.from_pandas(train_df) valid_dataset = ray.data.from_pandas(test_df) class DummyPreprocessor(Preprocessor): def __init__(self): super().__init__() self.is_same = True def fit(self, dataset): self.fitted_ = True def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame": return df trainer = SklearnTrainer( estimator=RandomForestClassifier(), scaling_config=scale_config, label_column="target", datasets={ TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset }, preprocessor=DummyPreprocessor(), ) result = trainer.fit() # Move checkpoint to a different directory. checkpoint_dict = result.checkpoint.to_dict() checkpoint = Checkpoint.from_dict(checkpoint_dict) checkpoint_path = checkpoint.to_directory(tmpdir) resume_from = Checkpoint.from_directory(checkpoint_path) model, preprocessor = load_from_checkpoint(resume_from) assert hasattr(model, "feature_importances_") assert preprocessor.is_same assert preprocessor.fitted_
def _serialize_checkpoint(checkpoint_path) -> bytes:
    checkpoint = Checkpoint.from_directory(checkpoint_path)
    return checkpoint.to_bytes()