Example #1
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    model, preprocessor = load_checkpoint(resume_from)
    assert get_num_trees(model) == 10
    assert preprocessor.is_same
    assert preprocessor.fitted_
Example #2
def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(model) == 5

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(model) == 10
Example #3
    def testLocalCheckpointSerde(self):
        # Local checkpoints are converted to bytes on serialization. Currently
        # this is a pickled dict, so we compare with a dict checkpoint.
        source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
        with source_checkpoint.as_directory() as tmpdir:
            checkpoint = Checkpoint.from_directory(tmpdir)
            self._testCheckpointSerde(
                checkpoint, *source_checkpoint.get_internal_representation())
Example #4
    def _trial_to_result(self, trial: Trial) -> Result:
        result = Result(
            checkpoint=Checkpoint.from_directory(trial.checkpoint.value)
            if trial.checkpoint.value else None,
            metrics=trial.last_result,
            error=self._populate_exception(trial),
        )
        return result
Example #5
    def _prepare_fs_checkpoint(self) -> Checkpoint:
        # Create checkpoint from fs
        checkpoint = Checkpoint.from_directory(self.checkpoint_dir)

        self.assertIsInstance(checkpoint, Checkpoint)
        self.assertIsInstance(checkpoint._local_path, str)
        self.assertEqual(checkpoint._local_path, self.checkpoint_dir)

        return checkpoint
Example #6
    def testLocalCheckpointSerde(self):
        # Local checkpoints are converted to bytes on serialization. Currently
        # this is a pickled dict, so we compare with a dict checkpoint.
        source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
        tmpdir = source_checkpoint.to_directory()
        self.addCleanup(shutil.rmtree, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        self._testCheckpointSerde(
            checkpoint, *source_checkpoint.get_internal_representation())
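
The comment in the test above notes that local checkpoints serialize to bytes as a pickled dict. A minimal standalone sketch of that round trip, using only the to_bytes/from_bytes calls that appear in these examples and assuming the ray.air.checkpoint import path:

from ray.air.checkpoint import Checkpoint  # assumption: `ray.ml.Checkpoint` in older Ray releases

source = Checkpoint.from_dict({"checkpoint_data": 5})
blob = source.to_bytes()                # serialize the checkpoint to a byte string
restored = Checkpoint.from_bytes(blob)  # rebuild a checkpoint from those bytes
assert restored.to_dict()["checkpoint_data"] == 5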
Example #7
    def get_best_checkpoint(
            self,
            trial: Trial,
            metric: Optional[str] = None,
            mode: Optional[str] = None) -> Optional[Checkpoint]:
        """Gets best persistent checkpoint path of provided trial.

        Any checkpoints with an associated metric value of ``nan`` will be filtered out.

        Args:
            trial: The log directory of a trial, or a trial instance.
            metric: key of trial info to return, e.g. "mean_accuracy".
                "training_iteration" is used by default if no value was
                passed to ``self.default_metric``.
            mode: One of [min, max]. Defaults to ``self.default_mode``.

        Returns:
            :class:`Checkpoint <ray.ml.Checkpoint>` object.
        """
        metric = metric or self.default_metric or TRAINING_ITERATION
        mode = self._validate_mode(mode)

        checkpoint_paths = self.get_trial_checkpoints_paths(trial, metric)

        # Filter out nan. Sorting nan values leads to undefined behavior.
        checkpoint_paths = [(path, metric) for path, metric in checkpoint_paths
                            if not is_nan(metric)]

        if not checkpoint_paths:
            logger.error(f"No checkpoints have been found for trial {trial}.")
            return None

        a = -1 if mode == "max" else 1
        best_path_metrics = sorted(checkpoint_paths, key=lambda x: a * x[1])

        best_path, best_metric = best_path_metrics[0]
        cloud_path = self._parse_cloud_path(best_path)

        if self._legacy_checkpoint:
            return TrialCheckpoint(local_path=best_path, cloud_path=cloud_path)

        if cloud_path:
            # Prefer cloud path over local path for downstream processing
            return Checkpoint.from_uri(cloud_path)
        elif os.path.exists(best_path):
            return Checkpoint.from_directory(best_path)
        else:
            logger.error(
                f"No checkpoint locations for {trial} available on "
                f"this node. To avoid this, you "
                f"should enable checkpoint synchronization with the"
                f"`sync_config` argument in Ray Tune. "
                f"The checkpoint may be available on a different node - "
                f"please check this location on worker nodes: {best_path}")
            return None
Example #8
        def train_func(config, checkpoint_dir=None):
            # config already contains merged values.
            # Instantiate new Trainer in Trainable.
            trainer = trainer_cls(**config)

            if checkpoint_dir:
                trainer.resume_from_checkpoint = Checkpoint.from_directory(
                    checkpoint_dir)

            trainer.setup()
            trainer.preprocess_datasets()
            trainer.training_loop()
Example #9
def test_predict_no_preprocessor():
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)

        checkpoint = Checkpoint.from_directory(tmpdir)
        predictor = SklearnPredictor.from_checkpoint(checkpoint)

    data_batch = np.array([[1, 2], [3, 4], [5, 6]])
    predictions = predictor.predict(data_batch)

    assert len(predictions) == 3
Example #10
    def _trial_to_result(self, trial: Trial) -> Result:
        if trial.checkpoint.value:
            checkpoint_dir = TrainableUtil.find_checkpoint_dir(trial.checkpoint.value)
            checkpoint = Checkpoint.from_directory(checkpoint_dir)
        else:
            checkpoint = None

        result = Result(
            checkpoint=checkpoint,
            metrics=trial.last_result.copy(),
            error=self._populate_exception(trial),
        )
        return result
Example #11
    def test_dict_checkpoint_fs(self):
        """Test conversion from dict to FS checkpoint and back."""
        checkpoint = self._prepare_dict_checkpoint()

        # Convert into fs checkpoint
        path = checkpoint.to_directory()
        self.assertIsInstance(path, str)

        # Create from path
        checkpoint = Checkpoint.from_directory(path)
        self.assertTrue(checkpoint._local_path)

        self._assert_dict_checkpoint(checkpoint)
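
The _prepare_dict_checkpoint and _assert_dict_checkpoint helpers are not part of this excerpt. A minimal self-contained sketch of the same dict-to-directory round trip, assuming only the Checkpoint API shown above (the "metric" key is illustrative):

from ray.air.checkpoint import Checkpoint  # assumption: `ray.ml.Checkpoint` in older Ray releases

source = Checkpoint.from_dict({"metric": 5})  # dict-backed checkpoint
path = source.to_directory()                  # materialize it on disk
restored = Checkpoint.from_directory(path)    # wrap the directory again
assert restored._local_path == path           # now backed by the filesystem
assert restored.to_dict()["metric"] == 5      # original data is recoverable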
Example #12
def test_batch_prediction_with_set_cpus(ray_start_4_cpus):
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)

        checkpoint = Checkpoint.from_directory(tmpdir)

        batch_predictor = BatchPredictor.from_checkpoint(
            checkpoint, SklearnPredictor)

        test_dataset = ray.data.from_pandas(
            pd.DataFrame(dummy_data, columns=["A", "B"]))
        batch_predictor.predict(test_dataset,
                                num_cpus_per_worker=2,
                                num_estimator_cpus=2)
Example #13
def create_checkpoint(preprocessor: Optional[Preprocessor] = None,
                      config: Optional[dict] = None) -> Checkpoint:
    rl_trainer = RLTrainer(
        algorithm=_DummyTrainer,
        config=config or {},
        preprocessor=preprocessor,
    )
    rl_trainable_cls = rl_trainer.as_trainable()
    rl_trainable = rl_trainable_cls()

    with tempfile.TemporaryDirectory() as checkpoint_dir:
        checkpoint_file = rl_trainable.save(checkpoint_dir)
        checkpoint_path = TrainableUtil.find_checkpoint_dir(checkpoint_file)
        checkpoint_data = Checkpoint.from_directory(checkpoint_path).to_dict()

    return Checkpoint.from_dict(checkpoint_data)
Example #14
def test_predict_no_preprocessor():
    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the
        # Trainers. The reason for saving model to disk instead
        # of directly to the dict as bytes is due to all callbacks
        # following save to disk logic. GBDT models are small
        # enough that IO should not be an issue.
        model.save_model(os.path.join(tmpdir, MODEL_KEY))

        checkpoint = Checkpoint.from_directory(tmpdir)
        predictor = LightGBMPredictor.from_checkpoint(checkpoint)

    data_batch = np.array([[1, 2], [3, 4], [5, 6]])
    predictions = predictor.predict(data_batch)

    assert len(predictions) == 3
Example #15
    def _maybe_save_to_cloud(self, checkpoint_dir: str):
        # Derived classes like the FunctionRunner might call this
        if self.uses_cloud_checkpointing:
            if self.storage_client:
                # Keep for backwards compatibility, remove after deprecation
                self.storage_client.sync_up(checkpoint_dir,
                                            self._storage_path(checkpoint_dir))
                self.storage_client.wait_or_retry()
                return

            checkpoint = Checkpoint.from_directory(checkpoint_dir)
            retry_fn(
                lambda: checkpoint.to_uri(self._storage_path(checkpoint_dir)),
                subprocess.CalledProcessError,
                num_retries=3,
                sleep_time=1,
            )
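
retry_fn above comes from Ray Tune's utilities and is not shown in this excerpt. Purely as an illustration of the call shape used here (not Ray's actual implementation), a helper with the same signature could look like this:

import time

def retry_fn(fn, exception_type, num_retries=3, sleep_time=1):
    # Illustrative sketch only: call `fn`, retrying on `exception_type` with a
    # pause between attempts. The real Ray utility may handle the final
    # failure differently (e.g. log it instead of raising).
    for attempt in range(num_retries):
        try:
            fn()
            return
        except exception_type:
            if attempt == num_retries - 1:
                raise
            time.sleep(sleep_time)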
Example #16
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = SklearnPredictor(estimator=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = SklearnPredictor.from_checkpoint(checkpoint)

    assert np.allclose(
        checkpoint_predictor.estimator.feature_importances_,
        predictor.estimator.feature_importances_,
    )
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
Example #17
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = LightGBMPredictor(model=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the
        # Trainers. The reason for saving model to disk instead
        # of directly to the dict as bytes is due to all callbacks
        # following save to disk logic. GBDT models are small
        # enough that IO should not be an issue.
        model.save_model(os.path.join(tmpdir, MODEL_KEY))
        checkpoint = Checkpoint.from_dict({PREPROCESSOR_KEY: preprocessor})
        checkpoint.to_directory(path=tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = LightGBMPredictor.from_checkpoint(checkpoint)

    assert get_num_trees(checkpoint_predictor.model) == get_num_trees(predictor.model)
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
Example #18
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = SklearnTrainer(
        estimator=RandomForestClassifier(),
        scaling_config=scale_config,
        label_column="target",
        datasets={
            TRAIN_DATASET_KEY: train_dataset,
            "valid": valid_dataset
        },
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    model, preprocessor = load_from_checkpoint(resume_from)
    assert hasattr(model, "feature_importances_")
    assert preprocessor.is_same
    assert preprocessor.fitted_
Example #19
def _serialize_checkpoint(checkpoint_path) -> bytes:
    checkpoint = Checkpoint.from_directory(checkpoint_path)
    return checkpoint.to_bytes()
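
A hypothetical counterpart to the helper above, restoring the serialized bytes back into a directory. The function name, target path argument, and import path are assumptions; from_bytes/to_directory are the same Checkpoint calls used elsewhere in these examples:

from ray.air.checkpoint import Checkpoint  # assumption: `ray.ml.Checkpoint` in older Ray releases


def _deserialize_checkpoint(data: bytes, target_path: str) -> str:
    # Rebuild the checkpoint from its byte form and materialize it on disk.
    checkpoint = Checkpoint.from_bytes(data)
    return checkpoint.to_directory(target_path)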