def test_tune(ray_start_4_cpus):
    """Run a two-sample Tune sweep over ``max_depth`` via the trainer's trainable.

    The trainer is created with ``max_depth`` pinned to 1; after the sweep
    (which samples ``max_depth`` with ``tune.randint(2, 4)``) the trainer's
    own ``params`` must still hold the pinned value.
    """
    datasets = {
        TRAIN_DATASET_KEY: ray.data.from_pandas(train_df),
        "valid": ray.data.from_pandas(test_df),
    }
    # Module-level params with max_depth overridden for this test only.
    pinned_params = {**params, "max_depth": 1}
    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=pinned_params,
        datasets=datasets,
    )

    search_space = {"params": {"max_depth": tune.randint(2, 4)}}
    tune.run(trainer.as_trainable(), config=search_space, num_samples=2)

    # The sweep must not leak sampled values back into the original trainer.
    assert trainer.params["max_depth"] == 1
def train_lightgbm(num_workers: int, use_gpu: bool = False) -> Result:
    """Fit a binary-classification LightGBM model and return the training result.

    Args:
        num_workers: Forwarded as ``scaling_config["num_workers"]``.
        use_gpu: Forwarded as ``scaling_config["use_gpu"]``.

    Returns:
        The ``Result`` produced by ``LightGBMTrainer.fit()``; its metrics are
        also printed to stdout.
    """
    # Third element returned by prepare_data() is not used here.
    train_dataset, valid_dataset, _ = prepare_data()

    # Standardize a couple of (arbitrarily chosen) feature columns.
    scaler = StandardScaler(columns=["mean radius", "mean texture"])

    # LightGBM-specific training parameters.
    lightgbm_params = {
        "objective": "binary",
        "metric": ["binary_logloss", "binary_error"],
    }
    scaling = {
        "num_workers": num_workers,
        "use_gpu": use_gpu,
    }

    trainer = LightGBMTrainer(
        scaling_config=scaling,
        label_column="target",
        params=lightgbm_params,
        datasets={"train": train_dataset, "valid": valid_dataset},
        preprocessor=scaler,
        num_boost_round=100,
    )
    outcome = trainer.fit()
    print(outcome.metrics)
    return outcome
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    """Check that a preprocessor passed to the trainer is restored from the checkpoint.

    The checkpoint is round-tripped through a dict and then a directory
    before loading, and the loaded preprocessor must carry both the attribute
    set in ``__init__`` and the one set in ``fit``.
    """

    class DummyPreprocessor(Preprocessor):
        """Identity preprocessor that records construction and fitting."""

        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            # Only mark the instance as fitted; the dataset is not inspected.
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    datasets = {
        TRAIN_DATASET_KEY: ray.data.from_pandas(train_df),
        "valid": ray.data.from_pandas(test_df),
    }
    fit_result = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets=datasets,
        preprocessor=DummyPreprocessor(),
    ).fit()

    # Relocate the checkpoint: dict round trip, then out to ``tmpdir``.
    relocated_path = Checkpoint.from_dict(
        fit_result.checkpoint.to_dict()
    ).to_directory(tmpdir)
    restored_checkpoint = Checkpoint.from_directory(relocated_path)

    model, restored_preprocessor = load_checkpoint(restored_checkpoint)
    assert get_num_trees(model) == 10
    assert restored_preprocessor.is_same
    assert restored_preprocessor.fitted_
def test_fit(ray_start_4_cpus):
    """Smoke test: fitting with a train and a validation dataset must not raise."""
    datasets = {
        TRAIN_DATASET_KEY: ray.data.from_pandas(train_df),
        "valid": ray.data.from_pandas(test_df),
    }
    LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets=datasets,
    ).fit()
def test_validation(ray_start_4_cpus):
    """Check that invalid constructor arguments raise ``KeyError``.

    Two cases are covered: a ``datasets`` mapping without the training key,
    and a ``dmatrix_params`` mapping keyed by a name that is not among the
    supplied datasets.
    """
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    def build_trainer(**overrides):
        # Fresh argument dicts on every call so the two cases share no state.
        return LightGBMTrainer(
            scaling_config={"num_workers": 2},
            label_column="target",
            params=params,
            **overrides,
        )

    # Missing training dataset.
    with pytest.raises(KeyError, match=TRAIN_DATASET_KEY):
        build_trainer(datasets={"valid": valid_dataset})

    # dmatrix_params refers to a dataset name that was not provided.
    with pytest.raises(KeyError, match="dmatrix_params"):
        build_trainer(
            dmatrix_params={"data": {}},
            datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        )
def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    """Check that resuming from a checkpoint continues boosting.

    A first run with ``num_boost_round=5`` must yield 5 trees. A second run
    with the same setting, resumed from the relocated checkpoint of the
    first, must yield 10 trees.
    """
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    def build_trainer(**extra):
        # Both runs use identical settings apart from the optional resume arg.
        return LightGBMTrainer(
            scaling_config=scale_config,
            label_column="target",
            params=params,
            num_boost_round=5,
            datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
            **extra,
        )

    first_result = build_trainer().fit()
    first_model, _ = load_checkpoint(first_result.checkpoint)
    assert get_num_trees(first_model) == 5

    # Relocate the checkpoint: dict round trip, then out to ``tmpdir``.
    relocated_path = Checkpoint.from_dict(
        first_result.checkpoint.to_dict()
    ).to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(relocated_path)

    second_result = build_trainer(resume_from_checkpoint=resume_from).fit()
    resumed_model, _ = load_checkpoint(second_result.checkpoint)
    assert get_num_trees(resumed_model) == 10
def test_fit_with_categoricals(ray_start_4_cpus):
    """Check that a pandas ``category`` column is recorded on the trained model.

    An alternating "A"/"B" categorical column is added to copies of the
    train and test frames; after fitting, the loaded model's
    ``pandas_categorical`` must equal ``[["A", "B"]]``.
    """

    def alternating_categories(frame):
        # "A", "B", "A", ... truncated to the frame's length, as a category Series.
        row_count = len(frame)
        labels = (["A", "B"] * math.ceil(row_count / 2))[:row_count]
        return pd.Series(labels).astype("category")

    # Work on copies so the module-level frames stay unmodified.
    train_df_with_cat = train_df.copy()
    test_df_with_cat = test_df.copy()
    for frame in (train_df_with_cat, test_df_with_cat):
        frame["categorical_column"] = alternating_categories(frame)

    datasets = {
        TRAIN_DATASET_KEY: ray.data.from_pandas(train_df_with_cat),
        "valid": ray.data.from_pandas(test_df_with_cat),
    }
    fit_result = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets=datasets,
    ).fit()

    model, _ = load_checkpoint(fit_result.checkpoint)
    assert model.pandas_categorical == [["A", "B"]]