    def test_train_feature_applicator_with_latency(self):
        input_data = pd.DataFrame(
            index=pd.to_datetime([
                "2020-02-01 10:00:00",
                "2020-02-01 10:30:00",
                "2020-02-01 11:00:00",
                "2020-02-01 11:30:00",
            ]),
            data={
                "load": [10, 15, 20, 15],
                "APX": [1, 2, 3, 4],
            },
        )
        horizons = [0.25, 47]

        input_data_with_features = TrainFeatureApplicator(
            horizons=horizons).add_features(input_data)

        horizon = input_data_with_features.horizon

        # Skip first row, since T-30min not available for first row
        self.assertTrue(input_data_with_features.loc[
            horizon == 47, ["APX", "T-30min"]].iloc[1:, ].isna().all().all())
        self.assertFalse(input_data_with_features.loc[
            horizon == 0.25, ["APX", "T-30min"]].iloc[1:, ].isna().any().any())
    def test_train_model_pipeline_core_happy_flow_nfold(self):
        """Test happy flow of the train model pipeline, using cross validation to forecast the entire input range"""

        (
            forecast,
            model,
            train_data,
            validation_data,
            test_data,
        ) = train_model_and_forecast_back_test(
            pj=self.pj,
            modelspecs=self.modelspecs,
            input_data=self.train_input,
            training_horizons=[0.25, 24.0],
            n_folds=4,
        )

        self.assertTrue("forecast" in forecast.columns)
        self.assertTrue("realised" in forecast.columns)
        self.assertTrue("horizon" in forecast.columns)
        self.assertEqual(sorted(list(forecast.horizon.unique())), [0.25, 24.0])

        # check if forecast is indeed of the entire range of the input data
        validated_data = validation.drop_target_na(
            validation.validate(self.pj["id"], self.train_input,
                                self.pj["flatliner_treshold"]))
        data_with_features = TrainFeatureApplicator(
            horizons=[0.25, 24.0],
            feature_names=self.modelspecs.feature_names).add_features(
                validated_data, pj=self.pj)
        self.assertEqual(len(forecast), len(data_with_features))
def train_pipeline_step_compute_features(
    pj: PredictionJobDataClass,
    model_specs: ModelSpecificationDataClass,
    input_data: pd.DataFrame,
    horizons: List[float],
) -> pd.DataFrame:
    """Compute features and perform consistency checks

    Args:
        pj (PredictionJobDataClass): Prediction job
        model_specs (ModelSpecificationDataClass): Dataclass containing model specifications
        input_data (pd.DataFrame): Input data
        horizons (List[float]): horizons to train on in hours.

    Returns:
        data_with_features (pd.DataFrame): The dataframe with the features needed to train the model.

    Raises:
        InputDataInsufficientError: when input data is insufficient.
        InputDataWrongColumnOrderError: when input data has an invalid column order.
        ValueError: when the horizon is a string and the corresponding column is not in the input data.

    """
    if pj["model"] == "proloaf":
        # proloaf is only able to train with one horizon
        horizons = [horizons[0]]

    if input_data.empty:
        raise InputDataInsufficientError("Input dataframe is empty")
    elif "load" not in input_data.columns:
        raise InputDataWrongColumnOrderError(
            "Missing the load column in the input dataframe")

    if isinstance(horizons, str):
        if not (horizons in set(input_data.columns)):
            raise ValueError(
                f"The horizon parameter specifies a column name ({horizons}) missing in"
                " the input data.")
        else:
            # Sort data to avoid the same date being repeated multiple times
            input_data = input_data.sort_values(horizons)
    # Validate and clean data
    validated_data = validation.drop_target_na(
        validation.validate(pj["id"], input_data, pj["flatliner_treshold"]))
    # Check if sufficient data is left after cleaning
    if not validation.is_data_sufficient(validated_data,
                                         pj["completeness_treshold"],
                                         pj["minimal_table_length"]):
        raise InputDataInsufficientError(
            "Input data is insufficient, after validation and cleaning")

    data_with_features = TrainFeatureApplicator(
        horizons=horizons,
        feature_names=model_specs.feature_names,
        feature_modules=model_specs.feature_modules,
    ).add_features(validated_data, pj=pj)

    return data_with_features
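
# Usage sketch: a minimal, illustrative call to train_pipeline_step_compute_features.
# The pj keys mirror the ones read above ("id", "model", "flatliner_treshold",
# "completeness_treshold", "minimal_table_length"); the threshold values, the synthetic
# load data and the SimpleNamespace stand-in for ModelSpecificationDataClass are
# assumptions for illustration, not openstef defaults.
from types import SimpleNamespace

import numpy as np
import pandas as pd

example_pj = {
    "id": 307,
    "model": "xgb",
    "flatliner_treshold": 24,
    "completeness_treshold": 0.0,
    "minimal_table_length": 4,
}
example_model_specs = SimpleNamespace(feature_names=None, feature_modules=[])
example_input = pd.DataFrame(
    index=pd.date_range("2020-02-01 10:00:00", periods=96, freq="15T"),
    data={
        "load": np.sin(np.linspace(0, 8, 96)),
        "APX": np.linspace(1, 2, 96),
    },
)

example_features = train_pipeline_step_compute_features(
    pj=example_pj,
    model_specs=example_model_specs,
    input_data=example_input,
    horizons=[0.25, 24.0],
)
assert "horizon" in example_features.columns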

    def test_train_feature_applicator_correct_order_historic_load(self):
        # Test for expected column order of the output and test for expected historic_load column
        pj = {"model": "proloaf"}
        data_with_features = TrainFeatureApplicator(
            horizons=[0.25, 24.0]).add_features(self.input_data[["load"]],
                                                pj=pj)
        self.assertTrue(
            "historic_load" in data_with_features.columns.to_list())
        self.assertEqual(data_with_features.columns.to_list()[0], "load")
        self.assertEqual(data_with_features.columns.to_list()[-1], "horizon")

    def test_train_feature_applicator_custom_horizon(self):
        input_data = self.input_data.copy(deep=True)
        input_data["custom_horizon"] = 0
        data_with_features = TrainFeatureApplicator(
            horizons="custom_horizon").add_features(input_data)
        self.assertEqual(data_with_features.columns.to_list()[0], "load")
        self.assertEqual(data_with_features.columns.to_list()[-1], "horizon")
        self.assertTrue(
            (data_with_features["horizon"] == input_data["custom_horizon"]
             ).all())
    def test_train_feature_applicator(self):

        input_data_with_features = TrainFeatureApplicator(
            horizons=[0.25]).add_features(TestData.load("input_data.pickle"))

        self.assertDataframeEqual(
            input_data_with_features,
            TestData.load("input_data_multi_horizon_features.csv"),
            check_like=True,  # ignore the order of index & columns
        )
    def test_train_feature_applicator_filter_features(self):
        # Test for expected column order of the output
        # Also check "horizons" is not in the output
        features = self.input_data.columns.to_list()[:15]
        data_with_features = TrainFeatureApplicator(
            horizons=[0.25, 24.0],
            feature_names=features).add_features(self.input_data)

        self.assertIn("horizon", data_with_features.columns.to_list())
        self.assertListEqual(
            list(np.sort(features + ["horizon"])),
            list(np.sort(data_with_features.columns.to_list())),
        )
    def test_call(self):
        input_data = TestData.load("reference_sets/307-train-data.csv")
        pj = {"model": "proloaf"}
        input_data_with_features = TrainFeatureApplicator(
            horizons=[24.0]).add_features(input_data, pj=pj)

        model_type = "proloaf"
        model = ModelCreator.create_model(model_type)

        objective = ProLoafRegressorObjective(
            model,
            input_data_with_features,
        )
        study = optuna.create_study(
            study_name=model_type,
            pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
            direction="minimize",
        )
        study.optimize(objective, n_trials=1)

        self.assertIsInstance(objective, ProLoafRegressorObjective)
        self.assertEqual(len(study.trials), 1)
def optimize_hyperparameters_pipeline_core(
    pj: PredictionJobDataClass,
    input_data: pd.DataFrame,
    horizons: List[float] = DEFAULT_TRAIN_HORIZONS,
    n_trials: int = N_TRIALS,
) -> Tuple[OpenstfRegressor, ModelSpecificationDataClass, Report, dict, int,
           dict[str, Any]]:
    """Optimize hyperparameters pipeline core.

    Expected prediction job keys: "name", "model"

    Args:
        pj (PredictionJobDataClass): Prediction job
        input_data (pd.DataFrame): Raw training input data
        horizons (List[float]): horizons for feature engineering.
        n_trials (int, optional): The number of trials. Defaults to N_TRIALS.

    Raises:
        InputDataInsufficientError: when input data is insufficient.
        InputDataWrongColumnOrderError: when input data has an invalid column order.

    Returns:
        OpenstfRegressor: Best model,
        ModelSpecificationDataClass: Model specifications of the best model,
        Report: Report of the best training round,
        dict: Trials,
        int: Best trial number,
        dict: Optimized hyperparameters.
    """
    if input_data.empty:
        raise InputDataInsufficientError("Input dataframe is empty")
    elif "load" not in input_data.columns:
        raise InputDataWrongColumnOrderError(
            "Missing the load column in the input dataframe")

    # Validate and clean data
    validated_data = validation.drop_target_na(
        validation.validate(pj["id"], input_data, pj["flatliner_treshold"]))

    # Check if sufficient data is left after cleaning
    if not validation.is_data_sufficient(validated_data,
                                         pj["completeness_treshold"],
                                         pj["minimal_table_length"]):
        raise InputDataInsufficientError(
            f"Input data is insufficient for {pj['name']} after validation and cleaning"
        )

    if pj.default_modelspecs:
        feature_names = pj.default_modelspecs.feature_names
        feature_modules = pj.default_modelspecs.feature_modules
    else:
        feature_names = None
        feature_modules = []

    validated_data_with_features = TrainFeatureApplicator(
        horizons=horizons,
        feature_names=feature_names,
        feature_modules=feature_modules).add_features(validated_data, pj=pj)

    # Add the additional proloaf feature historic_load (a copy of the load, the first column) to the input data
    if pj["model"] == "proloaf" and "historic_load" not in list(
            validated_data_with_features.columns):
        validated_data_with_features[
            "historic_load"] = validated_data_with_features.iloc[:, 0]
        # Make sure horizon is the last column
        temp_cols = validated_data_with_features.columns.tolist()
        new_cols = temp_cols[:-2] + [temp_cols[-1]] + [temp_cols[-2]]
        validated_data_with_features = validated_data_with_features[new_cols]

    # Create objective (NOTE: this is a callable class)
    objective = ObjectiveCreator.create_objective(model_type=pj["model"])

    study, objective = optuna_optimization(pj, objective,
                                           validated_data_with_features,
                                           n_trials)

    best_hyperparams = study.best_params
    best_model = study.user_attrs["best_model"]

    logger.info(
        f"Finished hyperparameter optimization, error objective {study.best_value} "
        f"and params {best_hyperparams}")

    # Add quantiles to hyperparams so they are stored with the model info
    if pj["quantiles"]:
        best_hyperparams.update(quantiles=pj["quantiles"])

    # model specification
    model_specs = ModelSpecificationDataClass(
        id=pj["id"],
        feature_names=list(validated_data_with_features.columns),
        hyper_params=best_hyperparams,
    )

    # If the model type is quantile, train a model with the best parameters for all quantiles
    # (optimization is only done for quantile 0.5)
    if objective.model.can_predict_quantiles:
        best_model, report, modelspecs, _ = train_model_pipeline_core(
            pj=pj, input_data=input_data, model_specs=model_specs)

    # Save model and report. Report is always saved to MLFlow and optionally to disk
    report = objective.create_report(model=best_model)

    trials = objective.get_trial_track()
    best_trial_number = study.best_trial.number

    return best_model, model_specs, report, trials, best_trial_number, study.best_params
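
# Usage sketch: a hedged illustration of how the six return values of
# optimize_hyperparameters_pipeline_core would be unpacked. It assumes a fully
# populated PredictionJobDataClass (the function reads pj["id"], pj["name"],
# pj["model"], pj["quantiles"], pj.default_modelspecs and the validation
# thresholds used above); `make_prediction_job` is a hypothetical fixture
# helper, not an openstef API, so the example is kept as comments.
#
#   pj = make_prediction_job(id=307, model="xgb")
#   raw_data = TestData.load("reference_sets/307-train-data.csv")
#   (
#       best_model,
#       model_specs,
#       report,
#       trials,
#       best_trial_number,
#       best_params,
#   ) = optimize_hyperparameters_pipeline_core(
#       pj=pj,
#       input_data=raw_data,
#       horizons=[0.25, 24.0],
#       n_trials=2,
#   )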

    def test_train_feature_applicator_correct_order(self):
        # Test for expected column order of the output
        data_with_features = TrainFeatureApplicator(
            horizons=[0.25, 24.0]).add_features(self.input_data[["load"]])
        self.assertEqual(data_with_features.columns.to_list()[0], "load")
        self.assertEqual(data_with_features.columns.to_list()[-1], "horizon")

import optuna

from openstef.feature_engineering.feature_applicator import TrainFeatureApplicator
from openstef.model.model_creator import ModelCreator
from openstef.model.objective import (
    LGBRegressorObjective,
    LinearRegressorObjective,
    ProLoafRegressorObjective,
    RegressorObjective,
    XGBQuantileRegressorObjective,
    XGBRegressorObjective,
)

# TestData and BaseTestCase are the test utilities used below; the import path is
# assumed to follow the openstef test suite layout.
from test.unit.utils.base import BaseTestCase
from test.unit.utils.data import TestData

input_data = TestData.load("reference_sets/307-train-data.csv")
input_data_with_features = TrainFeatureApplicator(
    horizons=[0.25, 24.0]).add_features(input_data)
# Select every 50th data point to speed up the test
input_data_with_features = input_data_with_features.iloc[::50, :]

N_TRIALS = 2


class TestRegressorObjective(BaseTestCase):
    def test_call(self):
        model_type = "xgb"
        model = ModelCreator.create_model(model_type)

        objective = RegressorObjective(
            model,
            input_data_with_features,
        )
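
        # Sketch: drive the callable objective with an optuna study, mirroring the
        # ProLoaf test above; the pruner settings and the assertions here are
        # illustrative assumptions rather than part of the original snippet.
        study = optuna.create_study(
            study_name=model_type,
            pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
            direction="minimize",
        )
        study.optimize(objective, n_trials=N_TRIALS)

        self.assertIsInstance(objective, RegressorObjective)
        self.assertEqual(len(study.trials), N_TRIALS)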

    def test_train_model_pipeline_core_custom_split(self):
        pj = self.pj
        # test wrong custom backtest split
        pj.backtest_split_func = SplitFuncDataClass(function="unknow_backtest",
                                                    arguments={})
        with self.assertRaises(ValueError):
            (
                forecast,
                model,
                train_data,
                validation_data,
                test_data,
            ) = train_model_and_forecast_back_test(
                pj=self.pj,
                modelspecs=self.modelspecs,
                input_data=self.train_input,
                training_horizons=[0.25, 24.0],
                n_folds=4,
            )

        pj.backtest_split_func = SplitFuncDataClass(
            function=lambda data: timeseries_split(data, 0, 24), arguments={})
        with self.assertRaises(ValueError):
            (
                forecast,
                model,
                train_data,
                validation_data,
                test_data,
            ) = train_model_and_forecast_back_test(
                pj=self.pj,
                modelspecs=self.modelspecs,
                input_data=self.train_input,
                training_horizons=[0.25, 24.0],
                n_folds=4,
            )

        # test custom backtest split
        pj.backtest_split_func = SplitFuncDataClass(function=timeseries_split,
                                                    arguments={"gap": 24})
        (
            forecast,
            model,
            train_data,
            validation_data,
            test_data,
        ) = train_model_and_forecast_back_test(
            pj=self.pj,
            modelspecs=self.modelspecs,
            input_data=self.train_input,
            training_horizons=[0.25, 24.0],
            n_folds=4,
        )

        self.assertTrue("forecast" in forecast.columns)
        self.assertTrue("realised" in forecast.columns)
        self.assertTrue("horizon" in forecast.columns)
        self.assertEqual(sorted(list(forecast.horizon.unique())), [0.25, 24.0])

        # check if forecast is indeed of the entire range of the input data
        test_fraction = 0.15
        nb_test = int(np.round(test_fraction * len(self.train_input)))
        validated_data = validation.drop_target_na(
            validation.validate(
                self.pj["id"],
                self.train_input[-nb_test:],
                self.pj["flatliner_treshold"],
            ))
        data_with_features = TrainFeatureApplicator(
            horizons=[0.25, 24.0],
            feature_names=self.modelspecs.feature_names).add_features(
                validated_data, pj=self.pj)
        self.assertEqual(len(forecast), 4 * len(data_with_features))
    def test_train_model_pipeline_core_happy_flow(self):
        """Test happy flow of the train model pipeline

        NOTE this does not explain WHY this is the case?
        The input data should not contain features (e.g. T-7d),
        but it can/should include predictors (e.g. weather data)

        """
        # Select every 50th data point to speed up the test
        train_input = self.train_input.iloc[::50, :]
        for model_type in list(MLModelType) + [__name__ + ".DummyRegressor"]:
            with self.subTest(model_type=model_type):
                pj = self.pj

                pj["model"] = (
                    model_type.value if hasattr(model_type, "value") else model_type
                )
                model_specs = self.model_specs
                train_input = self.train_input

                # Use default parameters
                model_specs.hyper_params = {}
                model_specs.hyper_params["max_epochs"] = 1

                # For Linear model we need to choose an imputation strategy to handle missing value
                if model_type == MLModelType.LINEAR:
                    model_specs.hyper_params["imputation_strategy"] = "mean"

                model, report, modelspecs, _ = train_model_pipeline_core(
                    pj=pj, model_specs=model_specs, input_data=train_input
                )

                # check if the model was fitted (raises NotFittedError when not fitted)
                self.assertIsNone(sklearn.utils.validation.check_is_fitted(model))

                # check if the model has a feature_names property
                self.assertIsNotNone(model.feature_names)

                # check if model is sklearn compatible
                self.assertTrue(isinstance(model, sklearn.base.BaseEstimator))

                # check if report is a Report
                self.assertTrue(isinstance(report, Report))

                # Validate and clean data
                validated_data = validation.drop_target_na(
                    validation.validate(pj["id"], train_input, flatliner_threshold=24)
                )

                # Add features
                data_with_features = TrainFeatureApplicator(
                    horizons=[0.25, 47.0], feature_names=model_specs.feature_names
                ).add_features(validated_data, pj=pj)

                # Split data
                (
                    train_data,
                    validation_data,
                    test_data,
                ) = split_data_train_validation_test(data_with_features)

                # Feature importance cannot be generated for proloaf, as it is a neural network
                if not pj["model"] == "proloaf":
                    importance = model.set_feature_importance()
                    self.assertIsInstance(importance, pd.DataFrame)