def test_train_feature_applicator_with_latency(self):
    input_data = pd.DataFrame(
        index=pd.to_datetime(
            [
                "2020-02-01 10:00:00",
                "2020-02-01 10:30:00",
                "2020-02-01 11:00:00",
                "2020-02-01 11:30:00",
            ]
        ),
        data={
            "load": [10, 15, 20, 15],
            "APX": [1, 2, 3, 4],
        },
    )
    horizons = [0.25, 47]

    input_data_with_features = TrainFeatureApplicator(
        horizons=horizons
    ).add_features(input_data)

    horizon = input_data_with_features.horizon

    # Skip the first row, since T-30min is not available for the first row
    self.assertTrue(
        input_data_with_features.loc[horizon == 47, ["APX", "T-30min"]]
        .iloc[1:]
        .isna()
        .all()
        .all()
    )
    self.assertFalse(
        input_data_with_features.loc[horizon == 0.25, ["APX", "T-30min"]]
        .iloc[1:]
        .isna()
        .any()
        .any()
    )
def test_train_model_pipeline_core_happy_flow_nfold(self):
    """Test happy flow of the train model pipeline, using cross validation to
    forecast the entire input range."""
    (
        forecast,
        model,
        train_data,
        validation_data,
        test_data,
    ) = train_model_and_forecast_back_test(
        pj=self.pj,
        modelspecs=self.modelspecs,
        input_data=self.train_input,
        training_horizons=[0.25, 24.0],
        n_folds=4,
    )
    self.assertTrue("forecast" in forecast.columns)
    self.assertTrue("realised" in forecast.columns)
    self.assertTrue("horizon" in forecast.columns)
    self.assertEqual(sorted(list(forecast.horizon.unique())), [0.25, 24.0])

    # Check that the forecast indeed covers the entire range of the input data
    validated_data = validation.drop_target_na(
        validation.validate(
            self.pj["id"], self.train_input, self.pj["flatliner_treshold"]
        )
    )
    data_with_features = TrainFeatureApplicator(
        horizons=[0.25, 24.0], feature_names=self.modelspecs.feature_names
    ).add_features(validated_data, pj=self.pj)
    self.assertEqual(len(forecast), len(data_with_features))
def train_pipeline_step_compute_features(
    pj: PredictionJobDataClass,
    model_specs: ModelSpecificationDataClass,
    input_data: pd.DataFrame,
    horizons: List[float],
) -> pd.DataFrame:
    """Compute features and perform consistency checks.

    Args:
        pj (PredictionJobDataClass): Prediction job
        model_specs (ModelSpecificationDataClass): Dataclass containing model specifications
        input_data (pd.DataFrame): Input data
        horizons (List[float]): Horizons to train on, in hours.

    Returns:
        data_with_features (pd.DataFrame): The dataframe with the features needed to train the model

    Raises:
        InputDataInsufficientError: When the input data is insufficient.
        InputDataWrongColumnOrderError: When the input data has an invalid column order.
        ValueError: When the horizon is a string and the corresponding column is not in the input data.

    """
    if pj["model"] == "proloaf":
        # proloaf is only able to train with one horizon
        horizons = [horizons[0]]

    if input_data.empty:
        raise InputDataInsufficientError("Input dataframe is empty")
    elif "load" not in input_data.columns:
        raise InputDataWrongColumnOrderError(
            "Missing the load column in the input dataframe"
        )

    if isinstance(horizons, str):
        if horizons not in set(input_data.columns):
            raise ValueError(
                f"The horizon parameter specifies a column name ({horizons}) missing in"
                " the input data."
            )
        else:
            # Sort the data to avoid the same date being repeated multiple times
            input_data = input_data.sort_values(horizons)

    # Validate and clean data
    validated_data = validation.drop_target_na(
        validation.validate(pj["id"], input_data, pj["flatliner_treshold"])
    )

    # Check if sufficient data is left after cleaning
    if not validation.is_data_sufficient(
        validated_data, pj["completeness_treshold"], pj["minimal_table_length"]
    ):
        raise InputDataInsufficientError(
            "Input data is insufficient, after validation and cleaning"
        )

    data_with_features = TrainFeatureApplicator(
        horizons=horizons,
        feature_names=model_specs.feature_names,
        feature_modules=model_specs.feature_modules,
    ).add_features(validated_data, pj=pj)

    return data_with_features
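# Illustrative usage sketch (not part of the pipeline module; the names below are
# hypothetical): ``pj`` and ``model_specs`` would normally be retrieved from the
# database layer, and ``raw_input_data`` is assumed to contain a "load" column.
#
#   data_with_features = train_pipeline_step_compute_features(
#       pj=pj,
#       model_specs=model_specs,
#       input_data=raw_input_data,
#       horizons=[0.25, 24.0],  # horizons in hours
#   )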
def test_train_feature_applicator_correct_order_historic_load(self):
    # Test for the expected column order of the output and for the expected
    # historic_load column
    pj = {"model": "proloaf"}
    data_with_features = TrainFeatureApplicator(horizons=[0.25, 24.0]).add_features(
        self.input_data[["load"]], pj=pj
    )
    self.assertTrue("historic_load" in data_with_features.columns.to_list())
    self.assertEqual(data_with_features.columns.to_list()[0], "load")
    self.assertEqual(data_with_features.columns.to_list()[-1], "horizon")
def test_train_feature_applicator_custom_horizon(self):
    input_data = self.input_data.copy(deep=True)
    input_data["custom_horizon"] = 0
    data_with_features = TrainFeatureApplicator(
        horizons="custom_horizon"
    ).add_features(input_data)
    self.assertEqual(data_with_features.columns.to_list()[0], "load")
    self.assertEqual(data_with_features.columns.to_list()[-1], "horizon")
    self.assertTrue(
        (data_with_features["horizon"] == input_data["custom_horizon"]).all()
    )
def test_train_feature_applicator(self):
    input_data_with_features = TrainFeatureApplicator(horizons=[0.25]).add_features(
        TestData.load("input_data.pickle")
    )
    self.assertDataframeEqual(
        input_data_with_features,
        TestData.load("input_data_multi_horizon_features.csv"),
        check_like=True,  # ignore the order of index & columns
    )
def test_train_feature_applicator_filter_features(self):
    # Test for the expected column order of the output.
    # Also check that "horizons" is not in the output.
    features = self.input_data.columns.to_list()[:15]
    data_with_features = TrainFeatureApplicator(
        horizons=[0.25, 24.0], feature_names=features
    ).add_features(self.input_data)

    self.assertIn("horizon", data_with_features.columns.to_list())
    self.assertListEqual(
        list(np.sort(features + ["horizon"])),
        list(np.sort(data_with_features.columns.to_list())),
    )
def test_call(self):
    input_data = TestData.load("reference_sets/307-train-data.csv")
    pj = {"model": "proloaf"}
    input_data_with_features = TrainFeatureApplicator(horizons=[24.0]).add_features(
        input_data, pj=pj
    )
    model_type = "proloaf"
    model = ModelCreator.create_model(model_type)

    objective = ProLoafRegressorObjective(
        model,
        input_data_with_features,
    )
    study = optuna.create_study(
        study_name=model_type,
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        direction="minimize",
    )
    study.optimize(objective, n_trials=1)

    self.assertIsInstance(objective, ProLoafRegressorObjective)
    self.assertEqual(len(study.trials), 1)
def optimize_hyperparameters_pipeline_core(
    pj: PredictionJobDataClass,
    input_data: pd.DataFrame,
    horizons: List[float] = DEFAULT_TRAIN_HORIZONS,
    n_trials: int = N_TRIALS,
) -> Tuple[
    OpenstfRegressor, ModelSpecificationDataClass, Report, dict, int, dict[str, Any]
]:
    """Optimize hyperparameters pipeline core.

    Expected prediction job keys: "name", "model"

    Args:
        pj (PredictionJobDataClass): Prediction job
        input_data (pd.DataFrame): Raw training input data
        horizons (List[float]): Horizons for feature engineering.
        n_trials (int, optional): The number of trials. Defaults to N_TRIALS.

    Raises:
        InputDataInsufficientError: If the input data is insufficient.
        InputDataWrongColumnOrderError: If the load column is missing in the input data.

    Returns:
        OpenstfRegressor: Best model,
        ModelSpecificationDataClass: Model specifications of the best model,
        Report: Report of the best training round,
        dict: Trials,
        int: Best trial number,
        dict: Optimized hyperparameters.

    """
    if input_data.empty:
        raise InputDataInsufficientError("Input dataframe is empty")
    elif "load" not in input_data.columns:
        raise InputDataWrongColumnOrderError(
            "Missing the load column in the input dataframe"
        )

    # Validate and clean data
    validated_data = validation.drop_target_na(
        validation.validate(pj["id"], input_data, pj["flatliner_treshold"])
    )

    # Check if sufficient data is left after cleaning
    if not validation.is_data_sufficient(
        validated_data, pj["completeness_treshold"], pj["minimal_table_length"]
    ):
        raise InputDataInsufficientError(
            f"Input data is insufficient for {pj['name']} after validation and cleaning"
        )

    if pj.default_modelspecs:
        feature_names = pj.default_modelspecs.feature_names
        feature_modules = pj.default_modelspecs.feature_modules
    else:
        feature_names = None
        feature_modules = []

    validated_data_with_features = TrainFeatureApplicator(
        horizons=horizons,
        feature_names=feature_names,
        feature_modules=feature_modules,
    ).add_features(validated_data, pj=pj)

    # Add the additional proloaf feature to the input data: historic_load
    # (equal to the load, the first column)
    if pj["model"] == "proloaf" and "historic_load" not in list(
        validated_data_with_features.columns
    ):
        validated_data_with_features["historic_load"] = (
            validated_data_with_features.iloc[:, 0]
        )
        # Make sure horizon is the last column
        temp_cols = validated_data_with_features.columns.tolist()
        new_cols = temp_cols[:-2] + [temp_cols[-1]] + [temp_cols[-2]]
        validated_data_with_features = validated_data_with_features[new_cols]

    # Create objective (NOTE: this is a callable class)
    objective = ObjectiveCreator.create_objective(model_type=pj["model"])

    study, objective = optuna_optimization(
        pj, objective, validated_data_with_features, n_trials
    )

    best_hyperparams = study.best_params
    best_model = study.user_attrs["best_model"]

    logger.info(
        f"Finished hyperparameter optimization, error objective {study.best_value} "
        f"and params {best_hyperparams}"
    )

    # Add quantiles to hyperparams so they are stored with the model info
    if pj["quantiles"]:
        best_hyperparams.update(quantiles=pj["quantiles"])

    # Model specification
    model_specs = ModelSpecificationDataClass(
        id=pj["id"],
        feature_names=list(validated_data_with_features.columns),
        hyper_params=best_hyperparams,
    )

    # If the model type is quantile, train a model with the best parameters for all
    # quantiles (optimization is only done for quantile 0.5)
    if objective.model.can_predict_quantiles:
        best_model, report, modelspecs, _ = train_model_pipeline_core(
            pj=pj, input_data=input_data, model_specs=model_specs
        )

    # Save model and report. Report is always saved to MLflow and optionally to disk
    report = objective.create_report(model=best_model)

    trials = objective.get_trial_track()
    best_trial_number = study.best_trial.number

    return best_model, model_specs, report, trials, best_trial_number, study.best_params
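# Illustrative call of the pipeline above (sketch only, not part of the module):
# ``pj`` is a hypothetical PredictionJobDataClass retrieved from the database layer
# and ``raw_input_data`` is assumed to contain a "load" column.
#
#   (
#       best_model,
#       model_specs,
#       report,
#       trials,
#       best_trial_number,
#       best_params,
#   ) = optimize_hyperparameters_pipeline_core(
#       pj=pj,
#       input_data=raw_input_data,
#       horizons=[0.25, 24.0],  # horizons in hours
#       n_trials=8,
#   )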
def test_train_feature_applicator_correct_order(self):
    # Test for the expected column order of the output
    data_with_features = TrainFeatureApplicator(horizons=[0.25, 24.0]).add_features(
        self.input_data[["load"]]
    )
    self.assertEqual(data_with_features.columns.to_list()[0], "load")
    self.assertEqual(data_with_features.columns.to_list()[-1], "horizon")
import optuna

from openstef.feature_engineering.feature_applicator import TrainFeatureApplicator
from openstef.model.model_creator import ModelCreator
from openstef.model.objective import (
    LGBRegressorObjective,
    LinearRegressorObjective,
    ProLoafRegressorObjective,
    RegressorObjective,
    XGBQuantileRegressorObjective,
    XGBRegressorObjective,
)

input_data = TestData.load("reference_sets/307-train-data.csv")
input_data_with_features = TrainFeatureApplicator(
    horizons=[0.25, 24.0]
).add_features(input_data)
# Subsample every 50th data point to speed up the tests
input_data_with_features = input_data_with_features.iloc[::50, :]

N_TRIALS = 2


class TestRegressorObjective(BaseTestCase):
    def test_call(self):
        model_type = "xgb"
        model = ModelCreator.create_model(model_type)

        objective = RegressorObjective(
            model,
            input_data_with_features,
        )
def test_train_model_pipeline_core_custom_split(self):
    pj = self.pj

    # Test a wrong custom backtest split (unknown function name)
    pj.backtest_split_func = SplitFuncDataClass(
        function="unknow_backtest", arguments={}
    )
    with self.assertRaises(ValueError):
        (
            forecast,
            model,
            train_data,
            validation_data,
            test_data,
        ) = train_model_and_forecast_back_test(
            pj=self.pj,
            modelspecs=self.modelspecs,
            input_data=self.train_input,
            training_horizons=[0.25, 24.0],
            n_folds=4,
        )

    pj.backtest_split_func = SplitFuncDataClass(
        function=lambda data: timeseries_split(data, 0, 24), arguments={}
    )
    with self.assertRaises(ValueError):
        (
            forecast,
            model,
            train_data,
            validation_data,
            test_data,
        ) = train_model_and_forecast_back_test(
            pj=self.pj,
            modelspecs=self.modelspecs,
            input_data=self.train_input,
            training_horizons=[0.25, 24.0],
            n_folds=4,
        )

    # Test a valid custom backtest split
    pj.backtest_split_func = SplitFuncDataClass(
        function=timeseries_split, arguments={"gap": 24}
    )
    (
        forecast,
        model,
        train_data,
        validation_data,
        test_data,
    ) = train_model_and_forecast_back_test(
        pj=self.pj,
        modelspecs=self.modelspecs,
        input_data=self.train_input,
        training_horizons=[0.25, 24.0],
        n_folds=4,
    )
    self.assertTrue("forecast" in forecast.columns)
    self.assertTrue("realised" in forecast.columns)
    self.assertTrue("horizon" in forecast.columns)
    self.assertEqual(sorted(list(forecast.horizon.unique())), [0.25, 24.0])

    # Check that the forecast indeed covers the entire range of the input data
    test_fraction = 0.15
    nb_test = int(np.round(test_fraction * len(self.train_input)))
    validated_data = validation.drop_target_na(
        validation.validate(
            self.pj["id"],
            self.train_input[-nb_test:],
            self.pj["flatliner_treshold"],
        )
    )
    data_with_features = TrainFeatureApplicator(
        horizons=[0.25, 24.0], feature_names=self.modelspecs.feature_names
    ).add_features(validated_data, pj=self.pj)
    self.assertEqual(len(forecast), 4 * len(data_with_features))
def test_train_model_pipeline_core_happy_flow(self):
    """Test happy flow of the train model pipeline.

    NOTE: it is not explained why this is the case, but the input data should not
    contain engineered features (e.g. T-7d), while it can/should include predictors
    (e.g. weather data).
    """
    # Subsample every 50th data point to speed up the test
    train_input = self.train_input.iloc[::50, :]

    for model_type in list(MLModelType) + [__name__ + ".DummyRegressor"]:
        with self.subTest(model_type=model_type):
            pj = self.pj
            pj["model"] = (
                model_type.value if hasattr(model_type, "value") else model_type
            )
            model_specs = self.model_specs
            train_input = self.train_input

            # Use default parameters
            model_specs.hyper_params = {}
            model_specs.hyper_params["max_epochs"] = 1

            # For the linear model we need to choose an imputation strategy to
            # handle missing values
            if model_type == MLModelType.LINEAR:
                model_specs.hyper_params["imputation_strategy"] = "mean"

            model, report, modelspecs, _ = train_model_pipeline_core(
                pj=pj, model_specs=model_specs, input_data=train_input
            )

            # Check if the model was fitted (raises NotFittedError when not fitted)
            self.assertIsNone(sklearn.utils.validation.check_is_fitted(model))

            # Check if the model has a feature_names property
            self.assertIsNotNone(model.feature_names)

            # Check if the model is sklearn compatible
            self.assertTrue(isinstance(model, sklearn.base.BaseEstimator))

            # Check if the report is a Report
            self.assertTrue(isinstance(report, Report))

            # Validate and clean data
            validated_data = validation.drop_target_na(
                validation.validate(pj["id"], train_input, flatliner_threshold=24)
            )

            # Add features
            data_with_features = TrainFeatureApplicator(
                horizons=[0.25, 47.0], feature_names=model_specs.feature_names
            ).add_features(validated_data, pj=pj)

            # Split data
            (
                train_data,
                validation_data,
                test_data,
            ) = split_data_train_validation_test(data_with_features)

            # Not able to generate a feature importance for proloaf,
            # as this is a neural network
            if not pj["model"] == "proloaf":
                importance = model.set_feature_importance()
                self.assertIsInstance(importance, pd.DataFrame)