import os

import pytest
from sklearn.linear_model import HuberRegressor, LinearRegression, Ridge
from xgboost import XGBRegressor

from hcrystalball.feature_extraction import HolidayTransformer
from hcrystalball.model_selection import ModelSelector, load_model_selector
from hcrystalball.utils import generate_multiple_tsdata
from hcrystalball.wrappers import get_sklearn_wrapper


# Variant covering lists of country codes / country-code columns; parameters
# are supplied via pytest parametrization.
def test_model_selector_holidays_multiple_countries(country_code_column, country_code, holiday_step, error):
    ms = ModelSelector(frequency="D", horizon=1, country_code_column=country_code_column)
    if error is not None:
        with pytest.raises(error):
            ms.create_gridsearch(country_code=country_code)
    else:
        ms.create_gridsearch(country_code=country_code)
        assert isinstance(ms.grid_search.estimator.named_steps["holiday"], holiday_step)
        if holiday_step is not str:
            country_codes = [country_code] if isinstance(country_code, str) else (country_code or [])
            country_code_columns = (
                [country_code_column] if isinstance(country_code_column, str) else (country_code_column or [])
            )
            holiday = ms.grid_search.estimator.named_steps["holiday"]
            assert all(isinstance(step[1], HolidayTransformer) for step in holiday.steps)
            assert all(
                step[1].country_code == code
                for step, code in zip(holiday.steps, country_codes)
            )
            assert all(
                step[1].country_code_column == col
                for step, col in zip(holiday.steps, country_code_columns)
            )
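# Invocation sketch for the multi-country variant above; the tuples are
# assumptions for illustration (in the real suite they would come from
# @pytest.mark.parametrize). The assumption: a list of country codes or
# country-code columns turns the "holiday" step into a sklearn Pipeline of
# HolidayTransformers, while no country information leaves the string
# "passthrough" (hence the `str` case guarded against above).
from sklearn.pipeline import Pipeline


def _run_multiple_countries_examples():
    for column, code, step, error in [
        (["Country1", "Country2"], None, Pipeline, None),  # one transformer per column
        (None, ["DE", "BE"], Pipeline, None),              # explicit country codes
        (None, None, str, None),                           # no holidays -> "passthrough"
    ]:
        test_model_selector_holidays_multiple_countries(column, code, step, error)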
# Basic end-to-end run; renamed so it does not shadow the fuller
# test_model_selector below.
def test_model_selector_basic(tmp_path):
    n_regions = 1
    n_plants = 1
    n_products = 2
    target_col_name = "Quantity"
    persist_path = os.path.join(tmp_path, "results")

    df = generate_multiple_tsdata(
        n_dates=200, n_regions=n_regions, n_plants=n_plants, n_products=n_products
    )
    ms = ModelSelector(frequency="D", horizon=1, country_code_column="Country")

    assert ms.horizon == 1

    ms.create_gridsearch(
        n_splits=1,
        prophet_models=True,
        sklearn_models=False,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,
        tbats_models=False,
        exp_smooth_models=False,
        average_ensembles=False,
        stacking_ensembles=False,
        exog_cols=["Raining"],
    )
    assert hasattr(ms, "grid_search")

    ms.add_model_to_gridsearch(get_sklearn_wrapper(LinearRegression))
    ms.select_model(
        df=df,
        target_col_name=target_col_name,
        partition_columns=["Region", "Plant", "Product"],
    )
    assert len(ms.results) == n_regions * n_plants * n_products
    assert len(ms.partitions) == n_regions * n_plants * n_products

    ms.persist_results(persist_path)
    ms_load = load_model_selector(folder_path=persist_path)

    # The order of results and partitions is not guaranteed after loading,
    # so only check that they are all present.
    assert all(partition in ms_load.partitions for partition in ms.partitions)
    # TODO redefine __eq__ for ModelSelectorResult to str(MSR).__dict__?
    assert all(
        str(ms_load.get_result_for_partition(partition).__dict__)
        == str(ms.get_result_for_partition(partition).__dict__)
        for partition in ms.partitions
    )
    assert ms.horizon == ms_load.horizon
    assert ms.frequency == ms_load.frequency
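# For orientation, a small sketch peeking at the generated data used above.
# Column roles are inferred from the calls in these tests: Region / Plant /
# Product partition the series, Country carries the country code, Raining is
# exogenous and Quantity is the target; the index is a daily DatetimeIndex.
def _peek_generated_data():
    df = generate_multiple_tsdata(n_dates=5, n_regions=1, n_plants=1, n_products=1)
    print(df.head())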
def test_model_selector_holidays(country_code_column, country_code, holiday_step, error):
    ms = ModelSelector(frequency="D", horizon=1, country_code_column=country_code_column)
    if error is not None:
        with pytest.raises(error):
            ms.create_gridsearch(country_code=country_code)
    else:
        ms.create_gridsearch(country_code=country_code)
        assert isinstance(ms.grid_search.estimator.named_steps["holiday"], holiday_step)
        if holiday_step is not str:
            assert ms.grid_search.estimator.named_steps["holiday"].country_code == country_code
            assert ms.grid_search.estimator.named_steps["holiday"].country_code_column == country_code_column
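# Companion sketch for the scalar variant above (tuples are again
# illustrative assumptions, standing in for pytest parametrization):
# a single country code or country-code column yields one HolidayTransformer
# carrying exactly those attributes.
def _run_single_country_examples():
    for column, code, step, error in [
        ("Country", None, HolidayTransformer, None),  # code resolved from a column
        (None, "DE", HolidayTransformer, None),       # fixed country code
        (None, None, str, None),                      # no holidays -> "passthrough"
    ]:
        test_model_selector_holidays(column, code, step, error)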
def test_model_selector(tmp_path):
    n_regions = 1
    n_plants = 1
    n_products = 2
    target_col_name = "Quantity"
    persist_path = os.path.join(tmp_path, "results")

    df = generate_multiple_tsdata(
        n_dates=200, n_regions=n_regions, n_plants=n_plants, n_products=n_products
    )
    ms = ModelSelector(frequency="D", horizon=1, country_code_column="Country")

    # Accessing results before `select_model` has run should raise.
    with pytest.raises(ValueError):
        ms.results
    with pytest.raises(ValueError):
        ms.partitions
    with pytest.raises(ValueError):
        ms.stored_path
    with pytest.raises(ValueError):
        ms.get_result_for_partition(partition="non existing partition")

    assert ms.horizon == 1

    ms.create_gridsearch(
        n_splits=1,
        prophet_models=True,
        sklearn_models=False,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,
        tbats_models=False,
        exp_smooth_models=False,
        average_ensembles=False,
        stacking_ensembles=False,
        exog_cols=["Raining"],
    )
    assert hasattr(ms, "grid_search")
    assert isinstance(ms.grid_search.estimator.named_steps["holiday"], HolidayTransformer)

    ms.add_model_to_gridsearch(get_sklearn_wrapper(LinearRegression))
    ms.select_model(
        df=df,
        target_col_name=target_col_name,
        partition_columns=["Region", "Plant", "Product"],
    )
    assert len(ms.results) == n_regions * n_plants * n_products
    assert len(ms.partitions) == n_regions * n_plants * n_products

    ms.persist_results(persist_path)
    ms_load = load_model_selector(folder_path=persist_path)

    # The order of results and partitions is not guaranteed after loading,
    # so only check that they are all present.
    assert all(partition in ms_load.partitions for partition in ms.partitions)
    # TODO redefine __eq__ for ModelSelectorResult to str(MSR).__dict__?
    assert all(
        str(ms_load.get_result_for_partition(partition).__dict__)
        == str(ms.get_result_for_partition(partition).__dict__)
        for partition in ms.partitions
    )
    assert ms.horizon == ms_load.horizon
    assert ms.frequency == ms_load.frequency

    ms.plot_best_wrapper_classes()
    ms.plot_results()

    assert "ModelSelector" in repr(ms)
    assert "ModelSelectorResults" in repr(ms)
    assert "ModelSelectorResult" in repr(ms.results[0])

    with pytest.raises(ValueError):
        ms.results[0].persist(attribute_name="non_existing_attribute")

    assert ms.results[0].cv_splits_overlap is False

    ms.results[0].plot_error()
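# Minimal sketch of inspecting persisted results outside the test above;
# `persist_path` is whatever directory `persist_results` wrote to, and
# `result.best_model` is the attribute used in the helper further below.
def _inspect_persisted_results(persist_path):
    ms_load = load_model_selector(folder_path=persist_path)
    for partition in ms_load.partitions:
        result = ms_load.get_result_for_partition(partition)
        print(partition, result.best_model)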
def seleciona_modelo_horizonte3(
    dtf_train='', target_col='', seed=42, horizonte=1, exog_features_list='', lags=3, pack=''
):
    # if horizonte < 3:
    #     lags = horizonte
    ms = ModelSelector(
        horizon=horizonte,
        frequency='D',
        country_code_column=None,  # 'country' -> leave as None, otherwise a runtime error occurs
    )

    ms.create_gridsearch(
        sklearn_models=False,
        n_splits=2,  # cross-validation splits
        between_split_lag=None,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,  # Autosarimax now works, with pmdarima==1.5.3
        prophet_models=False,  # Do not enable; NeuralProphet is used instead
        tbats_models=False,  # TBATS works OK (pip install tbats)
        exp_smooth_models=False,  # exp_smooth works OK
        average_ensembles=False,  # average_ensembles works OK
        stacking_ensembles=False,  # Not used: takes too long and does not give good results
        exog_cols=exog_features_list,
        # exog_cols=None,
        # holidays_days_before=2,
        # holidays_days_after=1,
        # holidays_bridge_days=True,
    )
    # ms.add_model_to_gridsearch(NeuralProphetWrapper(exog_cols=exog_features.columns.tolist()))

    use_scikit = True
    huber, ridge, xgb_sq, xgb_hb = pack
    if use_scikit:
        if target_col in xgb_hb:
            xgb_r = get_sklearn_wrapper(XGBRegressor, lags=lags, objective="reg:pseudohubererror")  # , random_state=seed
            # xgb_r = get_sklearn_wrapper(XGBRegressor, lags=lags, objective="reg:tweedie")  # , random_state=seed
            xgb_r.name = 'XGBRegressor_Huber'
            ms.add_model_to_gridsearch(xgb_r)
        elif target_col in xgb_sq:
            xgb_r = get_sklearn_wrapper(XGBRegressor, lags=lags)  # , random_state=seed
            # xgb_r = get_sklearn_wrapper(XGBRegressor, lags=lags, objective="reg:tweedie")  # , random_state=seed
            xgb_r.name = 'XGBRegressor_Squared_Loss'
            ms.add_model_to_gridsearch(xgb_r)
        elif target_col in ridge:
            ridge_r = get_sklearn_wrapper(Ridge, random_state=seed, lags=lags)
            ridge_r.name = 'Ridge'
            ms.add_model_to_gridsearch(ridge_r)
        else:
            huber_r = get_sklearn_wrapper(HuberRegressor, max_iter=160)
            huber_r.name = 'Huber'
            ms.add_model_to_gridsearch(huber_r)

    # `select_model` does the majority of the magic for you: it creates a forecast for each
    # combination of columns specified in `partition_columns`, and for each of those time series
    # it runs the grid search defined above. Optionally, one can select a list of columns over
    # which the model selection will run in parallel using prefect (`parallel_over_columns`).
    # The required data format is a datetime index and, unsurprisingly, a numerical column for
    # `target_col_name`; all other columns except `partition_columns` will be used as exogenous
    # variables, i.e. as additional features for modeling.
    ms.select_model(
        df=dtf_train,
        target_col_name=target_col,
        partition_columns=None,
        # parallel_over_columns=['Assortment'],
        # persist_model_selector_results=False,
        # output_path='my_results',
        # executor=LocalDaskExecutor(),
    )
    ms.persist_results('results')
    # mlflow.log_metric("score", 0.75)

    # ============================ Train model ==================================
    result = ms.results[0]
    print('Model selection result: \n', str(result))
    best_model = result.best_model
    return ms, best_model, result
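# Hypothetical usage of the helper above; the frame layout, target name and
# `pack` grouping are illustrative assumptions, not taken from the original code.
def _example_usage():
    dtf_train = generate_multiple_tsdata(n_dates=120, n_regions=1, n_plants=1, n_products=1)
    # Keep a single series: drop the partition and country columns, leaving
    # the exogenous column Raining and the target Quantity.
    dtf_train = dtf_train.drop(columns=["Region", "Plant", "Product", "Country"])
    # pack = (huber, ridge, xgb_sq, xgb_hb); a target matching none of the
    # ridge/xgb lists falls through to the HuberRegressor branch.
    pack = (["Quantity"], [], [], [])
    ms, best_model, result = seleciona_modelo_horizonte3(
        dtf_train=dtf_train,
        target_col="Quantity",
        horizonte=3,
        exog_features_list=["Raining"],
        lags=3,
        pack=pack,
    )
    print(best_model)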