예제 #1
0
def test_model_selector_holidays(country_code_column, country_code,
                                 holiday_step, error):
    """Verify holiday wiring performed by ``create_gridsearch``.

    When ``error`` is provided, ``create_gridsearch`` must raise it.
    Otherwise the pipeline's "holiday" step must be an instance of
    ``holiday_step`` and — unless the step is plain ``str`` — every inner
    step must be a ``HolidayTransformer`` configured with the expected
    country code / country-code column.
    """
    ms = ModelSelector(frequency="D",
                       horizon=1,
                       country_code_column=country_code_column)

    if error is not None:
        with pytest.raises(error):
            ms.create_gridsearch(country_code=country_code)
    else:
        ms.create_gridsearch(country_code=country_code)
        # Hoist the deep lookup once instead of repeating it per assertion.
        holiday = ms.grid_search.estimator.named_steps["holiday"]
        assert isinstance(holiday, holiday_step)

        if holiday_step is not str:
            # Normalize scalar-or-list parametrization to lists so they can
            # be zipped against the pipeline's inner steps.
            country_codes = ([country_code] if isinstance(country_code, str)
                             else (country_code or []))
            country_code_columns = (
                [country_code_column]
                if isinstance(country_code_column, str)
                else (country_code_column or []))

            # Generator expressions: no throwaway list inside all().
            assert all(isinstance(step[1], HolidayTransformer)
                       for step in holiday.steps)
            assert all(step[1].country_code == code
                       for step, code in zip(holiday.steps, country_codes))
            assert all(step[1].country_code_column == col
                       for step, col in zip(holiday.steps,
                                            country_code_columns))
예제 #2
0
def test_model_selector(tmp_path):
    """End-to-end smoke test for ModelSelector.

    Builds a grid search over a small synthetic multi-partition dataset,
    runs model selection, persists the results and verifies they round-trip
    through ``load_model_selector`` unchanged (order of partitions is not
    guaranteed after loading, only membership).
    """
    n_regions = 1
    n_plants = 1
    n_products = 2
    target_col_name = "Quantity"
    persist_path = os.path.join(tmp_path, "results")

    df = generate_multiple_tsdata(
        n_dates=200, n_regions=n_regions, n_plants=n_plants, n_products=n_products
    )
    ms = ModelSelector(frequency="D", horizon=1, country_code_column="Country")
    assert ms.horizon == 1
    ms.create_gridsearch(
        n_splits=1,
        prophet_models=True,
        sklearn_models=False,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,
        tbats_models=False,
        exp_smooth_models=False,
        average_ensembles=False,
        stacking_ensembles=False,
        exog_cols=["Raining"],
    )
    assert hasattr(ms, "grid_search")
    ms.add_model_to_gridsearch(get_sklearn_wrapper(LinearRegression))
    ms.select_model(
        df=df,
        target_col_name=target_col_name,
        partition_columns=["Region", "Plant", "Product"],
    )

    # One result/partition per (region, plant, product) combination.
    assert len(ms.results) == n_regions * n_plants * n_products
    assert len(ms.partitions) == n_regions * n_plants * n_products

    ms.persist_results(persist_path)

    print(ms.partitions)

    ms_load = load_model_selector(folder_path=persist_path)

    # we do not ensure the same order of results and partitions after loading,
    # thus checking they are all there
    assert all(partition in ms_load.partitions for partition in ms.partitions)
    # TODO redefine __eq__ for ModelSelectorResult to str(MSR).__dict__?
    assert all(
        str(ms_load.get_result_for_partition(partition).__dict__)
        == str(ms.get_result_for_partition(partition).__dict__)
        for partition in ms.partitions
    )
    assert ms.horizon == ms_load.horizon
    assert ms.frequency == ms_load.frequency
예제 #3
0
def test_model_selector_holidays(country_code_column, country_code,
                                 holiday_step, error):
    """The holiday step built by ``create_gridsearch`` either raises the
    expected error or exposes the configured country code and column."""
    ms = ModelSelector(frequency="D",
                       horizon=1,
                       country_code_column=country_code_column)

    if error is None:
        ms.create_gridsearch(country_code=country_code)
        holiday = ms.grid_search.estimator.named_steps["holiday"]
        assert isinstance(holiday, holiday_step)

        if holiday_step is not str:
            assert holiday.country_code == country_code
            assert holiday.country_code_column == country_code_column
    else:
        with pytest.raises(error):
            ms.create_gridsearch(country_code=country_code)
예제 #4
0
def test_model_selector(tmp_path):
    """End-to-end test for ModelSelector.

    Covers: error behavior of result accessors before any run, grid-search
    construction, model selection over synthetic partitioned data,
    persistence round-trip via ``load_model_selector``, plotting helpers,
    reprs and result attributes.
    """
    n_regions = 1
    n_plants = 1
    n_products = 2
    target_col_name = "Quantity"
    persist_path = os.path.join(tmp_path, "results")

    df = generate_multiple_tsdata(n_dates=200,
                                  n_regions=n_regions,
                                  n_plants=n_plants,
                                  n_products=n_products)
    ms = ModelSelector(frequency="D", horizon=1, country_code_column="Country")

    # Accessors must fail loudly before select_model has populated them.
    with pytest.raises(ValueError):
        ms.results
    with pytest.raises(ValueError):
        ms.partitions
    with pytest.raises(ValueError):
        ms.stored_path
    with pytest.raises(ValueError):
        ms.get_result_for_partition(partition="non existing partition")
    assert ms.horizon == 1

    ms.create_gridsearch(
        n_splits=1,
        prophet_models=True,
        sklearn_models=False,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,
        tbats_models=False,
        exp_smooth_models=False,
        average_ensembles=False,
        stacking_ensembles=False,
        exog_cols=["Raining"],
    )
    assert hasattr(ms, "grid_search")
    assert isinstance(ms.grid_search.estimator.named_steps["holiday"],
                      HolidayTransformer)

    ms.add_model_to_gridsearch(get_sklearn_wrapper(LinearRegression))
    ms.select_model(
        df=df,
        target_col_name=target_col_name,
        partition_columns=["Region", "Plant", "Product"],
    )

    # One result/partition per (region, plant, product) combination.
    assert len(ms.results) == n_regions * n_plants * n_products
    assert len(ms.partitions) == n_regions * n_plants * n_products

    ms.persist_results(persist_path)

    print(ms.partitions)

    ms_load = load_model_selector(folder_path=persist_path)

    # we do not ensure the same order of results and partitions after loading,
    # thus checking they are all there
    assert all(partition in ms_load.partitions for partition in ms.partitions)
    # TODO redefine __eq__ for ModelSelectorResult to str(MSR).__dict__?
    assert all(
        str(ms_load.get_result_for_partition(partition).__dict__) == str(
            ms.get_result_for_partition(partition).__dict__)
        for partition in ms.partitions
    )
    assert ms.horizon == ms_load.horizon
    assert ms.frequency == ms_load.frequency

    ms.plot_best_wrapper_classes()
    ms.plot_results()
    assert "ModelSelector" in repr(ms)
    assert "ModelSelectorResults" in repr(ms)
    assert "ModelSelectorResult" in repr(ms.results[0])

    with pytest.raises(ValueError):
        ms.results[0].persist(attribute_name="non_existing_attribute")

    assert ms.results[0].cv_splits_overlap is False

    ms.results[0].plot_error()
예제 #5
0
def seleciona_modelo_horizonte3(dtf_train='',
                                target_col='',
                                seed=42,
                                horizonte=1,
                                exog_features_list='',
                                lags=3,
                                pack=''):
    """Select and train the best model for a `horizonte`-day-ahead forecast.

    Builds a ModelSelector grid search, adds exactly one sklearn-wrapped
    model chosen by ``target_col``'s membership in the ``pack`` collections,
    runs model selection on ``dtf_train`` and persists the results.

    Parameters
    ----------
    dtf_train : pandas.DataFrame
        Training data: datetime index, numeric ``target_col`` column; all
        other columns are treated as exogenous features.
    target_col : str
        Name of the column to forecast.
    seed : int
        Random state forwarded to the Ridge wrapper.
    horizonte : int
        Forecast horizon in days.
    exog_features_list : list
        Column names to use as exogenous regressors.
    lags : int
        Number of autoregressive lags for the sklearn wrappers.
    pack : tuple
        Four collections ``(huber, ridge, xgb_sq, xgb_hb)`` of target-column
        names used to pick the model family.

    Returns
    -------
    tuple
        ``(ms, best_model, result)`` — the fitted ModelSelector, the winning
        model, and the ModelSelectorResult for the first partition.
    """
    ms = ModelSelector(
        horizon=horizonte,
        frequency='D',
        # Must stay None — passing 'country' causes a runtime error here.
        country_code_column=None,
    )
    ms.create_gridsearch(
        sklearn_models=False,
        n_splits=2,  # cross-validation splits
        between_split_lag=None,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,  # works with pmdarima==1.5.3, but unused
        prophet_models=False,  # disabled: NeuralProphet is used instead
        tbats_models=False,  # TBATS works OK (pip install tbats)
        exp_smooth_models=False,  # exp_smooth works OK
        average_ensembles=False,  # average ensembles work OK
        stacking_ensembles=False,  # too slow and gives no improvement
        exog_cols=exog_features_list,
    )

    # Pick the single model family configured for this target column and add
    # it to the grid search (one call site instead of four duplicates).
    huber, ridge, xgb_sq, xgb_hb = pack
    if target_col in xgb_hb:
        model = get_sklearn_wrapper(
            XGBRegressor, lags=lags, objective="reg:pseudohubererror")
        model.name = 'XGBRegressor_Huber'
    elif target_col in xgb_sq:
        model = get_sklearn_wrapper(XGBRegressor, lags=lags)
        model.name = 'XGBRegressor_Squared_Loss'
    elif target_col in ridge:
        model = get_sklearn_wrapper(Ridge, random_state=seed, lags=lags)
        model.name = 'Ridge'
    else:
        model = get_sklearn_wrapper(HuberRegressor, max_iter=160)
        model.name = 'Huber'
    ms.add_model_to_gridsearch(model)

    # `select_model` creates a forecast for each combination of
    # `partition_columns` (None here: a single series) and runs the grid
    # search above on each resulting time series. All columns except the
    # target are used as exogenous variables.
    ms.select_model(
        df=dtf_train,
        target_col_name=target_col,
        partition_columns=None,
    )

    ms.persist_results('results')

    # ============================  Train model  ==================================
    result = ms.results[0]
    print('Model selection result: \n', str(result))
    best_model = result.best_model
    return ms, best_model, result