示例#1
0
    def test_create_containers_2(self):
        """Check that the "_all" key applies one additional regressor to every time-series."""
        # Two monotonic daily series of 30 points each, indexed by date.
        ing_data = DataFrame({
            "a": pandas.date_range('2000-01-01', periods=30),
            "b": np.arange(30, 60),
            "c": np.arange(60, 90)
        }).set_index("a")
        ing_data = add_freq(ing_data, "D")

        input_parameters = {
            "datetime_column_name": "date",
            "index_column_name": "date",
        }
        model_parameters = {
            "test_values": 5,
            "delta_training_percentage": 30,
            "prediction_lags": 10,
            "possible_transformations": "none",
            "models": "mockup",
            "main_accuracy_estimator": "mae",
        }
        # "_all" is the key under test: a single CSV given for all the series.
        additional_regressors = {
            "_all": "test_datasets/test_create_containers_extrareg_d.csv",
        }
        param_config = {
            "input_parameters": input_parameters,
            "model_parameters": model_parameters,
            "additional_regressors": additional_regressors,
        }

        timeseries_containers = create_timeseries_containers(
            ing_data, param_config)

        assert len(timeseries_containers) == 2
        for container in timeseries_containers:
            mockup_characteristics = container.models['mockup'].characteristics
            assert mockup_characteristics['extra_regressors'] == "d"
示例#2
0
    def test_freq_not_set_on_not_datetime_index(self):
        """A frame whose index is not a datetime index must come back unchanged."""
        df = DataFrame(data={"A": [1, 2, 3], "B": [10, 20, 30]}).set_index("A")

        result = add_freq(df)

        assert df.equals(result)
示例#3
0
    def test_daily_freq_imposed(self):
        """Explicitly requesting "D" must set a daily frequency without changing the data."""
        df = get_fake_df(10)
        # Clear the frequency first, so that add_freq is the one setting it.
        df.index.freq = None

        with_freq = add_freq(df, "D")

        assert df.equals(with_freq)
        assert with_freq.index.freq == "D"
示例#4
0
    def test_unclear_freq_set_daily(self):
        """Irregularly spaced timestamps must end up with a daily frequency."""
        # (day, hour, minute) of January 2020; the gaps between days are uneven.
        moments = [(1, 10, 0), (3, 12, 21), (7, 13, 30), (19, 11, 32)]
        dates = [
            pd.Timestamp(
                datetime(year=2020, month=1, day=day, hour=hour, minute=minute))
            for day, hour, minute in moments
        ]

        ts = pd.DataFrame(np.random.randn(4), index=dates)

        with_freq = add_freq(ts)
        assert with_freq.index.freq == "D"
示例#5
0
    def test_daily_freq_normalize(self):
        """Daily data sampled at different hours must be normalized to midnight.

        The input has one point per day, each at a different time of day.
        After `add_freq` the values must be untouched, every index entry must
        sit at 00:00:00 of its day and the index frequency must be "D".
        """
        dates = [pd.Timestamp(datetime(year=2020, month=1, day=1, hour=10, minute=00)),
                 pd.Timestamp(datetime(year=2020, month=1, day=2, hour=12, minute=21)),
                 pd.Timestamp(datetime(year=2020, month=1, day=3, hour=13, minute=30)),
                 pd.Timestamp(datetime(year=2020, month=1, day=4, hour=11, minute=32))]

        ts = pd.DataFrame(np.random.randn(4), index=dates)

        new_ts = add_freq(ts)
        assert ts.iloc[0].equals(new_ts.iloc[0])

        # The expected timestamps are built without the `freq` keyword: it was
        # deprecated in pandas 1.4 and removed in pandas 2.0, and it never took
        # part in Timestamp equality anyway.
        for position, day in enumerate(range(1, 5)):
            expected = pd.Timestamp(year=2020, month=1, day=day)
            assert new_ts.index[position] == expected

        assert new_ts.index.freq == "D"
示例#6
0
    def test_create_containers_onlyvisual(self, xcorr):
        """Without model parameters the containers hold only data and, optionally, xcorr.

        `xcorr` is a truthy/falsy flag deciding whether cross-correlation
        parameters are added to the configuration.
        """
        param_config = {
            "input_parameters": {
                "datetime_column_name": "date",
                "index_column_name": "date",
            },
        }
        if xcorr:
            param_config["xcorr_parameters"] = {
                "xcorr_max_lags": 5,
                "xcorr_extra_regressor_threshold": 0.5,
                "xcorr_mode": "pearson",
                "xcorr_mode_target": "pearson"
            }

        ing_data = DataFrame({
            "a": pandas.date_range('2000-01-01', periods=30),
            "b": np.arange(30, 60),
            "c": np.arange(60, 90)
        }).set_index("a")
        ing_data = add_freq(ing_data, "D")

        timeseries_containers = create_timeseries_containers(
            ing_data, param_config)

        assert len(timeseries_containers) == 2
        for container in timeseries_containers:
            # No model was requested, so nothing can have been predicted.
            assert container.models is None
            assert container.historical_prediction is None

            # Cross-correlation is present exactly when it was requested.
            assert (container.xcorr is not None) == bool(xcorr)

            series_name = container.timeseries_data.columns[0]
            assert container.timeseries_data.equals(ing_data[[series_name]])
示例#7
0
    def test_no_additional_regressors_found(self):
        """Predictions must stay univariate when no additional regressor is available."""
        ing_data = DataFrame({
            "a": pandas.date_range('2000-01-01', periods=30),
            "b": np.arange(30, 60),
            "c": np.random.randint(60, 90, 30)
        }).set_index("a")
        ing_data = add_freq(ing_data, "D")

        xcorr_parameters = {
            "xcorr_max_lags": 5,
            # Pearson will always be below this threshold, so no series can
            # qualify as an extra regressor.
            "xcorr_extra_regressor_threshold": 1.01,
            "xcorr_mode": "pearson",
            "xcorr_mode_target": "pearson"
        }
        model_parameters = {
            "test_values": 2,
            "delta_training_percentage": 20,
            "prediction_lags": 10,
            "possible_transformations": "log_modified,none",
            "models": "mockup",
            "main_accuracy_estimator": "mae",
        }
        param_config = {
            "xcorr_parameters": xcorr_parameters,
            "input_parameters": {},
            "model_parameters": model_parameters,
        }

        timeseries_containers = create_timeseries_containers(
            ingested_data=ing_data, param_config=param_config)

        # The MockUp model forecasts "0" in univariate mode and
        # "number_of_extra_regressors" in multivariate mode: all zeros means
        # that the univariate path was taken.
        forecast_days = pd.date_range(start="2000-01-31",
                                      end="2000-02-09",
                                      freq="D")
        for position in (0, 1):
            best_prediction = timeseries_containers[position].models[
                'mockup'].best_prediction
            for day in forecast_days:
                assert best_prediction.loc[day, "yhat"] == 0.0
示例#8
0
def compute():
    """Compute the COVID-19 Italy containers and dump them to "containers.pkl".

    The function takes no arguments and returns nothing. It:

    1. loads the JSON configuration and configures logging from its
       "verbose" entry;
    2. ingests the data, selects the requested portion and adds the
       "New cases/tests ratio" column;
    3. creates the time-series containers for the ingested data;
    4. downloads the regional CSV, builds a frame of daily cases with one
       column per region and, for each region, appends a container holding
       its cross-correlation and a univariate FBProphet prediction;
    5. pickles the list of containers to "containers.pkl" in the current
       working directory.

    Raises:
        KeyError: if the configuration lacks "verbose" or "xcorr_parameters".
    """
    param_file_nameJSON = 'configurations/configuration_test_covid19italy.json'

    # Load parameters from config file.
    with open(param_file_nameJSON) as json_file:
        param_config = json.load(json_file)

    # Logging: an unknown level name falls back to 0 (NOTSET).
    log_level = getattr(logging, param_config["verbose"], None)
    if not isinstance(log_level, int):
        log_level = 0
    # %(name)s for module name
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                        level=log_level,
                        stream=sys.stdout)

    # Data ingestion
    log.info("Started data ingestion.")
    ingested_data = timexseries.data_ingestion.ingest_timeseries(param_config)

    # Data selection
    log.info("Started data selection.")
    ingested_data = select_timeseries_portion(ingested_data, param_config)

    # Custom columns
    log.info("Adding custom columns.")
    ingested_data["New cases/tests ratio"] = [
        100 * (cases / tests)
        for cases, tests in zip(ingested_data['Daily cases'],
                                ingested_data['Daily tests'])
    ]

    # Data prediction
    containers = create_timeseries_containers(ingested_data=ingested_data,
                                              param_config=param_config)

    ####################################################################################################################
    # Custom time-series #########
    # If you are studying TIMEX code: you can ignore this.
    log.info("Computing the custom time-series.")

    regions = read_csv(
        "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv",
        header=0,
        index_col=0,
        usecols=['data', 'denominazione_regione', 'nuovi_positivi', 'tamponi'])
    regions.reset_index(inplace=True)
    regions['data'] = regions['data'].apply(dateparser.parse)
    regions.set_index(['data', 'denominazione_regione'],
                      inplace=True,
                      drop=True)

    regions = add_diff_columns(regions, ['tamponi'],
                               group_by='denominazione_regione')

    regions.rename(columns={
        'nuovi_positivi': 'Daily cases',
        'tamponi': 'Tests',
        "tamponi_diff": "Daily tests"
    },
                   inplace=True)

    # The ratio is meaningful only when 0 < cases < tests. Use a real NaN (not
    # the string "nan") elsewhere, so that the column stays numeric.
    regions["New cases/tests ratio"] = [
        100 * (cases / tests) if tests > cases > 0 else float("nan")
        for cases, tests in zip(regions['Daily cases'], regions['Daily tests'])
    ]

    # Prediction of "New daily cases" for every region
    # We also want to plot cross-correlation with other regions.
    # So, create a dataFrame with only daily cases and regions as columns.
    regions_names = regions.index.get_level_values(1).unique().sort_values()

    datas = regions.index.get_level_values(0).unique().to_list()
    datas = datas[1:]  # Abruzzo is missing the first day.

    cols = ['data'] + regions_names.to_list()

    daily_cases_regions = DataFrame(columns=cols, dtype=numpy.float64)
    daily_cases_regions['data'] = datas
    daily_cases_regions.set_index(['data'], inplace=True, drop=True)

    for col in daily_cases_regions.columns:
        for i in daily_cases_regions.index:
            # A single .loc[row, column] call writes into the frame itself;
            # the chained form .loc[i][col] may write into a temporary copy.
            daily_cases_regions.loc[i, col] = regions.loc[i,
                                                          col]['Daily cases']

    daily_cases_regions = add_freq(daily_cases_regions, 'D')

    max_lags = param_config['xcorr_parameters']['xcorr_max_lags']
    modes = param_config['xcorr_parameters']["xcorr_mode"].split(",")
    try:
        max_threads = param_config['max_threads']
    except KeyError:
        try:
            max_threads = len(os.sched_getaffinity(0))
        except (AttributeError, OSError):
            # sched_getaffinity does not exist on every platform: fall back to
            # a single thread.
            max_threads = 1

    for region in daily_cases_regions.columns:
        timeseries_data = daily_cases_regions[[region]]

        model_results = {}

        xcorr = calc_xcorr(region, daily_cases_regions, max_lags, modes)

        log.info(f"Computing univariate prediction for {region}...")
        predictor = FBProphetModel(param_config, transformation="none")
        prophet_result = predictor.launch_model(timeseries_data.copy(),
                                                max_threads=max_threads)
        model_results['fbprophet'] = prophet_result

        containers.append(
            TimeSeriesContainer(timeseries_data, model_results, xcorr))

    ####################################################################################################################

    # Save the computed data; these are the TimeSeriesContainer objects from which a nice Dash page can be built.
    # They can be loaded by "app_load_from_dump.py" to start the app
    # without re-computing all the data.
    with open("containers.pkl", 'wb') as output_file:
        pickle.dump(containers, output_file)
示例#9
0
    def test_freq_already_set(self):
        """A frame that already has a frequency must come back unchanged."""
        original = get_fake_df(10)

        result = add_freq(original)

        assert original.equals(result)
示例#10
0
    def test_compute_predictions_3(self, tmp_path):
        """Check historical predictions computed with a delta greater than 1.

        With "delta": 3 the historical predictions are not computed 1-step
        ahead starting from the initial index; they are computed every
        `delta` time points instead. The fbprophet results are compared with
        a Prophet model fitted by hand on the data available at each endpoint.

        Args:
            tmp_path: directory in which the historical predictions file
                "test3.pkl" is saved.
        """
        ing_data = DataFrame({
            "a": pandas.date_range('2000-01-01', periods=30),
            "b": np.arange(30, 60),
            "c": np.arange(60, 90)
        })
        ing_data.set_index("a", inplace=True)
        ing_data = add_freq(ing_data, "D")

        param_config = {
            "input_parameters": {},
            "model_parameters": {
                "test_values": 2,
                "delta_training_percentage": 100,
                "prediction_lags": 10,
                "possible_transformations": "none",
                "models": "fbprophet,mockup",
                "main_accuracy_estimator": "mae",
            },
            "historical_prediction_parameters": {
                "initial_index": "2000-01-20",
                "save_path": os.path.join(tmp_path, "test3.pkl"),
                "delta": 3
            }
        }

        timeseries_containers = compute_historical_predictions(
            ingested_data=ing_data, param_config=param_config)

        assert len(timeseries_containers) == 2
        assert timeseries_containers[0].timeseries_data.columns[0] == "b"
        assert timeseries_containers[1].timeseries_data.columns[0] == "c"

        assert len(timeseries_containers[0].models) == 2
        assert len(timeseries_containers[1].models) == 2

        for s in timeseries_containers:
            scen_name = s.timeseries_data.columns[0]

            # Every model must cover the 10 days following the initial index.
            for model in s.historical_prediction:
                hist_prediction = s.historical_prediction[model]
                assert len(hist_prediction) == 10
                expected_days = pandas.date_range('2000-01-21', periods=10)
                for position, expected_day in enumerate(expected_days):
                    assert hist_prediction.index[position] == expected_day

            # One training endpoint every 3 days: 20, 23, 26 and 29 January.
            for endpoint in pandas.date_range('2000-01-20',
                                              periods=4,
                                              freq="3D"):
                # Training data in the format expected by Prophet. A single
                # .loc selection plus reset_index() gives a fresh frame, so
                # nothing is modified in place on a slice of ing_data.
                fb_tr = ing_data.loc[:endpoint, [scen_name]].reset_index()
                fb_tr.columns = ['ds', 'y']
                fbmodel = Prophet()

                with suppress_stdout_stderr():
                    fbmodel.fit(fb_tr)

                future_df = pd.DataFrame(index=pd.date_range(
                    freq="1D",
                    start=endpoint + pandas.Timedelta(days=1),
                    periods=3),
                                         columns=["yhat"])
                future = future_df.reset_index()
                future.rename(columns={'index': 'ds'}, inplace=True)
                forecast = fbmodel.predict(future)
                forecast.set_index('ds', inplace=True)
                expected_hist_pred = forecast.loc[:, 'yhat']
                expected_hist_pred = expected_hist_pred.astype(object)
                expected_hist_pred.rename(scen_name, inplace=True)
                if endpoint == pd.Timestamp(
                        '2000-01-29 00:00:00'
                ):  # Last point, remove last 2 points
                    expected_hist_pred = expected_hist_pred.iloc[0:1]

                computed_hist_pred = s.historical_prediction['fbprophet'].loc[
                    endpoint + pandas.Timedelta(days=1):endpoint +
                    pandas.Timedelta(days=3), scen_name]

                assert expected_hist_pred.equals(computed_hist_pred)
示例#11
0
    def test_compute_predictions_2(self, tmp_path):
        """Compare the fbprophet best prediction with a Prophet model fitted by hand.

        The reference model is fitted on the last 35 points of the
        'nuovi_positivi' series and its 'yhat' column must equal the best
        prediction stored in the second container returned by
        `compute_historical_predictions`.
        """
        ing_data = pd.read_csv("test_datasets/test_covid.csv")
        ing_data["data"] = ing_data["data"].apply(dateparser.parse)
        ing_data = ing_data.set_index("data", drop=True)
        ing_data = add_freq(ing_data, "D")

        model_parameters = {
            "test_values": 5,
            "delta_training_percentage": 30,
            "prediction_lags": 10,
            "possible_transformations": "none",
            "models": "fbprophet",
            "main_accuracy_estimator": "mae",
        }
        historical_prediction_parameters = {
            "initial_index": "2020-12-08",
            "save_path": os.path.join(tmp_path, "test2.pkl")
        }
        param_config = {
            "input_parameters": {},
            "model_parameters": model_parameters,
            "historical_prediction_parameters": historical_prediction_parameters,
        }

        # You can verify with this code that tr_1 is the best training window.
        # test_values = 5
        # tr_1 = ing_data.copy().iloc[-35:-5][['nuovi_positivi']]
        # tr_2 = ing_data.copy().iloc[-65:-5][['nuovi_positivi']]
        # tr_3 = ing_data.copy().iloc[-95:-5][['nuovi_positivi']]
        # tr_4 = ing_data.copy().iloc[0:-5][['nuovi_positivi']]

        # tr_sets = [tr_1, tr_2, tr_3, tr_4]
        # testing_df = ing_data.copy().iloc[-5:]['nuovi_positivi']
        #
        # for tr in tr_sets:
        #     fb_tr = tr.copy()
        #     fbmodel = Prophet()
        #     fb_tr.reset_index(inplace=True)
        #     fb_tr.columns = ['ds', 'y']
        #
        #     with suppress_stdout_stderr():
        #         fbmodel.fit(fb_tr)
        #
        #     future_df = pd.DataFrame(index=pd.date_range(freq="1d",
        #                                                  start=tr.index.values[0],
        #                                                  periods=len(tr) + test_values + 10),
        #                              columns=["yhat"], dtype=tr.iloc[:, 0].dtype)
        #
        #     future = future_df.reset_index()
        #     future.rename(columns={'index': 'ds'}, inplace=True)
        #
        #     forecast = fbmodel.predict(future)
        #
        #     forecast.set_index('ds', inplace=True)
        #
        #     testing_prediction = forecast.iloc[-15:-10]['yhat']
        #     print(mean_absolute_error(testing_df['nuovi_positivi'], testing_prediction))

        # The best training window is tr_1: fit the reference model on it.
        tr_1 = ing_data.copy().iloc[-35:][['nuovi_positivi']]
        fb_tr = tr_1.reset_index()
        fb_tr.columns = ['ds', 'y']
        fbmodel = Prophet()

        with suppress_stdout_stderr():
            fbmodel.fit(fb_tr)

        # Dates to predict: the training window plus 10 more days.
        future_dates = pd.date_range(freq="1d",
                                     start=tr_1.index.values[0],
                                     periods=len(tr_1) + 10)
        future_df = pd.DataFrame(index=future_dates,
                                 columns=["yhat"],
                                 dtype=tr_1.iloc[:, 0].dtype)
        future = future_df.reset_index().rename(columns={'index': 'ds'})
        forecast = fbmodel.predict(future).set_index('ds')
        historical_prediction = forecast[['yhat']]

        # Let TIMEX do this thing.
        timeseries_containers = compute_historical_predictions(
            ingested_data=ing_data, param_config=param_config)

        timeseries_container = timeseries_containers[1]
        fbprophet_model = timeseries_container.models['fbprophet']
        training_results = fbprophet_model.results
        training_results.sort(key=lambda x: x.testing_performances.MAE)

        assert historical_prediction.equals(
            fbprophet_model.best_prediction[['yhat']])
示例#12
0
    def test_compute_predictions(self, tmp_path):
        """Check the form of historical predictions and their save/load cycle.

        No "delta" is given, so the default of 1 is used. A first run computes
        and saves the historical predictions; a second run, on data extended
        by one day, must load them from file, add the new point and leave the
        old predictions untouched.
        """

        def build_data(periods, b_start, c_start):
            # Daily frame with two monotonic series starting at the given values.
            frame = DataFrame({
                "a": pandas.date_range('2000-01-01', periods=periods),
                "b": np.arange(b_start, b_start + periods),
                "c": np.arange(c_start, c_start + periods)
            })
            frame.set_index("a", inplace=True)
            return add_freq(frame, "D")

        def check_index(containers, expected_days):
            # Every model must have exactly one prediction per expected day.
            for container in containers:
                for model in container.historical_prediction:
                    hist_prediction = container.historical_prediction[model]
                    assert len(hist_prediction) == len(expected_days)
                    for position, day in enumerate(expected_days):
                        assert hist_prediction.index[
                            position] == pandas.to_datetime(day,
                                                            format="%Y-%m-%d")

        ing_data = build_data(30, 30, 60)

        param_config = {
            "xcorr_parameters": {
                "xcorr_max_lags": 120,
                "xcorr_extra_regressor_threshold": 0.8,
                "xcorr_mode": "pearson",
                "xcorr_mode_target": "pearson"
            },
            "input_parameters": {},
            "model_parameters": {
                "test_values": 2,
                "delta_training_percentage": 20,
                "prediction_lags": 10,
                "possible_transformations": "log_modified,none",
                "models": "mockup,fbprophet",
                "main_accuracy_estimator": "mae",
            },
            "historical_prediction_parameters": {
                "initial_index": "2000-01-28",
                "save_path": os.path.join(tmp_path, "test1.pkl")
            }
        }

        timeseries_containers = compute_historical_predictions(
            ingested_data=ing_data, param_config=param_config)

        assert len(timeseries_containers) == 2
        assert timeseries_containers[0].timeseries_data.columns[0] == "b"
        assert timeseries_containers[1].timeseries_data.columns[0] == "c"

        assert len(timeseries_containers[0].models) == 2
        assert len(timeseries_containers[1].models) == 2

        # Keep the first-run predictions for "b" and "c", in container order.
        old_hists = [
            timeseries_containers[0].historical_prediction,
            timeseries_containers[1].historical_prediction,
        ]

        check_index(timeseries_containers, ['2000-01-29', '2000-01-30'])

        # Simulate a 1-step ahead in time, so we have collected a new point.
        # Note that past values are changed as well, so we will check that TIMEX does not change the old predictions.
        ing_data = build_data(31, 20, 35)

        # This time historical predictions will be loaded from file.
        timeseries_containers = compute_historical_predictions(
            ingested_data=ing_data, param_config=param_config)

        check_index(timeseries_containers,
                    ['2000-01-29', '2000-01-30', '2000-01-31'])

        # Check that past predictions have not been touched.
        for position, old_hist in enumerate(old_hists):
            new_hist = timeseries_containers[position].historical_prediction
            for model in ('fbprophet', 'mockup'):
                for row in (0, 1):
                    assert old_hist[model].iloc[row, 0] == new_hist[
                        model].iloc[row, 0]
示例#13
0
    def test_get_best_univariate_and_multivariate_predictions(self):
        """Check the form of the best univariate and multivariate predictions."""

        def check_containers(containers):
            # Two containers, for "b" and "c", each holding both models.
            assert len(containers) == 2
            assert containers[0].timeseries_data.columns[0] == "b"
            assert containers[1].timeseries_data.columns[0] == "c"
            assert len(containers[0].models) == 2
            assert len(containers[1].models) == 2

        def check_last_mockup_prediction(containers, expected):
            # The MockUp forecast tells which mode was used (see below).
            for position in (0, 1):
                best_prediction = containers[position].models[
                    'mockup'].best_prediction
                assert best_prediction.iloc[-1, 0] == expected

        ing_data = DataFrame({
            "a": pandas.date_range('1/1/2000', periods=30),
            "b": np.arange(30, 60),
            "c": np.arange(60, 90)
        }).set_index("a")
        ing_data = add_freq(ing_data, "D")

        param_config = {
            "xcorr_parameters": {
                "xcorr_max_lags": 120,
                # Force predictor to use extra-regressors
                "xcorr_extra_regressor_threshold": 0.0,
                "xcorr_mode": "pearson",
                "xcorr_mode_target": "pearson"
            },
            "model_parameters": {
                "test_values": 2,
                "delta_training_percentage": 20,
                "prediction_lags": 10,
                "possible_transformations": "log_modified,none",
                "models": "fbprophet,mockup",
                "main_accuracy_estimator": "mae",
            }
        }

        total_xcorr = calc_all_xcorr(ingested_data=ing_data,
                                     param_config=param_config)

        best_transformations, timeseries_containers = get_best_univariate_predictions(
            ing_data, param_config, total_xcorr)

        allowed_transformations = ["log_modified", "none"]
        assert len(best_transformations) == 2
        for model in ("fbprophet", "mockup"):
            for series in ("b", "c"):
                assert best_transformations[model][
                    series] in allowed_transformations

        # Small trick: fool TIMEX in thinking that none is the best transformation for MockUp model. This way
        # we can check its predictions, which are hardcoded and always 0.0 for univariate and len(extra_regressors) for
        # multivariate... with log_modified values would not be exactly len(extra_regressors).
        best_transformations["mockup"]["b"] = "none"
        best_transformations["mockup"]["c"] = "none"

        check_containers(timeseries_containers)
        # Check predictions are univariate
        check_last_mockup_prediction(timeseries_containers, 0.0)

        timeseries_containers = get_best_multivariate_predictions(
            best_transformations=best_transformations,
            ingested_data=ing_data,
            timeseries_containers=timeseries_containers,
            param_config=param_config,
            total_xcorr=total_xcorr)

        # Check predictions are multivariate
        check_last_mockup_prediction(timeseries_containers, 1.0)
        check_containers(timeseries_containers)
示例#14
0
    def test_create_containers(self, historical_predictions, xcorr,
                               additional_regressors,
                               expected_extra_regressors, expected_value,
                               tmp_path):
        """Check container creation for several combinations of optional parameters.

        Args:
            historical_predictions: if truthy, "historical_prediction_parameters"
                are added to the configuration.
            xcorr: if truthy, "xcorr_parameters" are added to the configuration.
            additional_regressors: if truthy, one additional regressors CSV is
                configured for "b" and one for "c".
            expected_extra_regressors: mapping from series name to the expected
                'extra_regressors' characteristic of the mockup model; an empty
                dict skips this check.
            expected_value: value compared with `.all()` of the historical
                predictions from 2000-01-15 onwards.
            tmp_path: directory in which historical predictions are saved.
        """
        param_config = {
            "input_parameters": {
                "datetime_column_name": "date",
                "index_column_name": "date",
            },
            "model_parameters": {
                "test_values": 5,
                "delta_training_percentage": 30,
                "prediction_lags": 10,
                "possible_transformations": "none",
                "models": "mockup",
                "main_accuracy_estimator": "mae",
            },
        }

        if historical_predictions:
            param_config["historical_prediction_parameters"] = {
                "initial_index": "2000-01-15",
                "save_path": os.path.join(tmp_path,
                                          "test_create_containers.pkl")
            }

        if xcorr:
            param_config["xcorr_parameters"] = {
                "xcorr_max_lags": 5,
                "xcorr_extra_regressor_threshold":
                0.0,  # Force the predictor to use it
                "xcorr_mode": "pearson",
                "xcorr_mode_target": "pearson"
            }

        if additional_regressors:
            param_config["additional_regressors"] = {
                "b": "test_datasets/test_create_containers_extrareg_d.csv",
                "c": "test_datasets/test_create_containers_extrareg_e.csv",
            }

        # Having values like 30 -> 60 or 60 -> 90 will make multivariate Mockup model always win on the univariate one
        # because it will return the number of used extra-regressors (the more the lower MAE).
        ing_data = DataFrame({
            "a": pandas.date_range('2000-01-01', periods=30),
            "b": np.arange(30, 60),
            "c": np.arange(60, 90)
        })
        ing_data.set_index("a", inplace=True)
        ing_data = add_freq(ing_data, "D")

        timeseries_containers = create_timeseries_containers(
            ing_data, param_config)

        assert len(timeseries_containers) == 2
        for container in timeseries_containers:
            name = container.timeseries_data.columns[0]

            if xcorr:
                # isinstance is the idiomatic type check; it also accepts
                # dict subclasses.
                assert isinstance(container.xcorr, dict)

            if expected_extra_regressors != {}:
                assert container.models['mockup'].characteristics[
                    'extra_regressors'] == expected_extra_regressors[name]

            if historical_predictions:
                hp = container.historical_prediction['mockup']
                assert hp.loc[
                    pandas.to_datetime('2000-01-15', format="%Y-%m-%d"):,
                    name].all() == expected_value
            else:
                assert container.historical_prediction is None