def test_create_containers_2(self):
    """Check the "_all" key of "additional_regressors".

    A single regressor file is registered under "_all"; every resulting
    container is then expected to report "d" as its extra regressor.
    """
    # Two daily time-series ("b" and "c") indexed by 30 consecutive days.
    days = pandas.date_range('2000-01-01', periods=30)
    ing_data = DataFrame({
        "a": days,
        "b": np.arange(30, 60),
        "c": np.arange(60, 90),
    })
    ing_data.set_index("a", inplace=True)
    ing_data = add_freq(ing_data, "D")

    input_parameters = {
        "datetime_column_name": "date",
        "index_column_name": "date",
    }
    model_parameters = {
        "test_values": 5,
        "delta_training_percentage": 30,
        "prediction_lags": 10,
        "possible_transformations": "none",
        "models": "mockup",
        "main_accuracy_estimator": "mae",
    }
    param_config = {
        "input_parameters": input_parameters,
        "model_parameters": model_parameters,
        "additional_regressors": {
            "_all": "test_datasets/test_create_containers_extrareg_d.csv",
        },
    }

    timeseries_containers = create_timeseries_containers(ing_data, param_config)

    assert len(timeseries_containers) == 2
    for container in timeseries_containers:
        characteristics = container.models['mockup'].characteristics
        assert characteristics['extra_regressors'] == "d"
def test_freq_not_set_on_not_datetime_index(self):
    """A frame whose index is not a datetime index must not be touched by add_freq."""
    df = DataFrame(data={"A": [1, 2, 3], "B": [10, 20, 30]}).set_index("A")

    untouched = add_freq(df)

    assert df.equals(untouched)
def test_daily_freq_imposed(self):
    """Passing "D" explicitly must set a daily frequency on an index whose freq was cleared.

    NOTE(review): relies on get_fake_df returning a daily-indexed frame;
    that helper is not visible here.
    """
    frame = get_fake_df(10)
    frame.index.freq = None  # Drop any frequency so add_freq has to impose it.

    with_freq = add_freq(frame, "D")

    assert frame.equals(with_freq)
    assert with_freq.index.freq == "D"
def test_unclear_freq_set_daily(self):
    """An index with irregular gaps (no clear frequency) must end up with a daily freq."""
    # (day, hour, minute) of January 2020; the gaps are 2, 4 and 12 days.
    moments = [(1, 10, 00), (3, 12, 21), (7, 13, 30), (19, 11, 32)]
    dates = [
        pd.Timestamp(datetime(year=2020, month=1, day=day, hour=hour, minute=minute))
        for day, hour, minute in moments
    ]
    ts = pd.DataFrame(np.random.randn(4), index=dates)

    new_ts = add_freq(ts)

    assert new_ts.index.freq == "D"
def test_daily_freq_normalize(self):
    """Daily data recorded at different hours must be normalized to midnight, daily freq.

    The four input timestamps fall on consecutive days but at different
    times of day. After add_freq the index is expected to hold the same
    days at 00:00:00 and to carry a "D" frequency; the data must be unchanged.
    """
    # (day, hour, minute) of January 2020: consecutive days, varying time of day.
    moments = [(1, 10, 00), (2, 12, 21), (3, 13, 30), (4, 11, 32)]
    dates = [
        pd.Timestamp(datetime(year=2020, month=1, day=day, hour=hour, minute=minute))
        for day, hour, minute in moments
    ]
    ts = pd.DataFrame(np.random.randn(4), index=dates)

    new_ts = add_freq(ts)

    assert ts.iloc[0].equals(new_ts.iloc[0])

    # The expected timestamps are built without the `freq` argument: the
    # Timestamp constructor no longer accepts it in pandas 2.0+. The
    # frequency is verified on the index itself in the last assertion.
    expected_days = ('2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04')
    for position, day in enumerate(expected_days):
        assert new_ts.index[position] == pd.Timestamp(f'{day} 00:00:00')

    assert new_ts.index.freq == "D"
def test_create_containers_onlyvisual(self, xcorr):
    """Containers built without "model_parameters" carry data only.

    No models and no historical predictions are expected; the
    cross-correlation is expected only when `xcorr` is truthy (in which
    case "xcorr_parameters" is added to the configuration).
    """
    param_config = {
        "input_parameters": {
            "datetime_column_name": "date",
            "index_column_name": "date",
        },
    }
    if xcorr:
        param_config["xcorr_parameters"] = {
            "xcorr_max_lags": 5,
            "xcorr_extra_regressor_threshold": 0.5,
            "xcorr_mode": "pearson",
            "xcorr_mode_target": "pearson",
        }

    days = pandas.date_range('2000-01-01', periods=30)
    ing_data = DataFrame({
        "a": days,
        "b": np.arange(30, 60),
        "c": np.arange(60, 90),
    })
    ing_data.set_index("a", inplace=True)
    ing_data = add_freq(ing_data, "D")

    timeseries_containers = create_timeseries_containers(ing_data, param_config)

    assert len(timeseries_containers) == 2
    for container in timeseries_containers:
        name = container.timeseries_data.columns[0]
        assert container.models is None
        assert container.historical_prediction is None
        # xcorr must be present exactly when it was requested.
        assert (container.xcorr is not None) == bool(xcorr)
        assert container.timeseries_data.equals(ing_data[[name]])
def test_no_additional_regressors_found(self):
    """No multivariate prediction must be used when no additional regressor qualifies.

    The cross-correlation threshold is set to 1.01, which a Pearson
    coefficient cannot reach, so no extra regressor can be selected.
    Per the MockUp model convention it forecasts "0" in univariate mode and
    "number_of_extra_regressors" in multivariate mode, hence every forecast
    is expected to be 0.0.
    """
    param_config = {
        "xcorr_parameters": {
            "xcorr_max_lags": 5,
            "xcorr_extra_regressor_threshold": 1.01,  # Pearson will be < this threshold.
            "xcorr_mode": "pearson",
            "xcorr_mode_target": "pearson",
        },
        "input_parameters": {},
        "model_parameters": {
            "test_values": 2,
            "delta_training_percentage": 20,
            "prediction_lags": 10,
            "possible_transformations": "log_modified,none",
            "models": "mockup",
            "main_accuracy_estimator": "mae",
        },
    }

    ing_data = DataFrame({
        "a": pandas.date_range('2000-01-01', periods=30),
        "b": np.arange(30, 60),
        "c": np.random.randint(60, 90, 30),
    })
    ing_data.set_index("a", inplace=True)
    ing_data = add_freq(ing_data, "D")

    timeseries_containers = create_timeseries_containers(
        ingested_data=ing_data, param_config=param_config)

    # The 10 forecast days that follow the 30 days of data.
    forecast_days = pd.date_range(start="2000-01-31", end="2000-02-09", freq="D")
    predictions = [
        timeseries_containers[position].models['mockup'].best_prediction
        for position in (0, 1)
    ]
    for day in forecast_days:
        for prediction in predictions:
            assert prediction.loc[day, "yhat"] == 0.0
def compute():
    """Compute the COVID-19 Italy containers and dump them to "containers.pkl".

    Steps, in order:

    1. Load the JSON configuration and configure logging from its "verbose" key.
    2. Ingest the data, select the requested portion and add the
       "New cases/tests ratio" column.
    3. Create the time-series containers for the ingested data.
    4. Download the per-region dataset, build a frame with one "Daily cases"
       column per region, and for each region compute the cross-correlation
       with the other regions plus a univariate FBProphet prediction; one
       container per region is appended to the ones of step 3.
    5. Pickle the list of containers.

    The function takes no arguments and returns nothing; it reads the
    configuration file and a remote CSV, and writes "containers.pkl".
    """
    param_file_nameJSON = 'configurations/configuration_test_covid19italy.json'

    # Load parameters from config file.
    with open(param_file_nameJSON) as json_file:
        param_config = json.load(json_file)

    # Logging: an unknown level name falls back to 0.
    log_level = getattr(logging, param_config["verbose"], None)
    if not isinstance(log_level, int):
        log_level = 0
    # %(name)s for module name
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                        level=log_level, stream=sys.stdout)

    # data ingestion
    log.info("Started data ingestion.")
    ingested_data = timexseries.data_ingestion.ingest_timeseries(param_config)

    # data selection
    log.info("Started data selection.")
    ingested_data = select_timeseries_portion(ingested_data, param_config)

    # Custom columns
    log.info("Adding custom columns.")
    ingested_data["New cases/tests ratio"] = [
        100 * (cases / tests)
        for cases, tests in zip(ingested_data['Daily cases'], ingested_data['Daily tests'])
    ]

    # data prediction
    containers = create_timeseries_containers(ingested_data=ingested_data,
                                              param_config=param_config)

    ####################################################################################################################
    # Custom time-series #########
    # If you are studying TIMEX code: you can ignore this.
    log.info("Computing the custom time-series.")

    regions = read_csv(
        "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv",
        header=0,
        index_col=0,
        usecols=['data', 'denominazione_regione', 'nuovi_positivi', 'tamponi'])
    regions.reset_index(inplace=True)
    regions['data'] = regions['data'].apply(lambda x: dateparser.parse(x))
    regions.set_index(['data', 'denominazione_regione'], inplace=True, drop=True)

    regions = add_diff_columns(regions, ['tamponi'], group_by='denominazione_regione')

    regions.rename(columns={
        'nuovi_positivi': 'Daily cases',
        'tamponi': 'Tests',
        "tamponi_diff": "Daily tests",
    }, inplace=True)

    # NOTE(review): the fallback is the *string* "nan", not a float NaN, which
    # makes this an object column. Left unchanged because the column is not
    # used again in this function; confirm whether float("nan") was intended.
    regions["New cases/tests ratio"] = [
        100 * (ndc / tamp) if tamp > ndc > 0 else "nan"
        for ndc, tamp in zip(regions['Daily cases'], regions['Daily tests'])
    ]

    # Prediction of "New daily cases" for every region
    # We also want to plot cross-correlation with other regions.
    # So, create a dataFrame with only daily cases and regions as columns.
    regions_names = regions.index.get_level_values(1).unique()
    regions_names = regions_names.sort_values()

    datas = regions.index.get_level_values(0).unique().to_list()
    datas = datas[1:]  # Abruzzo is missing the first day.

    cols = regions_names.to_list()
    cols = ['data'] + cols

    daily_cases_regions = DataFrame(columns=cols, dtype=numpy.float64)
    daily_cases_regions['data'] = datas
    daily_cases_regions.set_index(['data'], inplace=True, drop=True)

    for col in daily_cases_regions.columns:
        for i in daily_cases_regions.index:
            # Single .loc[row, column] assignment: the chained form
            # `.loc[i][col] = ...` may write to a temporary copy and leave
            # the frame unchanged.
            daily_cases_regions.loc[i, col] = regions.loc[i, col]['Daily cases']

    daily_cases_regions = add_freq(daily_cases_regions, 'D')

    max_lags = param_config['xcorr_parameters']['xcorr_max_lags']
    modes = param_config['xcorr_parameters']["xcorr_mode"].split(",")

    try:
        max_threads = param_config['max_threads']
    except KeyError:
        try:
            max_threads = len(os.sched_getaffinity(0))
        except (AttributeError, OSError):
            # os.sched_getaffinity is not available on every platform;
            # fall back to a single thread.
            max_threads = 1

    for region in daily_cases_regions.columns:
        timeseries_data = daily_cases_regions[[region]]

        model_results = {}

        xcorr = calc_xcorr(region, daily_cases_regions, max_lags, modes)

        log.info(f"Computing univariate prediction for {region}...")
        predictor = FBProphetModel(param_config, transformation="none")
        prophet_result = predictor.launch_model(timeseries_data.copy(), max_threads=max_threads)
        model_results['fbprophet'] = prophet_result
        #
        # predictor = ARIMA(param_config)
        # arima_result = predictor.launch_model(scenario_data.copy())
        # model_results.append(arima_result)

        s = TimeSeriesContainer(timeseries_data, model_results, xcorr)
        containers.append(s)

        # children_for_each_scenario.append({
        #     'name': region,
        #     'children': create_scenario_children(s, param_config)
        # })

    ####################################################################################################################

    # Save the computed data; these are the TimeSeriesContainer objects from which a nice Dash page can be built.
    # They can be loaded by "app_load_from_dump.py" to start the app
    # without re-computing all the data.
    with open("containers.pkl", 'wb') as output_file:
        pickle.dump(containers, output_file)
def test_freq_already_set(self):
    """A frame that already has a frequency must come back from add_freq unchanged.

    NOTE(review): relies on get_fake_df returning a frame whose index
    already has a freq; that helper is not visible here.
    """
    original = get_fake_df(10)

    result = add_freq(original)

    assert original.equals(result)
def test_compute_predictions_3(self, tmp_path):
    """Test historical predictions with a delta > 1.

    With "delta": 3 the historical predictions are not computed 1 step
    ahead at a time starting from the initial index, but once every 3 time
    points. The FBProphet historical predictions are compared against
    forecasts recomputed here, one Prophet fit per 3-day endpoint.
    """
    ing_data = DataFrame({
        "a": pandas.date_range('2000-01-01', periods=30),
        "b": np.arange(30, 60),
        "c": np.arange(60, 90),
    })
    ing_data.set_index("a", inplace=True)
    ing_data = add_freq(ing_data, "D")

    param_config = {
        "input_parameters": {},
        "model_parameters": {
            "test_values": 2,
            "delta_training_percentage": 100,
            "prediction_lags": 10,
            "possible_transformations": "none",
            "models": "fbprophet,mockup",
            "main_accuracy_estimator": "mae",
        },
        "historical_prediction_parameters": {
            "initial_index": "2000-01-20",
            "save_path": os.path.join(tmp_path, "test3.pkl"),
            "delta": 3,
        },
    }

    timeseries_containers = compute_historical_predictions(
        ingested_data=ing_data, param_config=param_config)

    assert len(timeseries_containers) == 2
    assert timeseries_containers[0].timeseries_data.columns[0] == "b"
    assert timeseries_containers[1].timeseries_data.columns[0] == "c"
    assert len(timeseries_containers[0].models) == 2
    assert len(timeseries_containers[1].models) == 2

    one_day = pandas.Timedelta(days=1)
    three_days = pandas.Timedelta(days=3)

    for s in timeseries_containers:
        scen_name = s.timeseries_data.columns[0]

        # Every model must have one prediction per day from 2000-01-21 on.
        for model in s.historical_prediction:
            hist_prediction = s.historical_prediction[model]
            assert len(hist_prediction) == 10
            expected_days = pandas.date_range('2000-01-21', periods=10)
            for position, expected_day in enumerate(expected_days):
                assert hist_prediction.index[position] == expected_day

        # Recompute the FBProphet forecast for each 3-day window and compare.
        for endpoint in pandas.date_range('2000-01-20', periods=4, freq="3d"):
            fb_tr = ing_data.copy().loc[:endpoint][[scen_name]]
            fb_tr.reset_index(inplace=True)
            fb_tr.columns = ['ds', 'y']

            fbmodel = Prophet()
            with suppress_stdout_stderr():
                fbmodel.fit(fb_tr)

            window = pd.date_range(freq="1d", start=endpoint + one_day, periods=3)
            future = pd.DataFrame(index=window, columns=["yhat"]).reset_index()
            future.rename(columns={'index': 'ds'}, inplace=True)

            forecast = fbmodel.predict(future)
            forecast.set_index('ds', inplace=True)

            expected_hist_pred = forecast.loc[:, 'yhat'].astype(object).rename(scen_name)
            if endpoint == pd.Timestamp('2000-01-29 00:00:00'):
                # Last point, remove last 2 points
                expected_hist_pred = expected_hist_pred.iloc[0:1]

            computed_hist_pred = s.historical_prediction['fbprophet'].loc[
                endpoint + one_day:endpoint + three_days, scen_name]

            assert expected_hist_pred.equals(computed_hist_pred)
def test_compute_predictions_2(self, tmp_path):
    """Compare TIMEX's FBProphet best prediction on the COVID dataset with a manual Prophet fit.

    A Prophet model is fitted here on the last 35 points of
    "nuovi_positivi" and its forecast (training span plus 10 future days)
    must equal the "yhat" column of the best prediction computed by TIMEX
    for the second container.
    """
    ing_data = pd.read_csv("test_datasets/test_covid.csv")
    ing_data["data"] = ing_data["data"].apply(lambda raw: dateparser.parse(raw))
    ing_data.set_index("data", inplace=True, drop=True)
    ing_data = add_freq(ing_data, "D")

    model_parameters = {
        "test_values": 5,
        "delta_training_percentage": 30,
        "prediction_lags": 10,
        "possible_transformations": "none",
        "models": "fbprophet",
        "main_accuracy_estimator": "mae",
    }
    historical_prediction_parameters = {
        "initial_index": "2020-12-08",
        "save_path": os.path.join(tmp_path, "test2.pkl"),
    }
    param_config = {
        "input_parameters": {},
        "model_parameters": model_parameters,
        "historical_prediction_parameters": historical_prediction_parameters,
    }

    # You can verify with this code that tr_1 is the best training window.
    # test_values = 5
    # tr_1 = ing_data.copy().iloc[-35:-5][['nuovi_positivi']]
    # tr_2 = ing_data.copy().iloc[-65:-5][['nuovi_positivi']]
    # tr_3 = ing_data.copy().iloc[-95:-5][['nuovi_positivi']]
    # tr_4 = ing_data.copy().iloc[0:-5][['nuovi_positivi']]
    # tr_sets = [tr_1, tr_2, tr_3, tr_4]
    # testing_df = ing_data.copy().iloc[-5:]['nuovi_positivi']
    #
    # for tr in tr_sets:
    #     fb_tr = tr.copy()
    #     fbmodel = Prophet()
    #     fb_tr.reset_index(inplace=True)
    #     fb_tr.columns = ['ds', 'y']
    #
    #     with suppress_stdout_stderr():
    #         fbmodel.fit(fb_tr)
    #
    #     future_df = pd.DataFrame(index=pd.date_range(freq="1d",
    #                                                  start=tr.index.values[0],
    #                                                  periods=len(tr) + test_values + 10),
    #                              columns=["yhat"], dtype=tr.iloc[:, 0].dtype)
    #
    #     future = future_df.reset_index()
    #     future.rename(columns={'index': 'ds'}, inplace=True)
    #
    #     forecast = fbmodel.predict(future)
    #
    #     forecast.set_index('ds', inplace=True)
    #
    #     testing_prediction = forecast.iloc[-15:-10]['yhat']
    #     print(mean_absolute_error(testing_df['nuovi_positivi'], testing_prediction))

    # The best tr is tr_1. Compute historical predictions.
    tr_1 = ing_data.copy().iloc[-35:][['nuovi_positivi']]

    fb_tr = tr_1.copy()
    fb_tr.reset_index(inplace=True)
    fb_tr.columns = ['ds', 'y']

    fbmodel = Prophet()
    with suppress_stdout_stderr():
        fbmodel.fit(fb_tr)

    # Forecast over the training span plus 10 future days.
    forecast_index = pd.date_range(freq="1d",
                                   start=tr_1.index.values[0],
                                   periods=len(tr_1) + 10)
    future_df = pd.DataFrame(index=forecast_index,
                             columns=["yhat"],
                             dtype=tr_1.iloc[:, 0].dtype)
    future = future_df.reset_index()
    future.rename(columns={'index': 'ds'}, inplace=True)

    forecast = fbmodel.predict(future)
    forecast.set_index('ds', inplace=True)
    historical_prediction = forecast[['yhat']]

    # Let TIMEX do this thing.
    timeseries_containers = compute_historical_predictions(
        ingested_data=ing_data, param_config=param_config)

    timeseries_container = timeseries_containers[1]
    fbprophet_model = timeseries_container.models['fbprophet']

    # Sorts the model's results in place by MAE; the sorted list is not
    # inspected further by this test.
    training_results = fbprophet_model.results
    training_results.sort(
        key=lambda result: getattr(result.testing_performances, 'MAE'))

    assert historical_prediction.equals(
        timeseries_container.models['fbprophet'].best_prediction[['yhat']])
def test_compute_predictions(self, tmp_path):
    """Check the form of historical predictions and their persistence on file.

    No "delta" is given in the configuration (the default is stated to be 1).
    The function under test is called twice with the same "save_path": the
    second call receives one more data point (and different past values)
    and is expected to keep the previously computed predictions untouched
    while adding one for the new day.
    """
    ing_data = DataFrame({
        "a": pandas.date_range('2000-01-01', periods=30),
        "b": np.arange(30, 60),
        "c": np.arange(60, 90),
    })
    ing_data.set_index("a", inplace=True)
    ing_data = add_freq(ing_data, "D")

    param_config = {
        "xcorr_parameters": {
            "xcorr_max_lags": 120,
            "xcorr_extra_regressor_threshold": 0.8,
            "xcorr_mode": "pearson",
            "xcorr_mode_target": "pearson",
        },
        "input_parameters": {},
        "model_parameters": {
            "test_values": 2,
            "delta_training_percentage": 20,
            "prediction_lags": 10,
            "possible_transformations": "log_modified,none",
            "models": "mockup,fbprophet",
            "main_accuracy_estimator": "mae",
        },
        "historical_prediction_parameters": {
            "initial_index": "2000-01-28",
            "save_path": os.path.join(tmp_path, "test1.pkl"),
        },
    }

    timeseries_containers = compute_historical_predictions(
        ingested_data=ing_data, param_config=param_config)

    assert len(timeseries_containers) == 2
    assert timeseries_containers[0].timeseries_data.columns[0] == "b"
    assert timeseries_containers[1].timeseries_data.columns[0] == "c"
    assert len(timeseries_containers[0].models) == 2
    assert len(timeseries_containers[1].models) == 2

    # Keep the first-run predictions to compare them after the second run.
    b_old_hist = timeseries_containers[0].historical_prediction
    c_old_hist = timeseries_containers[1].historical_prediction

    first_run_days = ('2000-01-29', '2000-01-30')
    for s in timeseries_containers:
        for model in s.historical_prediction:
            hist_prediction = s.historical_prediction[model]
            assert len(hist_prediction) == 2
            for position, day in enumerate(first_run_days):
                assert hist_prediction.index[position] == pandas.to_datetime(
                    day, format="%Y-%m-%d")

    # Simulate a 1-step ahead in time, so we have collected a new point.
    # Note that past values are changed as well, so we will check that TIMEX does not change the old predictions.
    ing_data = DataFrame({
        "a": pandas.date_range('2000-01-01', periods=31),
        "b": np.arange(20, 51),
        "c": np.arange(35, 66),
    })
    ing_data.set_index("a", inplace=True)
    ing_data = add_freq(ing_data, "D")

    # This time historical predictions will be loaded from file.
    timeseries_containers = compute_historical_predictions(
        ingested_data=ing_data, param_config=param_config)

    second_run_days = ('2000-01-29', '2000-01-30', '2000-01-31')
    for s in timeseries_containers:
        for model in s.historical_prediction:
            hist_prediction = s.historical_prediction[model]
            assert len(hist_prediction) == 3
            for position, day in enumerate(second_run_days):
                assert hist_prediction.index[position] == pandas.to_datetime(
                    day, format="%Y-%m-%d")

    # Check that past predictions have not been touched.
    for container_position, old_hist in enumerate((b_old_hist, c_old_hist)):
        for model_name in ('fbprophet', 'mockup'):
            for row in (0, 1):
                assert old_hist[model_name].iloc[row, 0] == timeseries_containers[
                    container_position].historical_prediction[model_name].iloc[row, 0]
def test_get_best_univariate_and_multivariate_predictions(self):
    """Check that univariate and multivariate best predictions are in the correct form.

    The cross-correlation threshold is 0.0 so that extra regressors are
    always used. Per the MockUp model convention its predictions are 0.0
    in univariate mode and len(extra_regressors) in multivariate mode, which
    is what the last-row checks rely on.
    """
    ing_data = DataFrame({
        "a": pandas.date_range('1/1/2000', periods=30),
        "b": np.arange(30, 60),
        "c": np.arange(60, 90),
    })
    ing_data.set_index("a", inplace=True)
    ing_data = add_freq(ing_data, "D")

    param_config = {
        "xcorr_parameters": {
            "xcorr_max_lags": 120,
            "xcorr_extra_regressor_threshold": 0.0,  # Force predictor to use extra-regressors
            "xcorr_mode": "pearson",
            "xcorr_mode_target": "pearson",
        },
        "model_parameters": {
            "test_values": 2,
            "delta_training_percentage": 20,
            "prediction_lags": 10,
            "possible_transformations": "log_modified,none",
            "models": "fbprophet,mockup",
            "main_accuracy_estimator": "mae",
        },
    }

    total_xcorr = calc_all_xcorr(ingested_data=ing_data, param_config=param_config)

    best_transformations, timeseries_containers = get_best_univariate_predictions(
        ing_data, param_config, total_xcorr)

    assert len(best_transformations) == 2
    for model_name in ("fbprophet", "mockup"):
        for column in ("b", "c"):
            assert best_transformations[model_name][column] in ["log_modified", "none"]

    # Small trick: fool TIMEX in thinking that none is the best transformation for MockUp model. This way
    # we can check its predictions, which are hardcoded and always 0.0 for univariate and len(extra_regressors) for
    # multivariate... with log_modified values would not be exactly len(extra_regressors).
    for column in ("b", "c"):
        best_transformations["mockup"][column] = "none"

    assert len(timeseries_containers) == 2
    assert timeseries_containers[0].timeseries_data.columns[0] == "b"
    assert timeseries_containers[1].timeseries_data.columns[0] == "c"
    assert len(timeseries_containers[0].models) == 2
    assert len(timeseries_containers[1].models) == 2

    # Check predictions are univariate
    for position in (0, 1):
        univariate = timeseries_containers[position].models['mockup'].best_prediction
        assert univariate.iloc[-1, 0] == 0.0

    timeseries_containers = get_best_multivariate_predictions(
        best_transformations=best_transformations,
        ingested_data=ing_data,
        timeseries_containers=timeseries_containers,
        param_config=param_config,
        total_xcorr=total_xcorr)

    assert len(timeseries_containers) == 2
    assert timeseries_containers[0].timeseries_data.columns[0] == "b"
    assert timeseries_containers[1].timeseries_data.columns[0] == "c"

    # Check predictions are multivariate
    for position in (0, 1):
        multivariate = timeseries_containers[position].models['mockup'].best_prediction
        assert multivariate.iloc[-1, 0] == 1.0

    assert len(timeseries_containers[0].models) == 2
    assert len(timeseries_containers[1].models) == 2
def test_create_containers(self, historical_predictions, xcorr,
                           additional_regressors, expected_extra_regressors,
                           expected_value, tmp_path):
    """Check create_timeseries_containers across optional configuration sections.

    Parameters:
        historical_predictions: if truthy, "historical_prediction_parameters"
            is added to the configuration and the historical predictions are
            checked; otherwise they are expected to be None.
        xcorr: if truthy, "xcorr_parameters" is added (threshold 0.0) and
            each container's xcorr is expected to be a dict.
        additional_regressors: if truthy, one regressor file per column is
            registered under "additional_regressors".
        expected_extra_regressors: mapping column name -> expected value of
            the model's 'extra_regressors' characteristic; an empty dict
            skips that check.
        expected_value: value compared against the truthiness check on the
            historical predictions (see the review note below).
        tmp_path: directory in which the historical predictions are saved.
    """
    param_config = {
        "input_parameters": {
            "datetime_column_name": "date",
            "index_column_name": "date",
        },
        "model_parameters": {
            "test_values": 5,
            "delta_training_percentage": 30,
            "prediction_lags": 10,
            "possible_transformations": "none",
            "models": "mockup",
            "main_accuracy_estimator": "mae",
        },
    }

    if historical_predictions:
        param_config["historical_prediction_parameters"] = {
            "initial_index": "2000-01-15",
            "save_path": os.path.join(tmp_path, "test_create_containers.pkl"),
        }

    if xcorr:
        param_config["xcorr_parameters"] = {
            "xcorr_max_lags": 5,
            "xcorr_extra_regressor_threshold": 0.0,  # Force the predictor to use it
            "xcorr_mode": "pearson",
            "xcorr_mode_target": "pearson",
        }

    if additional_regressors:
        param_config["additional_regressors"] = {
            "b": "test_datasets/test_create_containers_extrareg_d.csv",
            "c": "test_datasets/test_create_containers_extrareg_e.csv",
        }

    # Having values like 30 -> 60 or 60 -> 90 will make multivariate Mockup model always win on the univariate one
    # because it will return the number of used extra-regressors (the more the lower MAE).
    ing_data = DataFrame({
        "a": pandas.date_range('2000-01-01', periods=30),
        "b": np.arange(30, 60),
        "c": np.arange(60, 90),
    })
    ing_data.set_index("a", inplace=True)
    ing_data = add_freq(ing_data, "D")

    timeseries_containers = create_timeseries_containers(ing_data, param_config)

    assert len(timeseries_containers) == 2
    for container in timeseries_containers:
        name = container.timeseries_data.columns[0]

        if xcorr:
            # isinstance instead of an exact type comparison, so that dict
            # subclasses are accepted as well.
            assert isinstance(container.xcorr, dict)

        if expected_extra_regressors != {}:
            assert container.models['mockup'].characteristics[
                'extra_regressors'] == expected_extra_regressors[name]

        if historical_predictions:
            hp = container.historical_prediction['mockup']
            # NOTE(review): `.all()` yields a boolean, so this compares a
            # truthiness result with expected_value rather than comparing
            # each prediction with it. Kept as in the original because the
            # parametrized values are not visible here; confirm the intent.
            assert hp.loc[
                pandas.to_datetime('2000-01-15', format="%Y-%m-%d"):,
                name].all() == expected_value
        else:
            assert container.historical_prediction is None