def test_init_and_end_datetime(self, are_dateparser_options_specified):
    # Select rows using both init and end datetime.
    param_config = {
        "selection_parameters": {
            "init_datetime": "2000-01-02",
            "end_datetime": "2000-01-04"
        },
    }

    if are_dateparser_options_specified:
        param_config["input_parameters"] = {}
        param_config["input_parameters"]["dateparser_options"] = {
            "date_formats": ["%Y-%m-%dT%H:%M:%S"]
        }

    df = get_fake_df(length=5)
    selected_df = select_timeseries_portion(df, param_config)

    assert selected_df.index.values[0] == Timestamp("2000-01-02")
    assert selected_df.index.values[1] == Timestamp("2000-01-03")
    assert selected_df.index.values[2] == Timestamp("2000-01-04")

    assert df.iloc[1]["value"] == selected_df.iloc[0]["value"]
    assert df.iloc[2]["value"] == selected_df.iloc[1]["value"]
    assert df.iloc[3]["value"] == selected_df.iloc[2]["value"]

    assert len(selected_df) == 3
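# NOTE: get_fake_df is a helper defined elsewhere in this test suite. Judging
# from the assertions in these tests, it presumably builds a DataFrame with a
# daily DatetimeIndex starting at 2000-01-01 and a single "value" column; a
# hypothetical minimal sketch:
#
#     def get_fake_df(length):
#         dates = pandas.date_range(start="2000-01-01", periods=length, freq="D")
#         return pandas.DataFrame({"value": numpy.random.rand(length)}, index=dates)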
def test_selection_not_requested(self):
    # The DataFrame should be returned untouched if no selection is requested.
    param_config = {
        "input_parameters": {
            "source_data_url": os.path.join("test_datasets", "test_6.csv"),
        }
    }

    df = ingest_timeseries(param_config)
    selected_df = select_timeseries_portion(df, param_config)

    assert df.equals(selected_df)
def test_end_datetime(self):
    # Select rows using end datetime. init_datetime precedes all the data here,
    # so only end_datetime actually filters rows.
    param_config = {
        "selection_parameters": {
            "init_datetime": "1999-01-02",
            "end_datetime": "2000-01-02"
        },
    }

    df = get_fake_df(length=3)
    selected_df = select_timeseries_portion(df, param_config)

    assert selected_df.index.values[0] == Timestamp("2000-01-01")
    assert selected_df.index.values[1] == Timestamp("2000-01-02")

    assert df.iloc[0]["value"] == selected_df.iloc[0]["value"]
    assert df.iloc[1]["value"] == selected_df.iloc[1]["value"]

    assert len(selected_df) == 2
def test_select_on_values(self):
    # Select rows based on a column's value.
    param_config = {
        "input_parameters": {
            "source_data_url": os.path.join("test_datasets", "test_1.csv"),
            "columns_to_load_from_url": "first_column,third_column",
            "datetime_column_name": "first_column",
            "index_column_name": "first_column",
            "frequency": "D"
        },
        "selection_parameters": {
            "column_name_selection": "third_column",
            "value_selection": 3,
        },
    }

    df = ingest_timeseries(param_config)
    df = select_timeseries_portion(df, param_config)

    assert df.index.values[0] == Timestamp("2020-02-25")
    assert df.iloc[0]["third_column"] == 3
    assert len(df) == 1
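# NOTE: test_1.csv is a fixture shipped alongside these tests. The assertions
# above only require that, among the loaded columns, "first_column" parses as
# daily dates and "third_column" contains the value 3 exactly once, on
# 2020-02-25; the fixture's remaining contents are irrelevant to this test.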
def compute():
    param_file_nameJSON = 'configurations/configuration_test_covid19italy.json'

    # Load parameters from the JSON config file.
    with open(param_file_nameJSON) as json_file:
        param_config = json.load(json_file)

    # Logging setup. Add %(name)s to the format to also log the module name.
    log_level = getattr(logging, param_config["verbose"], None)
    if not isinstance(log_level, int):
        log_level = 0
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                        level=log_level, stream=sys.stdout)

    # Data ingestion.
    log.info("Started data ingestion.")
    ingested_data = timexseries.data_ingestion.ingest_timeseries(param_config)

    # Data selection.
    log.info("Started data selection.")
    ingested_data = select_timeseries_portion(ingested_data, param_config)

    # Custom columns.
    log.info("Adding custom columns.")
    ingested_data["New cases/tests ratio"] = [
        100 * (daily_cases / tests)
        for daily_cases, tests in zip(ingested_data['Daily cases'], ingested_data['Daily tests'])
    ]

    # Data prediction.
    containers = create_timeseries_containers(ingested_data=ingested_data, param_config=param_config)

    ####################################################################################################################
    # Custom time-series: per-region predictions.
    # If you are studying TIMEX code, you can ignore this part.
    log.info("Computing the custom time-series.")

    regions = read_csv(
        "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv",
        header=0, index_col=0,
        usecols=['data', 'denominazione_regione', 'nuovi_positivi', 'tamponi'])

    regions.reset_index(inplace=True)
    regions['data'] = regions['data'].apply(lambda x: dateparser.parse(x))
    regions.set_index(['data', 'denominazione_regione'], inplace=True, drop=True)

    regions = add_diff_columns(regions, ['tamponi'], group_by='denominazione_regione')

    regions.rename(columns={'nuovi_positivi': 'Daily cases',
                            'tamponi': 'Tests',
                            'tamponi_diff': 'Daily tests'}, inplace=True)

    regions["New cases/tests ratio"] = [
        100 * (ndc / tamp) if tamp > ndc > 0 else numpy.nan  # a real NaN, not the string "nan"
        for ndc, tamp in zip(regions['Daily cases'], regions['Daily tests'])
    ]

    # Prediction of "New daily cases" for every region.
    # We also want to plot the cross-correlation with other regions, so create a
    # DataFrame with the dates as index and one "daily cases" column per region.
    regions_names = regions.index.get_level_values(1).unique()
    regions_names = regions_names.sort_values()

    datas = regions.index.get_level_values(0).unique().to_list()
    datas = datas[1:]  # Abruzzo is missing the first day.
    cols = regions_names.to_list()
    cols = ['data'] + cols

    daily_cases_regions = DataFrame(columns=cols, dtype=numpy.float64)
    daily_cases_regions['data'] = datas
    daily_cases_regions.set_index(['data'], inplace=True, drop=True)

    for col in daily_cases_regions.columns:
        for i in daily_cases_regions.index:
            # Use .loc[i, col] (not .loc[i][col]): chained indexing assigns to a
            # temporary copy and would leave daily_cases_regions unchanged.
            daily_cases_regions.loc[i, col] = regions.loc[i, col]['Daily cases']

    daily_cases_regions = add_freq(daily_cases_regions, 'D')

    max_lags = param_config['xcorr_parameters']['xcorr_max_lags']
    modes = param_config['xcorr_parameters']['xcorr_mode'].split(",")

    try:
        max_threads = param_config['max_threads']
    except KeyError:
        try:
            max_threads = len(os.sched_getaffinity(0))
        except AttributeError:
            # os.sched_getaffinity is not available on every platform.
            max_threads = 1

    for region in daily_cases_regions.columns:
        timeseries_data = daily_cases_regions[[region]]

        model_results = {}

        xcorr = calc_xcorr(region, daily_cases_regions, max_lags, modes)

        log.info(f"Computing univariate prediction for {region}...")
        predictor = FBProphetModel(param_config, transformation="none")
        prophet_result = predictor.launch_model(timeseries_data.copy(), max_threads=max_threads)
        model_results['fbprophet'] = prophet_result

        # predictor = ARIMA(param_config)
        # arima_result = predictor.launch_model(scenario_data.copy())
        # model_results.append(arima_result)

        s = TimeSeriesContainer(timeseries_data, model_results, xcorr)
        containers.append(s)

        # children_for_each_scenario.append({
        #     'name': region,
        #     'children': create_scenario_children(s, param_config)
        # })

    ####################################################################################################################

    # Save the computed data: these are the TimeSeriesContainer objects from which
    # a Dash page can be built. They can be loaded by "app_load_from_dump.py" to
    # start the app without re-computing all the data.
    with open("containers.pkl", 'wb') as output_file:
        pickle.dump(containers, output_file)
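# The configuration file read by compute() is a standard TIMEX JSON config.
# An illustrative sketch of just the fields this script accesses (not the full
# schema; values are placeholders):
#
#     {
#         "verbose": "INFO",
#         "max_threads": 4,
#         "input_parameters": { ... },
#         "xcorr_parameters": {
#             "xcorr_max_lags": 120,
#             "xcorr_mode": "pearson"
#         }
#     }
#
# "xcorr_mode" is a comma-separated list of cross-correlation modes, and
# "max_threads" is optional: if absent, the script falls back to the CPU
# affinity count (or to 1 where os.sched_getaffinity is unavailable).
#
# The resulting containers.pkl can later be reloaded without recomputing
# everything, e.g. (hypothetical sketch of what app_load_from_dump.py does):
#
#     with open("containers.pkl", "rb") as f:
#         containers = pickle.load(f)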