def _check_counterfactual(self, scenario, scenario_npis):
    """Generate a scenario for Italy over 2020-03-20..2020-03-26 and verify it.

    The checks are, in order:
      * only Italy is present and there is a single row per date;
      * the day before the period carries the NPIs found in ``self.latest_df``;
      * every day of the period carries exactly ``scenario_npis``;
      * the day after the period differs from ``scenario_npis`` and matches
        the NPIs found in ``self.latest_df``.

    :param scenario: value forwarded as ``scenario=`` to ``generate_scenario``
    :param scenario_npis: list of NPI values expected on each day of the period
    """
    start_date_str = "2020-03-20"
    end_date_str = "2020-03-26"
    countries = ["Italy"]

    def assert_npis_equal(expected_df, actual_df, message):
        # The 3rd positional parameter of assert_frame_equal is `check_dtype`,
        # not a failure message, so the message is attached by re-raising.
        try:
            pd.testing.assert_frame_equal(expected_df, actual_df)
        except AssertionError as err:
            raise AssertionError(f"{message}: {err}") from err

    scenario_df = generate_scenario(start_date_str, end_date_str, self.latest_df, countries, scenario=scenario)
    self.assertIsNotNone(scenario_df)
    # assertCountEqual compares the elements, regardless of order
    self.assertCountEqual(countries, scenario_df.CountryName.unique(), "Not the requested countries")
    self.assertFalse(scenario_df["Date"].duplicated().any(), "Expected 1 row per date only")

    start_date = pd.to_datetime(start_date_str, format=DATE_FORMAT)
    end_date = pd.to_datetime(end_date_str, format=DATE_FORMAT)

    # Check the day before the scenario is untouched
    before_day = start_date - np.timedelta64(1, 'D')
    before_day_npis = scenario_df[scenario_df.Date == before_day][NPI_COLUMNS].reset_index(drop=True)
    before_day_npis_truth = self.latest_df[
        (self.latest_df.CountryName == "Italy") & (self.latest_df.Date == before_day)
    ][NPI_COLUMNS].reset_index(drop=True)
    assert_npis_equal(before_day_npis_truth, before_day_npis, "Not the expected frozen NPIs")

    # Check every day of the period, start and end dates included
    for check_day in pd.date_range(start_date, end_date):
        check_day_npis_df = scenario_df[scenario_df.Date == check_day][NPI_COLUMNS].reset_index(drop=True)
        check_day_npis = list(check_day_npis_df.values[0])
        self.assertListEqual(scenario_npis, check_day_npis)

    # Check the day after the period is different from the scenario NPIs...
    after_day = end_date + np.timedelta64(1, 'D')
    after_day_npis_df = scenario_df[scenario_df.Date == after_day][NPI_COLUMNS].reset_index(drop=True)
    self.assertTrue((scenario_npis - after_day_npis_df.values[0]).any(), "Expected NPIs to be different")

    # ...and equal to the reference data
    after_day_npis_truth = self.latest_df[
        (self.latest_df.CountryName == "Italy") & (self.latest_df.Date == after_day)
    ][NPI_COLUMNS].reset_index(drop=True)
    assert_npis_equal(after_day_npis_truth, after_day_npis_df, "Not the expected unfrozen NPIs")
def test_generate_scenario_future_from_last_known_date_custom(self):
    """Custom NPIs for Italy, from the day after its last known date up to
    one week after that date; verification is delegated to ``_check_future``.
    """
    countries = ["Italy"]
    italy_dates = self.latest_df[self.latest_df.CountryName == "Italy"].Date
    last_known_date = italy_dates.max()

    # The scenario covers last known date + 1 day .. last known date + 7 days
    first_day = last_known_date + np.timedelta64(1, 'D')
    end_date_str = (last_known_date + pd.DateOffset(7)).strftime(DATE_FORMAT)
    last_day = pd.to_datetime(end_date_str, format=DATE_FORMAT)

    # One ONE_NPIS entry per day, first day included
    day_count = (last_day - first_day).days + 1
    scenario = [ONE_NPIS] * day_count

    scenario_df = generate_scenario(None, end_date_str, self.latest_df, countries, scenario=scenario)

    self._check_future(
        start_date_str=None,
        end_date_str=end_date_str,
        scenario_df=scenario_df,
        scenario_npis=scenario[0],
        country=countries[0],
    )
def test_generate_scenario_future_from_last_known_date_max(self):
    """"MAX" scenario for Italy, ending one week after its last known date.

    No start date is given to ``generate_scenario``; verification is
    delegated to ``_check_future`` with ``MAX_NPIS`` as the expected values.
    """
    country = "Italy"
    italy_rows = self.latest_df[self.latest_df.CountryName == "Italy"]
    one_week_later = italy_rows.Date.max() + pd.DateOffset(7)
    end_date_str = one_week_later.strftime(DATE_FORMAT)

    scenario_df = generate_scenario(None, end_date_str, self.latest_df, [country], scenario="MAX")

    country_df = scenario_df[scenario_df.CountryName == country]
    self._check_future(
        start_date_str=None,
        end_date_str=end_date_str,
        scenario_df=country_df,
        scenario_npis=MAX_NPIS,
        country=country,
    )
def get_predictions(start_date_str, end_date_str, pres_df, countries=None):
    """Run the predictor script on historical data followed by prescriptions.

    Rows of the 'Historical' scenario dated strictly before ``start_date_str``
    are concatenated with ``pres_df``, written to a temporary CSV file and
    handed to ``PREDICT_MODULE`` through ``python3``. The script output is
    printed and the CSV it produced is returned as a DataFrame.

    :param start_date_str: first prediction day, formatted as '%Y-%m-%d'
    :param end_date_str: value passed as ``--end_date`` to the script
    :param pres_df: DataFrame appended after the historical rows
    :param countries: forwarded to ``generate_scenario``
    :return: DataFrame read from the script's output file
    """
    historical_df = generate_scenario(
        start_date_str,
        end_date_str,
        get_raw_data(HIST_DATA_FILE_PATH),
        countries=countries,
        scenario='Historical',
    )
    cutoff = pd.to_datetime(start_date_str, format='%Y-%m-%d')
    before_start = historical_df[historical_df.Date < cutoff]
    ips_df = pd.concat([before_start, pres_df])

    # Both temporary files must stay open (i.e. exist) while the script runs
    with tempfile.NamedTemporaryFile() as tmp_ips_file:
        ips_df.to_csv(tmp_ips_file.name)
        with tempfile.NamedTemporaryFile() as tmp_pred_file:
            command = [
                'python3', PREDICT_MODULE,
                '--start_date', start_date_str,
                '--end_date', end_date_str,
                '--interventions_plan', tmp_ips_file.name,
                '--output_file', tmp_pred_file.name,
            ]
            script_output = subprocess.check_output(command, stderr=subprocess.STDOUT)
            # Echo whatever the script wrote to stdout / stderr
            print(script_output.decode("utf-8"))
            # Read the predictions before the file is deleted on exit
            df = pd.read_csv(tmp_pred_file)
    return df
def test_generate_scenario_all_countries_future_from_last_known_date_freeze(self):
    """Custom NPIs for all countries, ending one week after the last date
    found in ``self.latest_df``; each country / region pair is verified
    through ``_check_future``.
    """
    countries = None
    end_date_str = (self.latest_df.Date.max() + pd.DateOffset(7)).strftime(DATE_FORMAT)

    # 14 entries, so the scenario has enough days for every region
    scenario = [ONE_NPIS] * 14
    scenario_df = generate_scenario(None, end_date_str, self.latest_df, countries, scenario=scenario)

    expected_npis = scenario[0]
    for country in self.latest_df.CountryName.unique():
        country_rows = self.latest_df[self.latest_df.CountryName == country]
        for region in country_rows.RegionName.unique():
            in_region = (scenario_df.CountryName == country) & (scenario_df.RegionName == region)
            self._check_future(
                start_date_str=None,
                end_date_str=end_date_str,
                scenario_df=scenario_df[in_region],
                scenario_npis=expected_npis,
                country=country,
                region=region,
            )
def test_generate_scenario_mind_the_gap_custom(self):
    """Custom scenario for Italy over January 2021, every NPI set to 1.

    Verifies the row count from inception to the end date and that the last
    31 rows have all NPIs equal to 1.
    """
    countries = ["Italy"]
    # One ONE_NPIS entry per day of January 2021
    scenario = [ONE_NPIS] * 31
    scenario_df = generate_scenario("2021-01-01", "2021-01-31", self.latest_df, countries, scenario=scenario)

    self.assertIsNotNone(scenario_df)
    # assertCountEqual compares the elements, regardless of order
    self.assertCountEqual(countries, scenario_df.CountryName.unique(), "Not the requested countries")

    # Inception is 2020-01-01: 366 days for 2020 + 31 for January 2021
    self.assertEqual(397, len(scenario_df), "Expected the number of days between inception and end date")

    # A mean of means equal to 1 over the last 31 rows
    january_npis = scenario_df.tail(31)[NPI_COLUMNS]
    self.assertEqual(1, january_npis.mean().mean(), "Expected the last 31 rows to have all NPIs set to 1")
def test_generate_scenario_all_countries_future_from_last_known_date_freeze(self):
    """Custom NPIs for all countries until 2020-12-31; each country / region
    pair is verified through ``_check_future``.
    """
    countries = None
    end_date_str = "2020-12-31"

    # 180 entries, so the scenario has enough days for every region
    scenario = [ONE_NPIS] * 180
    scenario_df = generate_scenario(None, end_date_str, self.latest_df, countries, scenario=scenario)

    expected_npis = scenario[0]
    for country in self.latest_df.CountryName.unique():
        country_rows = self.latest_df[self.latest_df.CountryName == country]
        for region in country_rows.RegionName.unique():
            in_region = (scenario_df.CountryName == country) & (scenario_df.RegionName == region)
            self._check_future(
                start_date_str=None,
                end_date_str=end_date_str,
                scenario_df=scenario_df[in_region],
                scenario_npis=expected_npis,
                country=country,
                region=region,
            )
def test_generate_scenario_mind_the_gap_freeze_all_countries(self):
    """"Freeze" scenario for all countries, starting one week from now and
    lasting 180 days; every country / region must have one row per day from
    inception to the end date.
    """
    start_date = datetime.now() + timedelta(days=7)
    end_date = start_date + timedelta(days=180)
    inception_date = datetime.strptime(INCEPTION_DATE, DATE_FORMAT)

    scenario_df = generate_scenario(
        start_date.strftime(DATE_FORMAT),
        end_date.strftime(DATE_FORMAT),
        self.latest_df,
        None,
        scenario="Freeze",
    )
    self.assertIsNotNone(scenario_df)

    # +1 because both inception date and end date are included
    expected_rows = (end_date - inception_date).days + 1
    for country in self.latest_df.CountryName.unique():
        country_rows = self.latest_df[self.latest_df.CountryName == country]
        for region in country_rows.RegionName.unique():
            in_region = (scenario_df.CountryName == country) & (scenario_df.RegionName == region)
            self.assertEqual(
                expected_rows,
                len(scenario_df[in_region]),
                f"Not the expected number of days for {country} / {region}",
            )
def test_generate_scenario_future_from_last_known_date_freeze(self):
    """"Freeze" scenario for Italy until 2020-12-31, with no start date.

    The expected NPIs are those of Italy's last known date in
    ``self.latest_df``; verification is delegated to ``_check_future``.
    """
    country = "Italy"
    end_date_str = "2020-12-31"

    # NPIs on the last known date are the ones expected to be frozen
    italy_rows = self.latest_df[self.latest_df.CountryName == "Italy"]
    last_known_date = italy_rows.Date.max()
    on_last_day = (self.latest_df.CountryName == "Italy") & (self.latest_df.Date == last_known_date)
    frozen_npis_df = self.latest_df[on_last_day][NPI_COLUMNS].reset_index(drop=True)
    expected_npis = list(frozen_npis_df.values[0])

    scenario_df = generate_scenario(None, end_date_str, self.latest_df, [country], scenario="Freeze")

    self._check_future(
        start_date_str=None,
        end_date_str=end_date_str,
        scenario_df=scenario_df[scenario_df.CountryName == country],
        scenario_npis=expected_npis,
        country=country,
    )
def test_generate_scenario_mind_the_gap_freeze_dates_mismatch(self):
    """"Freeze" scenario over January 2021 for Belgium and Brazil, loaded
    from ``DATES_MISMATCH_DATA_FILE``.

    Per the original test notes, the two countries' last known dates differ
    by one day (Belgium 20201103, Brazil 20201104); the test makes sure no
    day is skipped for either of them.
    """
    countries = ["Belgium", "Brazil"]
    mismatch_df = get_raw_data(DATES_MISMATCH_DATA_FILE, latest=False)
    scenario_df = generate_scenario("2021-01-01", "2021-01-31", mismatch_df, countries, scenario="Freeze")

    self.assertIsNotNone(scenario_df)
    # assertCountEqual compares the elements, regardless of order
    self.assertCountEqual(countries, scenario_df.CountryName.unique(), "Not the requested countries")

    # Inception is 2020-01-01: 366 days for 2020 + 31 for January 2021
    expected_rows = 397
    for country in countries:
        country_rows = mismatch_df[mismatch_df.CountryName == country]
        for region in country_rows.RegionName.unique():
            in_region = (scenario_df.CountryName == country) & (scenario_df.RegionName == region)
            self.assertEqual(
                expected_rows,
                len(scenario_df[in_region]),
                f"Not the expected number of days for {country} / {region}",
            )
def test_generate_scenario_future_from_last_known_date_freeze(self):
    """"Freeze" scenario for Italy, ending one week after its last known
    date, with no start date.

    The expected NPIs are those of Italy's last known date in
    ``self.latest_df``; verification is delegated to ``_check_future``.
    """
    country = "Italy"
    italy_rows = self.latest_df[self.latest_df.CountryName == "Italy"]
    last_known_date = italy_rows.Date.max()
    end_date_str = (last_known_date + pd.DateOffset(7)).strftime(DATE_FORMAT)

    # NPIs on the last known date are the ones expected to be frozen
    on_last_day = (self.latest_df.CountryName == "Italy") & (self.latest_df.Date == last_known_date)
    frozen_npis_df = self.latest_df[on_last_day][NPI_COLUMNS].reset_index(drop=True)
    expected_npis = list(frozen_npis_df.values[0])

    scenario_df = generate_scenario(None, end_date_str, self.latest_df, [country], scenario="Freeze")

    self._check_future(
        start_date_str=None,
        end_date_str=end_date_str,
        scenario_df=scenario_df[scenario_df.CountryName == country],
        scenario_npis=expected_npis,
        country=country,
    )
def test_generate_scenario_future_freeze(self):
    """"Freeze" scenario for Italy from 2020-07-01 to 2020-12-31.

    The expected NPIs are those of 2020-06-30 in ``self.latest_df``;
    verification is delegated to ``_check_future``.
    """
    country = "Italy"
    start_date_str = "2020-07-01"
    end_date_str = "2020-12-31"

    # NPIs of the day before the start date are the ones expected to be frozen
    day_before = pd.to_datetime("2020-06-30", format=DATE_FORMAT)
    on_day_before = (self.latest_df.CountryName == "Italy") & (self.latest_df.Date == day_before)
    frozen_npis_df = self.latest_df[on_day_before][NPI_COLUMNS].reset_index(drop=True)
    expected_npis = list(frozen_npis_df.values[0])

    scenario_df = generate_scenario(start_date_str, end_date_str, self.latest_df, [country], scenario="Freeze")

    self._check_future(
        start_date_str=start_date_str,
        end_date_str=end_date_str,
        scenario_df=scenario_df[scenario_df.CountryName == country],
        scenario_npis=expected_npis,
        country=country,
    )
def get_predictions(start_date_str, end_date_str, pres_df, countries=None):
    """Predict with ``XPrizePredictor`` on historical data followed by
    prescriptions.

    Rows of the 'Historical' scenario dated strictly before ``start_date_str``
    are concatenated with ``pres_df`` and written to a temporary CSV file,
    whose path is given to ``XPrizePredictor.predict``.

    :param start_date_str: first prediction day, formatted as '%Y-%m-%d'
    :param end_date_str: end date forwarded to the predictor
    :param pres_df: DataFrame appended after the historical rows
    :param countries: forwarded to ``generate_scenario``
    :return: whatever ``XPrizePredictor.predict`` returns
    """
    historical_df = generate_scenario(
        start_date_str,
        end_date_str,
        get_raw_data(HIST_DATA_FILE_PATH),
        countries=countries,
        scenario='Historical',
    )
    cutoff = pd.to_datetime(start_date_str, format='%Y-%m-%d')
    before_start = historical_df[historical_df.Date < cutoff]
    ips_df = pd.concat([before_start, pres_df])

    # The temporary file only exists inside the `with` block
    with tempfile.NamedTemporaryFile() as tmp_ips_file:
        predictor = XPrizePredictor()
        ips_df.to_csv(tmp_ips_file.name)
        return predictor.predict(start_date_str, end_date_str, tmp_ips_file.name)
def test_generate_scenario_mind_the_gap_freeze_2_countries(self):
    """"Freeze" scenario over January 2021 for France and Italy; verifies
    the returned countries and the total number of rows.
    """
    countries = ["France", "Italy"]
    scenario_df = generate_scenario("2021-01-01", "2021-01-31", self.latest_df, countries, scenario="Freeze")

    self.assertIsNotNone(scenario_df)
    # assertCountEqual compares the elements, regardless of order
    self.assertCountEqual(countries, scenario_df.CountryName.unique(), "Not the requested countries")

    # Inception is 2020-01-01: 366 days for 2020 + 31 for January 2021,
    # for each of the 2 countries
    expected_rows = 397 * 2
    self.assertEqual(expected_rows, len(scenario_df), "Not the expected number of days between inception and end date")
def get_predictions(start_date_str, end_date_str, pres_df, countries=None):
    """Run the predictor script on historical data followed by prescriptions.

    Rows of the 'Historical' scenario dated strictly before ``start_date_str``
    are concatenated with ``pres_df`` and written to ``TMP_PRESCRIPTION_FILE``.
    ``PREDICT_MODULE`` is then run with ``python`` from four directories above
    the current working directory, and the CSV it writes to
    ``TMP_PRED_FILE_NAME`` is loaded and returned.

    The working directory is restored before returning, including when the
    script or the CSV load fails.

    :param start_date_str: first prediction day, formatted as '%Y-%m-%d'
    :param end_date_str: value passed as ``--end_date`` to the script
    :param pres_df: DataFrame appended after the historical rows
    :param countries: forwarded to ``generate_scenario``
    :return: DataFrame read from ``TMP_PRED_FILE_NAME``
    :raises subprocess.CalledProcessError: if the script exits with a non-zero status
    """
    # Concatenate prescriptions with historical data
    raw_df = get_raw_data(HIST_DATA_FILE_PATH)
    hist_df = generate_scenario(start_date_str, end_date_str, raw_df,
                                countries=countries, scenario='Historical')
    start_date = pd.to_datetime(start_date_str, format='%Y-%m-%d')
    hist_df = hist_df[hist_df.Date < start_date]
    ips_df = pd.concat([hist_df, pres_df])

    # Write ips_df to file
    ips_df.to_csv(TMP_PRESCRIPTION_FILE)

    # Resolve the full path now: the relative one is no longer valid after chdir
    ip_file_full_path = os.path.abspath(TMP_PRESCRIPTION_FILE)

    # Go to covid-xprize root dir to access predict script
    wd = os.getcwd()
    os.chdir("../../../..")
    try:
        # Run script to generate predictions
        output_str = subprocess.check_output(
            [
                'python', PREDICT_MODULE,
                '--start_date', start_date_str,
                '--end_date', end_date_str,
                '--interventions_plan', ip_file_full_path,
                '--output_file', TMP_PRED_FILE_NAME
            ],
            stderr=subprocess.STDOUT
        )

        # Print output from running script
        print(output_str.decode("utf-8"))

        # Load predictions to return (path is resolved from the root dir)
        df = pd.read_csv(TMP_PRED_FILE_NAME)
    finally:
        # Always return to the original dir, even if the script or the load failed
        os.chdir(wd)

    return df
def test_generate_scenario_mind_the_gap_freeze_2_countries(self):
    """"Freeze" scenario for France and Italy, starting one week from now
    and ending 31 days later; verifies the returned countries and the total
    number of rows.
    """
    nb_days = 31
    countries = ["France", "Italy"]

    # The 2 countries may have different last known dates:
    # starting 7 days from now guarantees a gap
    start_date = datetime.now() + timedelta(days=7)
    end_date = start_date + timedelta(days=nb_days)
    inception_date = pd.to_datetime(INCEPTION_DATE, format=DATE_FORMAT)

    scenario_df = generate_scenario(
        start_date.strftime(DATE_FORMAT),
        end_date.strftime(DATE_FORMAT),
        self.latest_df,
        countries,
        scenario="Freeze",
    )
    self.assertIsNotNone(scenario_df)
    # assertCountEqual compares the elements, regardless of order
    self.assertCountEqual(countries, scenario_df.CountryName.unique(), "Not the requested countries")

    # +1 because both inception date and end date are included
    days_per_country = (end_date - inception_date).days + 1
    self.assertEqual(
        days_per_country * 2,
        len(scenario_df),
        "Not the expected number of days between inception and end date",
    )
def test_generate_scenario_mind_the_gap_max(self):
    """"MAX" scenario for Italy, starting 7 days after its last known date
    and ending 31 days later.

    Verifies the row count from inception to the end date and that the last
    31 rows carry the ``MAX_NPIS`` values.
    """
    nb_days = 31
    countries = ["Italy"]

    italy_rows = self.latest_df[self.latest_df.CountryName == countries[0]]
    start_date = italy_rows.Date.max() + timedelta(days=7)
    end_date = start_date + timedelta(days=nb_days)
    inception_date = pd.to_datetime(INCEPTION_DATE, format=DATE_FORMAT)

    scenario_df = generate_scenario(
        start_date.strftime(DATE_FORMAT),
        end_date.strftime(DATE_FORMAT),
        self.latest_df,
        countries,
        scenario="MAX",
    )
    self.assertIsNotNone(scenario_df)
    # assertCountEqual compares the elements, regardless of order
    self.assertCountEqual(countries, scenario_df.CountryName.unique(), "Not the requested countries")

    # +1 because both inception date and end date are included
    expected_days = (end_date - inception_date).days + 1
    self.assertEqual(expected_days, len(scenario_df), "Expected the number of days between inception and end date")

    # Sum of per-column means over the last rows must equal the sum of the max values
    last_npis = scenario_df.tail(nb_days)[NPI_COLUMNS]
    self.assertEqual(
        sum(MAX_NPIS),
        last_npis.mean().sum(),
        f"Expected the last {nb_days} rows to have NPIs set to their max value",
    )
def test_generate_scenario_future_from_last_known_date_max(self):
    """"MAX" scenario for Italy until 2020-12-31, with no start date.

    Verification is delegated to ``_check_future`` with ``MAX_NPIS`` as the
    expected values.
    """
    country = "Italy"
    end_date_str = "2020-12-31"

    scenario_df = generate_scenario(None, end_date_str, self.latest_df, [country], scenario="MAX")

    country_df = scenario_df[scenario_df.CountryName == country]
    self._check_future(
        start_date_str=None,
        end_date_str=end_date_str,
        scenario_df=country_df,
        scenario_npis=MAX_NPIS,
        country=country,
    )
def test_generate_scenario_mind_the_gap_min(self):
    """"MIN" scenario for Italy over January 2021.

    Verifies the row count from inception to the end date and that the last
    31 rows have all NPIs equal to 0.
    """
    countries = ["Italy"]
    scenario_df = generate_scenario("2021-01-01", "2021-01-31", self.latest_df, countries, scenario="MIN")

    self.assertIsNotNone(scenario_df)
    # assertCountEqual compares the elements, regardless of order
    self.assertCountEqual(countries, scenario_df.CountryName.unique(), "Not the requested countries")

    # Inception is 2020-01-01: 366 days for 2020 + 31 for January 2021
    self.assertEqual(397, len(scenario_df), "Expected the number of days between inception and end date")

    # All NPIs of the last 31 rows must add up to 0
    january_npis = scenario_df.tail(31)[NPI_COLUMNS]
    self.assertEqual(0, january_npis.sum().sum(), "Expected the last 31 rows to have NPIs set to 0")
def test_generate_scenario_future_max(self):
    """"MAX" scenario for Italy from 2020-07-01 to 2020-12-31.

    Verification is delegated to ``_check_future`` with ``MAX_NPIS`` as the
    expected values.
    """
    country = "Italy"
    start_date_str = "2020-07-01"
    end_date_str = "2020-12-31"

    scenario_df = generate_scenario(start_date_str, end_date_str, self.latest_df, [country], scenario="MAX")

    country_df = scenario_df[scenario_df.CountryName == country]
    self._check_future(
        start_date_str=start_date_str,
        end_date_str=end_date_str,
        scenario_df=country_df,
        scenario_npis=MAX_NPIS,
        country=country,
    )
def test_generate_scenario_future_custom(self):
    """Custom scenario for Italy from 2020-07-01 to 2020-12-31, with
    ``ONE_NPIS`` applied on every day of the period; verification is
    delegated to ``_check_future``.
    """
    country = "Italy"
    start_date_str = "2020-07-01"
    end_date_str = "2020-12-31"

    # One ONE_NPIS entry per day, start date included
    first_day = pd.to_datetime(start_date_str, format=DATE_FORMAT)
    last_day = pd.to_datetime(end_date_str, format=DATE_FORMAT)
    day_count = (last_day - first_day).days + 1
    scenario = [ONE_NPIS] * day_count

    scenario_df = generate_scenario(start_date_str, end_date_str, self.latest_df, [country], scenario=scenario)

    self._check_future(
        start_date_str=start_date_str,
        end_date_str=end_date_str,
        scenario_df=scenario_df[scenario_df.CountryName == country],
        scenario_npis=scenario[0],
        country=country,
    )
def test_generate_scenario_future_from_last_known_date_custom(self):
    """Custom NPIs for Italy, from the day after its last known date until
    2020-12-31; verification is delegated to ``_check_future``.
    """
    countries = ["Italy"]
    end_date_str = "2020-12-31"

    italy_dates = self.latest_df[self.latest_df.CountryName == "Italy"].Date
    first_day = italy_dates.max() + np.timedelta64(1, 'D')
    last_day = pd.to_datetime(end_date_str, format=DATE_FORMAT)

    # One ONE_NPIS entry per day, first day included
    day_count = (last_day - first_day).days + 1
    scenario = [ONE_NPIS] * day_count

    scenario_df = generate_scenario(None, end_date_str, self.latest_df, countries, scenario=scenario)

    self._check_future(
        start_date_str=None,
        end_date_str=end_date_str,
        scenario_df=scenario_df,
        scenario_npis=scenario[0],
        country=countries[0],
    )
def test_generate_scenario_mind_the_gap_custom(self):
    """Custom scenario for Italy, starting 7 days after its last known date
    and ending 31 days later, with every NPI set to 1.

    Verifies the row count from inception to the end date and that the last
    31 rows have all NPIs equal to 1.
    """
    nb_days = 31
    start_lag = 7
    countries = ["Italy"]

    italy_rows = self.latest_df[self.latest_df.CountryName == countries[0]]
    start_date = italy_rows.Date.max() + timedelta(days=start_lag)
    end_date = start_date + timedelta(days=nb_days)
    inception_date = pd.to_datetime(INCEPTION_DATE, format=DATE_FORMAT)

    # ONE_NPIS for each day between start date and end date,
    # as well as for the days since the last known date
    scenario = [ONE_NPIS] * (nb_days + start_lag)
    scenario_df = generate_scenario(
        start_date.strftime(DATE_FORMAT),
        end_date.strftime(DATE_FORMAT),
        self.latest_df,
        countries,
        scenario=scenario,
    )
    self.assertIsNotNone(scenario_df)
    # assertCountEqual compares the elements, regardless of order
    self.assertCountEqual(countries, scenario_df.CountryName.unique(), "Not the requested countries")

    # +1 because both inception date and end date are included
    expected_days = (end_date - inception_date).days + 1
    self.assertEqual(expected_days, len(scenario_df), "Expected the number of days between inception and end date")

    # A mean of means equal to 1 over the last rows
    last_npis = scenario_df.tail(nb_days)[NPI_COLUMNS]
    self.assertEqual(
        1,
        last_npis.mean().mean(),
        f"Expected the last {nb_days} rows to have all NPIs set to 1",
    )
# Evaluate some example submissions. # __NOTE: Please run the corresponding example notebooks first in order to train the models that are used in this section.__ # In[ ]: IP_FILE = "predictions/robojudge_test_scenario.csv" predictions = {} # In[ ]: from covid_xprize.validation.scenario_generator import generate_scenario countries = None scenario_df = generate_scenario(start_date_str, end_date_str, latest_df, countries, scenario="Freeze") # Remove countries that weren't there at the beginning of the challenge or duplicates ignore_countries = ["Tonga", "United States Virgin Islands"] scenario_df = scenario_df[scenario_df.CountryName.isin(ignore_countries) == False] # IP_FILE = "covid_xprize/validation/data/2020-09-30_historical_ip.csv" scenario_df.to_csv(IP_FILE, index=False) # ## Linear # In[ ]: # Check a model has been trained if not os.path.isfile(