def _cache_global_datasets():
    """Warm the process-wide dataset caches before worker processes are forked."""
    # Populate cache for combined latest and timeseries. Caching pre-fork
    # will make sure cache is populated for subprocesses. Return value
    # is not needed as the only goal is to populate the cache.
    combined_datasets.load_us_latest_dataset()
    combined_datasets.load_us_timeseries_dataset()
    infer_icu.get_region_weight_map()
def generate_api(input_dir, output, summary_output, aggregation_level, state, fips):
    """The entry function for invocation.

    Loads the combined latest and timeseries datasets restricted to the
    requested subset, then, for every intervention, builds the API timeseries
    and deploys county-level and state-level outputs.
    """
    # Renamed comprehension variable so it no longer shadows the `state` parameter.
    active_states = [s.abbr for s in us.STATES]
    us_latest = combined_datasets.load_us_latest_dataset().get_subset(
        aggregation_level, state=state, fips=fips, states=active_states
    )
    us_timeseries = combined_datasets.load_us_timeseries_dataset().get_subset(
        aggregation_level, state=state, fips=fips, states=active_states
    )
    # An Enum is directly iterable; wrapping in list() was unnecessary.
    for intervention in Intervention:
        _logger.info(f"Running intervention {intervention.name}")
        all_timeseries = api_pipeline.run_on_all_fips_for_intervention(
            us_latest, us_timeseries, intervention, input_dir
        )
        # County and state outputs were deployed with identical logic; loop over
        # the two levels (county first, preserving the original deploy order).
        for level in (AggregationLevel.COUNTY, AggregationLevel.STATE):
            level_timeseries = [
                ts for ts in all_timeseries if ts.aggregate_level is level
            ]
            api_pipeline.deploy_single_level(
                intervention, level_timeseries, summary_output, output
            )
def test_update_and_load(tmp_path: pathlib.Path, nyc_fips, nyc_region):
    """Round-trip: persist NYC latest + timeseries data, reload, and compare."""
    full_df = combined_datasets.load_us_timeseries_dataset().combined_df
    # Keep only one county so the persisted fixtures stay small and the test fast.
    county_mask = full_df[CommonFields.FIPS] == nyc_fips
    county_df = full_df.loc[county_mask, :]
    nyc_timeseries = MultiRegionTimeseriesDataset.from_combined_dataframe(county_df)
    nyc_latest = LatestValuesDataset(
        nyc_timeseries.latest_data_with_fips.reset_index()
    )
    expected_record = nyc_latest.get_record_for_fips(nyc_fips)
    # Sanity-check the fixture before persisting it.
    assert expected_record[CommonFields.POPULATION] > 1_000_000
    assert expected_record[CommonFields.LOCATION_ID]

    combined_dataset_utils.update_data_public_head(
        tmp_path,
        latest_dataset=nyc_latest,
        timeseries_dataset=nyc_timeseries,
    )

    reloaded_timeseries = combined_datasets.load_us_timeseries_dataset(
        pointer_directory=tmp_path
    )
    reloaded_latest = combined_datasets.load_us_latest_dataset(
        pointer_directory=tmp_path
    )
    assert reloaded_latest.get_record_for_fips(nyc_fips) == expected_record
    assert_combined_like(reloaded_timeseries, nyc_timeseries)
def generate_top_counties(disable_validation, input_dir, output, state, fips):
    """The entry function for invocation.

    Builds API timeseries for the top 100 counties by projected hospital-bed
    shortfall under the selected intervention and deploys them as a single
    bulk JSON file.

    NOTE(review): `disable_validation` is accepted but never used here —
    presumably consumed by the CLI wrapper; confirm before removing.
    """
    intervention = Intervention.SELECTED_INTERVENTION
    # Renamed comprehension variable so it no longer shadows the `state` parameter.
    active_states = [s.abbr for s in us.STATES]
    us_latest = combined_datasets.load_us_latest_dataset().get_subset(
        AggregationLevel.COUNTY, states=active_states, state=state, fips=fips
    )
    us_timeseries = combined_datasets.load_us_timeseries_dataset().get_subset(
        AggregationLevel.COUNTY, states=active_states, state=state, fips=fips
    )

    def sort_func(output: RegionSummaryWithTimeseries):
        # Negative so the largest shortfall sorts first (ascending sort).
        return -output.projections.totalHospitalBeds.peakShortfall

    # Use the `intervention` local consistently instead of repeating the
    # Intervention.SELECTED_INTERVENTION literal.
    all_timeseries = api_pipeline.run_on_all_fips_for_intervention(
        us_latest,
        us_timeseries,
        intervention,
        input_dir,
        sort_func=sort_func,
        limit=100,
    )
    bulk_timeseries = AggregateRegionSummaryWithTimeseries(__root__=all_timeseries)
    api_pipeline.deploy_json_api_output(
        intervention, bulk_timeseries, output, filename_override="counties_top_100.json"
    )
def test_combined_county_has_some_data(fips):
    """Smoke test: the combined county dataset has test and death data for `fips`."""
    latest = combined_datasets.load_us_latest_dataset().get_subset(
        AggregationLevel.COUNTY, fips=fips
    )
    # .all() requires every value in the column to be truthy (non-zero).
    assert latest.data[CommonFields.POSITIVE_TESTS].all()
    assert latest.data[CommonFields.NEGATIVE_TESTS].all()
    assert latest.get_record_for_fips(fips=fips)[CommonFields.DEATHS] > 1
def test_build_api_output_for_intervention(nyc_fips, nyc_model_output_path, tmp_path):
    """Running the API pipeline for NYC writes exactly the expected output files."""
    county_output = tmp_path / "county"

    latest = combined_datasets.load_us_latest_dataset()
    timeseries = combined_datasets.load_us_timeseries_dataset()
    nyc_latest = latest.get_subset(None, fips=nyc_fips)
    nyc_timeseries = timeseries.get_subset(None, fips=nyc_fips)

    all_timeseries_api = api_pipeline.run_on_all_fips_for_intervention(
        nyc_latest,
        nyc_timeseries,
        Intervention.STRONG_INTERVENTION,
        nyc_model_output_path.parent,
    )
    api_pipeline.deploy_single_level(
        Intervention.STRONG_INTERVENTION, all_timeseries_api, tmp_path, county_output
    )

    expected_outputs = [
        "counties.STRONG_INTERVENTION.timeseries.json",
        "counties.STRONG_INTERVENTION.csv",
        "counties.STRONG_INTERVENTION.timeseries.csv",
        "counties.STRONG_INTERVENTION.json",
        "county/36061.STRONG_INTERVENTION.json",
        "county/36061.STRONG_INTERVENTION.timeseries.json",
    ]
    # Collect every file (skipping directories) relative to the temp root.
    actual_outputs = []
    for path in tmp_path.glob("**/*"):
        if path.is_dir():
            continue
        actual_outputs.append(str(path.relative_to(tmp_path)))

    assert sorted(actual_outputs) == sorted(expected_outputs)
def test_generate_timeseries_for_fips(nyc_model_output_path, nyc_region,
                                      nyc_rt_dataset, nyc_icu_dataset):
    """Region timeseries output embeds the same summary and serializes without NaNs."""
    latest_dataset = combined_datasets.load_us_latest_dataset()
    timeseries_dataset = combined_datasets.load_us_timeseries_dataset()
    latest_record = latest_dataset.get_record_for_fips(nyc_region.fips)
    region_data = timeseries_dataset.get_one_region(nyc_region)

    timeseries_metrics, latest_metric = api_v2_pipeline.generate_metrics_and_latest(
        region_data, nyc_rt_dataset, nyc_icu_dataset
    )
    risk_levels = top_level_metric_risk_levels.calculate_risk_level_from_metrics(
        latest_metric
    )
    region_summary = build_api_v2.build_region_summary(
        latest_record, latest_metric, risk_levels
    )
    region_timeseries = build_api_v2.build_region_timeseries(
        region_summary, region_data, timeseries_metrics
    )

    # Building the summary a second time must produce an identical result,
    # and it must match the summary embedded in the timeseries output.
    rebuilt_summary = build_api_v2.build_region_summary(
        latest_record, latest_metric, risk_levels
    )
    assert rebuilt_summary.dict() == region_timeseries.region_summary.dict()
    # Double checking that serialized json does not contain NaNs, all values should
    # be serialized using the simplejson wrapper.
    assert "NaN" not in region_timeseries.json()
def test_generate_timeseries_for_fips(
    include_projections,
    nyc_model_output_path,
    nyc_region,
    nyc_rt_dataset,
    nyc_icu_dataset,
):
    """Region timeseries output embeds the same summary and serializes without NaNs."""
    us_latest = combined_datasets.load_us_latest_dataset()
    us_timeseries = combined_datasets.load_us_timeseries_dataset()
    nyc_latest = us_latest.get_record_for_fips(nyc_region.fips)
    nyc_timeseries = us_timeseries.get_one_region(nyc_region)
    # Removed unused local `intervention = Intervention.OBSERVED_INTERVENTION`;
    # nothing below consumed it.
    model_output = CANPyseirLocationOutput.load_from_path(nyc_model_output_path)
    metrics_series, latest_metric = api_pipeline.generate_metrics_and_latest(
        nyc_timeseries, nyc_rt_dataset, nyc_icu_dataset
    )
    region_summary = generate_api.generate_region_summary(
        nyc_latest, latest_metric, model_output
    )
    region_timeseries = generate_api.generate_region_timeseries(
        region_summary, nyc_timeseries, metrics_series, model_output
    )
    summary = generate_api.generate_region_summary(nyc_latest, latest_metric, model_output)

    assert summary.dict() == region_timeseries.region_summary.dict()
    # Double checking that serialized json does not contain NaNs, all values should
    # be serialized using the simplejson wrapper.
    assert "NaN" not in region_timeseries.json()
def test_build_summary_for_fips(include_model_output, rt_null, nyc_region,
                                nyc_icu_dataset, nyc_rt_dataset):
    """Region summary built for NYC matches a hand-constructed RegionSummary."""
    us_latest = combined_datasets.load_us_latest_dataset()
    us_timeseries = combined_datasets.load_us_timeseries_dataset()
    nyc_latest = us_latest.get_record_for_fips(nyc_region.fips)
    # Removed unused locals `model_output = None` and `expected_projections = None`;
    # nothing below referenced them.
    if include_model_output:
        # Drop one of the metric inputs depending on parametrization so both
        # null-Rt and null-ICU paths are exercised.
        if rt_null:
            nyc_rt_dataset = None
        else:
            nyc_icu_dataset = None

    fips_timeseries = us_timeseries.get_one_region(nyc_region)
    metrics_series, latest_metric = api_v2_pipeline.generate_metrics_and_latest(
        fips_timeseries, nyc_rt_dataset, nyc_icu_dataset
    )
    risk_levels = top_level_metric_risk_levels.calculate_risk_level_from_metrics(
        latest_metric
    )
    assert latest_metric
    summary = build_api_v2.build_region_summary(nyc_latest, latest_metric, risk_levels)
    expected = RegionSummary(
        population=nyc_latest["population"],
        state="NY",
        country="USA",
        level="county",
        county="New York County",
        fips="36061",
        lat=None,
        long=None,
        metrics=latest_metric,
        riskLevels=risk_levels,
        actuals=Actuals(
            cases=nyc_latest["cases"],
            deaths=nyc_latest["deaths"],
            positiveTests=nyc_latest["positive_tests"],
            negativeTests=nyc_latest["negative_tests"],
            hospitalBeds={
                "capacity": nyc_latest["max_bed_count"],
                "currentUsageCovid": None,
                "currentUsageTotal": None,
                "typicalUsageRate": nyc_latest["all_beds_occupancy_rate"],
            },
            icuBeds={
                "capacity": nyc_latest["icu_beds"],
                "totalCapacity": nyc_latest["icu_beds"],
                "currentUsageCovid": None,
                "currentUsageTotal": None,
                "typicalUsageRate": nyc_latest["icu_occupancy_rate"],
            },
            contactTracers=nyc_latest["contact_tracers_count"],
        ),
        lastUpdatedDate=datetime.datetime.utcnow(),
    )
    assert expected.dict() == summary.dict()
def test_update_and_load(tmp_path: pathlib.Path, nyc_fips):
    """Round-trip: persist one county's datasets, reload them, compare records."""
    full_latest = combined_datasets.load_us_latest_dataset()
    full_timeseries = combined_datasets.load_us_timeseries_dataset()

    # restricting the datasets being persisted to one county to speed up tests a bit.
    nyc_latest = full_latest.get_subset(None, fips=nyc_fips)
    nyc_timeseries = full_timeseries.get_subset(None, fips=nyc_fips)

    combined_dataset_utils.update_data_public_head(
        tmp_path,
        latest_dataset=nyc_latest,
        timeseries_dataset=nyc_timeseries,
    )

    combined_datasets.load_us_timeseries_dataset(pointer_directory=tmp_path)
    reloaded_latest = combined_datasets.load_us_latest_dataset(
        pointer_directory=tmp_path
    )
    expected_record = nyc_latest.get_record_for_fips(nyc_fips)
    assert reloaded_latest.get_record_for_fips(nyc_fips) == expected_record
def save_combined_latest_csv(csv_path_format, output_dir):
    """Save the combined datasets latest DataFrame, cleaned up for easier comparisons."""
    csv_path = form_path_name(csv_path_format, output_dir)
    latest = combined_datasets.load_us_latest_dataset()
    # This is a hacky modification of common_df.write_csv because it requires a
    # date index. Normalize pd.NA to np.nan before re-inferring dtypes so the
    # CSV output is stable across runs.
    indexed = latest.data.set_index(CommonFields.FIPS)
    normalized = indexed.replace({pd.NA: np.nan})
    cleaned = normalized.convert_dtypes()
    cleaned.to_csv(
        csv_path,
        date_format="%Y-%m-%d",
        index=True,
        float_format="%.12g",
    )
def load_all_latest_sources():
    """Return a dict of LatestValuesDataset per source, plus the combined dataset.

    Rows with synthetic/unknown FIPS codes (ending in "999" or starting with
    "90") are dropped from every source.
    """
    combined = combined_datasets.load_us_latest_dataset()
    sources_by_name = notebook_helpers.load_data_sources_by_name()
    raw_latest = {
        name: source.latest_values() for name, source in sources_by_name.items()
    }
    combined_frame = combined.data.copy()
    combined_frame["source"] = "Combined"
    raw_latest[COMBINED_DATA_KEY] = combined_frame

    sources_latest = {}
    for name, frame in raw_latest.items():
        # Drop unknown sources
        fips_col = frame[CommonFields.FIPS]
        unknown = fips_col.str.endswith("999") | fips_col.str.startswith("90")
        sources_latest[name] = LatestValuesDataset(frame.loc[~unknown])
    return sources_latest
def test_build_timeseries_and_summary_outputs(nyc_model_output_path, nyc_fips,
                                              intervention):
    """Per-intervention expectations for the NYC timeseries pipeline output."""
    latest_dataset = combined_datasets.load_us_latest_dataset()
    timeseries_dataset = combined_datasets.load_us_timeseries_dataset()
    result = api_pipeline.build_timeseries_for_fips(
        intervention,
        latest_dataset,
        timeseries_dataset,
        nyc_model_output_path.parent,
        nyc_fips,
    )

    if intervention is Intervention.NO_INTERVENTION:
        # Test data does not contain no intervention model, should not output any results.
        assert not result
        return

    assert result
    if intervention is Intervention.STRONG_INTERVENTION:
        # Strong intervention carries both projections and a timeseries.
        assert result.projections
        assert result.timeseries
    elif intervention is Intervention.OBSERVED_INTERVENTION:
        # Observed intervention carries neither.
        assert not result.projections
        assert not result.timeseries
def test_unique_index_values_us_latest():
    """The combined latest dataset must contain no duplicate index rows."""
    latest = combined_datasets.load_us_latest_dataset()
    latest_data = latest.data.set_index(latest.INDEX_FIELDS)
    duplicates = latest_data.index.duplicated()
    # .any() is the idiomatic (and short-circuiting) check for a boolean mask,
    # instead of the Python-level sum() over the array.
    assert not duplicates.any()
def save_combined_latest_csv(csv_path_format, output_dir):
    """Save the combined datasets latest DataFrame, cleaned up for easier comparisons."""
    # Expand the path template against the output directory, then write the
    # dataset's own CSV representation there.
    csv_path = form_path_name(csv_path_format, output_dir)
    latest = combined_datasets.load_us_latest_dataset()
    latest.to_csv(csv_path)
def test_build_summary_for_fips(
    include_projections,
    rt_null,
    nyc_model_output_path,
    nyc_region,
    nyc_rt_dataset,
    nyc_icu_dataset,
):
    """Region summary for NYC matches a hand-constructed RegionSummary.

    When `include_projections` is set, the summary is built from the pyseir
    model output (optionally with Rt fields nulled); otherwise no projections
    are expected. Removed leftover debug `pprint` output and the unused
    `intervention` local from the original.
    """
    us_latest = combined_datasets.load_us_latest_dataset()
    us_timeseries = combined_datasets.load_us_timeseries_dataset()
    nyc_latest = us_latest.get_record_for_fips(nyc_region.fips)
    model_output = None
    expected_projections = None

    if include_projections:
        model_output = CANPyseirLocationOutput.load_from_path(nyc_model_output_path)
        if rt_null:
            model_output.data[schema.RT_INDICATOR] = None
            model_output.data[schema.RT_INDICATOR_CI90] = None
        rt = model_output.latest_rt
        rt_ci_90 = model_output.latest_rt_ci90
        expected_projections = Projections(
            totalHospitalBeds=ResourceUsageProjection(
                peakShortfall=0,
                peakDate=datetime.date(2020, 4, 15),
                shortageStartDate=None,
            ),
            ICUBeds=None,
            Rt=rt,
            RtCI90=rt_ci_90,
        )

    fips_timeseries = us_timeseries.get_one_region(nyc_region)
    metrics_series, latest_metric = api_pipeline.generate_metrics_and_latest(
        fips_timeseries, nyc_rt_dataset, nyc_icu_dataset
    )
    assert latest_metric
    summary = generate_api.generate_region_summary(nyc_latest, latest_metric, model_output)
    expected = RegionSummary(
        population=nyc_latest["population"],
        stateName="New York",
        countyName="New York County",
        fips="36061",
        lat=None,
        long=None,
        metrics=latest_metric,
        actuals=Actuals(
            population=nyc_latest["population"],
            intervention="STRONG_INTERVENTION",
            cumulativeConfirmedCases=nyc_latest["cases"],
            cumulativeDeaths=nyc_latest["deaths"],
            cumulativePositiveTests=nyc_latest["positive_tests"],
            cumulativeNegativeTests=nyc_latest["negative_tests"],
            hospitalBeds={
                # Manually calculated from capacity calculation in generate_api.py
                "capacity": 12763,
                "totalCapacity": nyc_latest["max_bed_count"],
                "currentUsageCovid": None,
                "currentUsageTotal": None,
                "typicalUsageRate": nyc_latest["all_beds_occupancy_rate"],
            },
            ICUBeds={
                "capacity": nyc_latest["icu_beds"],
                "totalCapacity": nyc_latest["icu_beds"],
                "currentUsageCovid": None,
                "currentUsageTotal": None,
                "typicalUsageRate": nyc_latest["icu_occupancy_rate"],
            },
            contactTracers=nyc_latest["contact_tracers_count"],
        ),
        lastUpdatedDate=datetime.datetime.utcnow(),
        projections=expected_projections,
    )
    assert expected.dict() == summary.dict()