def _cache_global_datasets():
    """Warm the dataset caches so forked subprocesses start with them populated.

    The combined latest and timeseries builders are called only for their
    caching side effect; their return values are discarded.  Afterwards the
    module-level ``cds_dataset`` and ``nyt_dataset`` globals are filled in via
    their ``local()`` constructors, but only when they are still ``None``.
    """
    global nyt_dataset, cds_dataset

    # Results intentionally ignored: the only goal is to populate the cache
    # before forking.
    combined_datasets.build_us_latest_with_all_fields()
    combined_datasets.build_us_timeseries_with_all_fields()

    # Load each global at most once; an already-set value is left untouched.
    if cds_dataset is None:
        cds_dataset = CDSDataset.local()
    if nyt_dataset is None:
        nyt_dataset = NYTimesDataset.local()
def update_data_public_head(
    data_directory: pathlib.Path,
    latest_dataset: latest_values_dataset.LatestValuesDataset = None,
    timeseries_dataset: timeseries.TimeseriesDataset = None,
) -> Tuple[DatasetPointer, DatasetPointer]:
    """Persist the US latest and timeseries datasets and return their pointers.

    Args:
        data_directory: Directory passed to ``persist_dataset`` for both
            datasets.
        latest_dataset: Optionally specify a LatestValuesDataset to persist
            instead of building from head. Generally used in testing to
            sidestep building the entire dataset.
        timeseries_dataset: Optionally specify a TimeseriesDataset to persist
            instead of building from head. Generally used in testing to
            sidestep building the entire dataset.

    Returns:
        Tuple of DatasetPointers to the latest and timeseries datasets, in
        that order.
    """
    # Compare against None explicitly: a caller-supplied dataset must be
    # persisted as given even if the object happens to evaluate as falsy
    # (e.g. an empty dataset), rather than being replaced by a full rebuild.
    if latest_dataset is None:
        latest_dataset = combined_datasets.build_us_latest_with_all_fields(skip_cache=True)
    latest_pointer = persist_dataset(latest_dataset, data_directory)

    if timeseries_dataset is None:
        timeseries_dataset = combined_datasets.build_us_timeseries_with_all_fields(
            skip_cache=True
        )
    timeseries_pointer = persist_dataset(timeseries_dataset, data_directory)

    return latest_pointer, timeseries_pointer
def test_build_api_output_for_intervention(nyc_fips, nyc_model_output_path, tmp_path):
    """Deploying the STRONG_INTERVENTION API for one FIPS writes exactly the expected files.

    The pipeline is run on the subsets selected by ``nyc_fips`` and deployed
    under ``tmp_path``; the set of non-directory files found there must match
    the expected relative paths.
    """
    county_output = tmp_path / "county"

    latest_all = combined_datasets.build_us_latest_with_all_fields()
    timeseries_all = combined_datasets.build_us_timeseries_with_all_fields()
    latest_subset = latest_all.get_subset(None, fips=nyc_fips)
    timeseries_subset = timeseries_all.get_subset(None, fips=nyc_fips)

    api_outputs = api_pipeline.run_on_all_fips_for_intervention(
        latest_subset,
        timeseries_subset,
        Intervention.STRONG_INTERVENTION,
        nyc_model_output_path.parent,
    )
    api_pipeline.deploy_single_level(
        Intervention.STRONG_INTERVENTION, api_outputs, tmp_path, county_output
    )

    expected_files = [
        "counties.STRONG_INTERVENTION.timeseries.json",
        "counties.STRONG_INTERVENTION.csv",
        "counties.STRONG_INTERVENTION.timeseries.csv",
        "counties.STRONG_INTERVENTION.json",
        "county/36061.STRONG_INTERVENTION.json",
        "county/36061.STRONG_INTERVENTION.timeseries.json",
    ]

    # Collect every file (not directory) below tmp_path, relative to it.
    written_files = []
    for entry in tmp_path.glob("**/*"):
        if entry.is_dir():
            continue
        written_files.append(str(entry.relative_to(tmp_path)))

    # Order on disk is irrelevant, so compare sorted listings.
    assert sorted(written_files) == sorted(expected_files)
def generate_api(input_dir, output, summary_output, aggregation_level, state, fips):
    """The entry function for invocation"""
    # NOTE(review): docstring left unchanged in case a CLI framework surfaces
    # it as help text.
    state_abbrevs = [us_state.abbr for us_state in us.STATES]
    subset_filters = {"state": state, "fips": fips, "states": state_abbrevs}

    # Restrict both combined datasets to the requested level / state / fips.
    latest_all = combined_datasets.build_us_latest_with_all_fields()
    us_latest = latest_all.get_subset(aggregation_level, **subset_filters)
    timeseries_all = combined_datasets.build_us_timeseries_with_all_fields()
    us_timeseries = timeseries_all.get_subset(aggregation_level, **subset_filters)

    # Counties are deployed first, then states, for every intervention.
    deploy_order = (AggregationLevel.COUNTY, AggregationLevel.STATE)

    for intervention in Intervention:
        _logger.info(f"Running intervention {intervention.name}")
        all_timeseries = api_pipeline.run_on_all_fips_for_intervention(
            us_latest, us_timeseries, intervention, input_dir
        )
        for level in deploy_order:
            level_timeseries = [
                area for area in all_timeseries if area.aggregate_level is level
            ]
            api_pipeline.deploy_single_level(
                intervention, level_timeseries, summary_output, output
            )
def generate_top_counties(disable_validation, input_dir, output, state, fips):
    """The entry function for invocation"""
    # NOTE(review): docstring left unchanged in case a CLI framework surfaces
    # it as help text.  ``disable_validation`` is accepted but not used here.
    intervention = Intervention.SELECTED_INTERVENTION
    state_abbrevs = [us_state.abbr for us_state in us.STATES]
    county_filters = {"states": state_abbrevs, "state": state, "fips": fips}

    latest_all = combined_datasets.build_us_latest_with_all_fields()
    us_latest = latest_all.get_subset(AggregationLevel.COUNTY, **county_filters)
    timeseries_all = combined_datasets.build_us_timeseries_with_all_fields()
    us_timeseries = timeseries_all.get_subset(AggregationLevel.COUNTY, **county_filters)

    def by_bed_shortfall(area: CovidActNowAreaTimeseries):
        # Sort key: negated projected peak hospital-bed shortfall.
        return -area.projections.totalHospitalBeds.peakShortfall

    top_timeseries = api_pipeline.run_on_all_fips_for_intervention(
        us_latest,
        us_timeseries,
        intervention,
        input_dir,
        sort_func=by_bed_shortfall,
        limit=100,
    )

    bulk_timeseries = CovidActNowBulkTimeseries(__root__=top_timeseries)
    api_pipeline.deploy_json_api_output(
        intervention,
        bulk_timeseries,
        output,
        filename_override="counties_top_100.json",
    )
def save_combined_latest_csv(csv_path_format, output_dir):
    """Write the combined "latest" DataFrame to CSV, normalised for easy diffing.

    Args:
        csv_path_format: Passed with ``output_dir`` to ``form_path_name`` to
            obtain the destination path.
        output_dir: Directory argument for ``form_path_name``.
    """
    csv_path = form_path_name(csv_path_format, output_dir)
    latest = combined_datasets.build_us_latest_with_all_fields()

    # This is a hacky variant of common_df.write_csv, which cannot be used
    # directly because it requires a date index.
    indexed_by_fips = latest.data.set_index(CommonFields.FIPS)
    without_pd_na = indexed_by_fips.replace({pd.NA: np.nan})
    latest_data = without_pd_na.convert_dtypes()

    latest_data.to_csv(
        csv_path,
        date_format="%Y-%m-%d",
        index=True,
        float_format="%.12g",
    )
def generate_api_for_state_projection_row(projection_row) -> CovidActNowStateSummary:
    """Build the state summary API object for one projection row.

    Args:
        projection_row: Row indexed by the ``rc`` column constants (state
            full name, latitude, longitude, FIPS, last-updated).

    Returns:
        A CovidActNowStateSummary combining the row's projections with the
        state's record from the combined latest dataset.
    """
    state_full_name = projection_row[rc.STATE_FULL_NAME]
    abbrev = US_STATE_ABBREV[state_full_name]

    projections = _generate_api_for_projections(projection_row)
    intervention_for_state = get_can_projection.get_intervention_for_state(abbrev)

    latest = combined_datasets.build_us_latest_with_all_fields()
    latest_record = latest.get_record_for_state(abbrev)

    return CovidActNowStateSummary(
        population=latest_record[CommonFields.POPULATION],
        lat=projection_row[rc.LATITUDE],
        long=projection_row[rc.LONGITUDE],
        actuals=_generate_actuals(latest_record, intervention_for_state),
        stateName=state_full_name,
        fips=projection_row[rc.FIPS],
        lastUpdatedDate=_format_date(projection_row[rc.LAST_UPDATED]),
        projections=projections,
    )
def generate_state_timeseries(
    projection_row, intervention, input_dir
) -> CovidActNowStateTimeseries:
    """Build the state-level timeseries API object for one projection row.

    Args:
        projection_row: Row indexed by the ``rc`` column constants.
        intervention: Intervention whose raw model output is loaded; its
            ``name`` appears in the error raised for empty output.
        input_dir: Directory handed to ``get_can_projection.get_can_raw_data``.

    Returns:
        A CovidActNowStateTimeseries with model rows, projections and actuals.

    Raises:
        Exception: If no timeseries rows were produced.
    """
    state_full_name = projection_row[rc.STATE_FULL_NAME]
    state = US_STATE_ABBREV[state_full_name]
    fips = projection_row[rc.FIPS]

    model_rows = get_can_projection.get_can_raw_data(
        input_dir, state, fips, AggregationLevel.STATE, intervention
    )

    # Attach state testing data by date.  A left join keeps every model row
    # even when testing data is missing for the state (the original note
    # cites NE as an example).
    testing_df = get_testing_timeseries_by_state(state)
    merged = pd.DataFrame(model_rows).merge(testing_df, on="date", how="left")
    timeseries = [
        _generate_state_timeseries_row(record)
        for record in merged.to_dict(orient="records")
    ]

    projections = _generate_api_for_projections(projection_row)
    if not timeseries:
        raise Exception(f"State time series empty for {intervention.name}")

    state_intervention = get_can_projection.get_intervention_for_state(state)
    actuals_ts = combined_datasets.build_us_timeseries_with_all_fields()
    state_latest = combined_datasets.build_us_latest_with_all_fields().get_record_for_state(
        state
    )

    return CovidActNowStateTimeseries(
        population=state_latest[CommonFields.POPULATION],
        lat=projection_row[rc.LATITUDE],
        long=projection_row[rc.LONGITUDE],
        actuals=_generate_actuals(state_latest, state_intervention),
        stateName=state_full_name,
        fips=fips,
        lastUpdatedDate=_format_date(projection_row[rc.LAST_UPDATED]),
        projections=projections,
        timeseries=timeseries,
        actuals_timeseries=_generate_actuals_timeseries(
            actuals_ts.get_records_for_state(state), state_intervention
        ),
    )
def generate_county_timeseries(projection_row, intervention, input_dir):
    """Build the county-level timeseries API object for one projection row.

    Args:
        projection_row: Row indexed by the ``rc`` column constants.
        intervention: Intervention whose raw model output is loaded; its
            ``name`` appears in the error raised for empty output.
        input_dir: Directory handed to ``get_can_projection.get_can_raw_data``.

    Returns:
        A CovidActNowCountyTimeseries with model rows, projections and actuals.

    Raises:
        Exception: If no timeseries rows were produced.
    """
    state_full_name = projection_row[rc.STATE_FULL_NAME]
    state_abbrev = US_STATE_ABBREV[state_full_name]
    fips = projection_row[rc.FIPS]

    model_rows = get_can_projection.get_can_raw_data(
        input_dir, state_abbrev, fips, AggregationLevel.COUNTY, intervention
    )

    # Left join on "date" so model rows survive when testing data is absent.
    testing_df = get_testing_timeseries_by_fips(fips)
    merged = pd.DataFrame(model_rows).merge(testing_df, on="date", how="left")
    timeseries = [
        _generate_county_timeseries_row(record)
        for record in merged.to_dict(orient="records")
    ]
    if not timeseries:
        raise Exception(f"County time series empty for {intervention.name}")

    projections = _generate_api_for_projections(projection_row)
    state_intervention = get_can_projection.get_intervention_for_state(state_abbrev)
    actuals_ts = combined_datasets.build_us_timeseries_with_all_fields()
    fips_latest = combined_datasets.build_us_latest_with_all_fields().get_record_for_fips(
        fips
    )

    return CovidActNowCountyTimeseries(
        population=fips_latest[CommonFields.POPULATION],
        lat=projection_row[rc.LATITUDE],
        long=projection_row[rc.LONGITUDE],
        actuals=_generate_actuals(fips_latest, state_intervention),
        stateName=state_full_name,
        countyName=projection_row[rc.COUNTY],
        fips=fips,
        lastUpdatedDate=_format_date(projection_row[rc.LAST_UPDATED]),
        projections=projections,
        timeseries=timeseries,
        actuals_timeseries=_generate_actuals_timeseries(
            actuals_ts.get_records_for_fips(fips), state_intervention
        ),
    )
def test_generate_timeseries_for_fips(include_projections, nyc_model_output_path, nyc_fips):
    """The summary embedded in an area timeseries equals a freshly generated one.

    Also verifies that the serialized JSON contains no ``NaN`` tokens.
    """
    latest_record = combined_datasets.build_us_latest_with_all_fields().get_record_for_fips(
        nyc_fips
    )
    timeseries_subset = combined_datasets.build_us_timeseries_with_all_fields().get_subset(
        None, fips=nyc_fips
    )
    # Not referenced below; retained from the original test body.
    intervention = Intervention.OBSERVED_INTERVENTION

    model_output = CANPyseirLocationOutput.load_from_path(nyc_model_output_path)

    embedded_summary = generate_api.generate_area_summary(latest_record, model_output)
    area_timeseries = generate_api.generate_area_timeseries(
        embedded_summary, timeseries_subset, model_output
    )

    # Generate the summary a second time and compare it to the embedded copy.
    fresh_summary = generate_api.generate_area_summary(latest_record, model_output)
    assert fresh_summary.dict() == area_timeseries.area_summary.dict()

    # Double checking that serialized json does not contain NaNs; all values
    # should be serialized using the simplejson wrapper.
    serialized = area_timeseries.json()
    assert "NaN" not in serialized
def test_build_timeseries_and_summary_outputs(nyc_model_output_path, nyc_fips, intervention):
    """Check which parts of the built timeseries are populated per intervention.

    ``intervention`` is presumably supplied by parametrization (not visible
    here).
    """
    latest = combined_datasets.build_us_latest_with_all_fields()
    all_timeseries = combined_datasets.build_us_timeseries_with_all_fields()

    built = api_pipeline.build_timeseries_for_fips(
        intervention,
        latest,
        all_timeseries,
        nyc_model_output_path.parent,
        nyc_fips,
    )

    if intervention is Intervention.NO_INTERVENTION:
        # Test data does not contain a no-intervention model, so nothing
        # should be produced.
        assert not built
        return

    assert built

    if intervention is Intervention.STRONG_INTERVENTION:
        assert built.projections
        assert built.timeseries
    elif intervention is Intervention.OBSERVED_INTERVENTION:
        assert not built.projections
        assert not built.timeseries
def test_unique_index_values_us_latest():
    """The combined latest dataset has no duplicated ``INDEX_FIELDS`` keys."""
    latest = combined_datasets.build_us_latest_with_all_fields()
    indexed = latest.data.set_index(latest.INDEX_FIELDS)

    # duplicated() flags every repeat of an earlier index value; none may be set.
    duplicate_mask = indexed.index.duplicated()
    assert not duplicate_mask.any()
def _cache_global_datasets():
    """Populate the combined latest and timeseries caches.

    Intended to run before forking so that subprocesses start with the cache
    already populated.  Return values are not needed: the only goal is the
    caching side effect of the two builders.
    """
    # Latest values first, then the timeseries; results are discarded.
    combined_datasets.build_us_latest_with_all_fields()
    combined_datasets.build_us_timeseries_with_all_fields()
def test_combined_county_has_some_data(fips):
    """Positive and negative test columns are all truthy for the county subset."""
    all_latest = combined_datasets.build_us_latest_with_all_fields()
    county_latest = all_latest.get_subset(AggregationLevel.COUNTY, fips=fips)

    positive_tests = county_latest.data[CommonFields.POSITIVE_TESTS]
    assert positive_tests.all()

    negative_tests = county_latest.data[CommonFields.NEGATIVE_TESTS]
    assert negative_tests.all()
def test_build_summary_for_fips(include_projections, nyc_model_output_path, nyc_fips):
    """The generated region summary matches a hand-built expected RegionSummary.

    When ``include_projections`` is truthy the model output is loaded and the
    expected projections are filled in; otherwise both are ``None``.
    """
    nyc_latest = combined_datasets.build_us_latest_with_all_fields().get_record_for_fips(
        nyc_fips
    )

    # ``intervention`` is not referenced below; retained from the original test.
    if include_projections:
        model_output = CANPyseirLocationOutput.load_from_path(nyc_model_output_path)
        bed_projection = ResourceUsageProjection(
            peakShortfall=0,
            peakDate=datetime.date(2020, 4, 15),
            shortageStartDate=None,
        )
        expected_projections = Projections(
            totalHospitalBeds=bed_projection,
            ICUBeds=None,
            Rt=model_output.latest_rt,
            RtCI90=model_output.latest_rt_ci90,
        )
        intervention = Intervention.STRONG_INTERVENTION
    else:
        model_output = None
        expected_projections = None
        intervention = Intervention.OBSERVED_INTERVENTION

    summary = generate_api.generate_region_summary(nyc_latest, model_output)

    population = nyc_latest["population"]
    expected_hospital_beds = {
        # Manually calculated from capacity calculation in generate_api.py
        "capacity": 12763,
        "totalCapacity": nyc_latest["max_bed_count"],
        "currentUsageCovid": 0,
        "currentUsageTotal": None,
        "typicalUsageRate": nyc_latest["all_beds_occupancy_rate"],
    }
    expected_icu_beds = {
        "capacity": nyc_latest["icu_beds"],
        "totalCapacity": nyc_latest["icu_beds"],
        "currentUsageCovid": 0,
        "currentUsageTotal": 0,
        "typicalUsageRate": nyc_latest["icu_occupancy_rate"],
    }
    expected_actuals = Actuals(
        population=population,
        intervention="STRONG_INTERVENTION",
        cumulativeConfirmedCases=nyc_latest["cases"],
        cumulativeDeaths=nyc_latest["deaths"],
        cumulativePositiveTests=nyc_latest["positive_tests"],
        cumulativeNegativeTests=nyc_latest["negative_tests"],
        hospitalBeds=expected_hospital_beds,
        ICUBeds=expected_icu_beds,
        contactTracers=nyc_latest["contact_tracers_count"],
    )
    expected = RegionSummary(
        population=population,
        stateName="New York",
        countyName="New York County",
        fips="36061",
        lat=None,
        long=None,
        actuals=expected_actuals,
        lastUpdatedDate=datetime.datetime.utcnow(),
        projections=expected_projections,
    )

    # Debug aid: print expected then actual ICU bed data before comparing.
    import pprint

    for region in (expected, summary):
        pprint.pprint(region.actuals.ICUBeds.dict())

    assert expected.dict() == summary.dict()