def test_update_and_load(tmp_path: pathlib.Path, nyc_fips, nyc_region):
    us_combined_df = combined_datasets.load_us_timeseries_dataset().combined_df
    # restricting the datasets being persisted to one county to speed up tests a bit.
    nyc_combined_df = us_combined_df.loc[us_combined_df[CommonFields.FIPS] == nyc_fips, :]
    multiregion_timeseries_nyc = MultiRegionTimeseriesDataset.from_combined_dataframe(
        nyc_combined_df
    )
    latest_nyc = LatestValuesDataset(multiregion_timeseries_nyc.latest_data_with_fips.reset_index())
    latest_nyc_record = latest_nyc.get_record_for_fips(nyc_fips)
    assert latest_nyc_record[CommonFields.POPULATION] > 1_000_000
    assert latest_nyc_record[CommonFields.LOCATION_ID]

    combined_dataset_utils.update_data_public_head(
        tmp_path,
        latest_dataset=latest_nyc,
        timeseries_dataset=multiregion_timeseries_nyc,
    )

    timeseries_loaded = combined_datasets.load_us_timeseries_dataset(pointer_directory=tmp_path)
    latest_loaded = combined_datasets.load_us_latest_dataset(pointer_directory=tmp_path)
    assert latest_loaded.get_record_for_fips(nyc_fips) == latest_nyc_record
    assert_combined_like(timeseries_loaded, multiregion_timeseries_nyc)

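# The tests in this file rely on pytest fixtures such as `nyc_fips` and `nyc_region` that are
# defined elsewhere in the repository (e.g. a conftest.py). The sketch below is an illustrative
# assumption of what they provide, based on the FIPS code 36061 (New York County) that appears
# in the expected outputs of these tests; it is not the repository's actual fixture code.
import pytest

@pytest.fixture
def nyc_fips():
    return "36061"  # New York County, NY

@pytest.fixture
def nyc_region(nyc_fips):
    return Region.from_fips(nyc_fips)
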
def _cache_global_datasets():
    # Populate the cache for the combined latest and timeseries datasets. Caching pre-fork
    # ensures the cache is already populated for subprocesses. The return values are not
    # needed; the only goal is to populate the cache.
    combined_datasets.load_us_latest_dataset()
    combined_datasets.load_us_timeseries_dataset()
    infer_icu.get_region_weight_map()

def test_fips_metadata(nyc_fips):
    combined_datasets.load_us_timeseries_dataset()
    fitter = initial_conditions_fitter.InitialConditionsFitter(nyc_fips)
    assert fitter.state == "NY"
    assert fitter.county == "New York County"
    assert fitter.data_start_date == pd.Timestamp("2020-03-01")
    # Checking to make sure that y is a numpy array rather than a pandas DF.
    assert isinstance(fitter.y, numpy.ndarray)

def generate_api_v2(model_output_dir, output, aggregation_level, state, fips):
    """The entry function for invocation"""
    # Caching load of us timeseries dataset
    combined_datasets.load_us_timeseries_dataset()
    active_states = [state.abbr for state in us.STATES]
    active_states = active_states + ["PR", "MP"]

    # Load all API Regions
    regions = combined_datasets.get_subset_regions(
        aggregation_level=aggregation_level,
        exclude_county_999=True,
        state=state,
        fips=fips,
        states=active_states,
    )
    _logger.info("Loading all regional inputs.")

    icu_data_path = model_output_dir / SummaryArtifact.ICU_METRIC_COMBINED.value
    icu_data = MultiRegionTimeseriesDataset.from_csv(icu_data_path)
    icu_data_map = dict(icu_data.iter_one_regions())

    rt_data_path = model_output_dir / SummaryArtifact.RT_METRIC_COMBINED.value
    rt_data = MultiRegionTimeseriesDataset.from_csv(rt_data_path)
    rt_data_map = dict(rt_data.iter_one_regions())

    regions_data = combined_datasets.load_us_timeseries_dataset().get_regions_subset(regions)
    regional_inputs = [
        api_v2_pipeline.RegionalInput.from_one_regions(
            region,
            regional_data,
            icu_data=icu_data_map.get(region),
            rt_data=rt_data_map.get(region),
        )
        for region, regional_data in regions_data.iter_one_regions()
    ]
    _logger.info("Finished loading all regional inputs.")

    # Build all region timeseries API Output objects.
    _logger.info("Generating all API Timeseries")
    all_timeseries = api_v2_pipeline.run_on_regions(regional_inputs)

    api_v2_pipeline.deploy_single_level(all_timeseries, AggregationLevel.COUNTY, output)
    api_v2_pipeline.deploy_single_level(all_timeseries, AggregationLevel.STATE, output)
    _logger.info("Finished API generation.")

def generate_api(input_dir, output, summary_output, aggregation_level, state, fips):
    """The entry function for invocation"""
    # Caching load of us timeseries dataset
    combined_datasets.load_us_timeseries_dataset()
    active_states = [state.abbr for state in us.STATES]
    active_states = active_states + ["PR", "MP"]
    regions = combined_datasets.get_subset_regions(
        aggregation_level=aggregation_level,
        exclude_county_999=True,
        state=state,
        fips=fips,
        states=active_states,
    )

    icu_data_path = input_dir / SummaryArtifact.ICU_METRIC_COMBINED.value
    icu_data = MultiRegionTimeseriesDataset.from_csv(icu_data_path)
    rt_data_path = input_dir / SummaryArtifact.RT_METRIC_COMBINED.value
    rt_data = MultiRegionTimeseriesDataset.from_csv(rt_data_path)

    for intervention in list(Intervention):
        _logger.info(f"Running intervention {intervention.name}")
        _load_input = functools.partial(
            api_pipeline.RegionalInput.from_region_and_intervention,
            intervention=intervention,
            rt_data=rt_data,
            icu_data=icu_data,
        )
        with multiprocessing.Pool(maxtasksperchild=1) as pool:
            regional_inputs = pool.map(_load_input, regions)
        _logger.info(f"Loaded {len(regional_inputs)} regions.")
        all_timeseries = api_pipeline.run_on_all_regional_inputs_for_intervention(regional_inputs)

        county_timeseries = [
            output
            for output in all_timeseries
            if output.aggregate_level is AggregationLevel.COUNTY
        ]
        api_pipeline.deploy_single_level(intervention, county_timeseries, summary_output, output)
        state_timeseries = [
            output for output in all_timeseries if output.aggregate_level is AggregationLevel.STATE
        ]
        api_pipeline.deploy_single_level(intervention, state_timeseries, summary_output, output)

def test_generate_timeseries_for_fips(
    nyc_model_output_path, nyc_region, nyc_rt_dataset, nyc_icu_dataset
):
    us_latest = combined_datasets.load_us_latest_dataset()
    us_timeseries = combined_datasets.load_us_timeseries_dataset()

    nyc_latest = us_latest.get_record_for_fips(nyc_region.fips)
    nyc_timeseries = us_timeseries.get_one_region(nyc_region)
    metrics_series, latest_metric = api_v2_pipeline.generate_metrics_and_latest(
        nyc_timeseries, nyc_rt_dataset, nyc_icu_dataset
    )
    risk_levels = top_level_metric_risk_levels.calculate_risk_level_from_metrics(latest_metric)
    region_summary = build_api_v2.build_region_summary(nyc_latest, latest_metric, risk_levels)
    region_timeseries = build_api_v2.build_region_timeseries(
        region_summary, nyc_timeseries, metrics_series
    )

    summary = build_api_v2.build_region_summary(nyc_latest, latest_metric, risk_levels)

    assert summary.dict() == region_timeseries.region_summary.dict()
    # Double checking that serialized json does not contain NaNs, all values should
    # be serialized using the simplejson wrapper.
    assert "NaN" not in region_timeseries.json()

def generate_top_counties(disable_validation, input_dir, output, state, fips):
    """The entry function for invocation"""
    intervention = Intervention.SELECTED_INTERVENTION
    active_states = [state.abbr for state in us.STATES]
    us_latest = combined_datasets.load_us_latest_dataset().get_subset(
        AggregationLevel.COUNTY, states=active_states, state=state, fips=fips
    )
    us_timeseries = combined_datasets.load_us_timeseries_dataset().get_subset(
        AggregationLevel.COUNTY, states=active_states, state=state, fips=fips
    )

    def sort_func(output: RegionSummaryWithTimeseries):
        return -output.projections.totalHospitalBeds.peakShortfall

    all_timeseries = api_pipeline.run_on_all_fips_for_intervention(
        us_latest,
        us_timeseries,
        Intervention.SELECTED_INTERVENTION,
        input_dir,
        sort_func=sort_func,
        limit=100,
    )
    bulk_timeseries = AggregateRegionSummaryWithTimeseries(__root__=all_timeseries)
    api_pipeline.deploy_json_api_output(
        intervention, bulk_timeseries, output, filename_override="counties_top_100.json"
    )

def run_bad_tails_filter(output_path: pathlib.Path):
    us_dataset = combined_datasets.load_us_timeseries_dataset()
    log = structlog.get_logger()
    log.info("Starting filter")
    _, dataset_out = TailFilter.run(us_dataset, CUMULATIVE_FIELDS_TO_FILTER)
    log.info("Writing output")
    dataset_out.timeseries_rows().to_csv(output_path, index=True, float_format="%.05g")

def run_population_filter(output_path: pathlib.Path):
    us_timeseries = combined_datasets.load_us_timeseries_dataset()
    log = structlog.get_logger()
    log.info("starting filter")
    ts_out = timeseries.drop_regions_without_population(
        us_timeseries, KNOWN_LOCATION_ID_WITHOUT_POPULATION, log
    )
    ts_out.to_csv(output_path)

def test_nyc_aggregation(nyc_region):
    dataset = combined_datasets.load_us_timeseries_dataset()
    data = dataset.get_one_region(nyc_region).latest
    # Check to make sure that beds occupancy rates are below 1,
    # signaling that it is properly combining occupancy rates.
    assert data["all_beds_occupancy_rate"] < 1
    assert data["icu_occupancy_rate"] < 1

def generate_api(input_dir, output, summary_output, aggregation_level, state, fips):
    """The entry function for invocation"""
    active_states = [state.abbr for state in us.STATES]
    us_latest = combined_datasets.load_us_latest_dataset().get_subset(
        aggregation_level, state=state, fips=fips, states=active_states
    )
    us_timeseries = combined_datasets.load_us_timeseries_dataset().get_subset(
        aggregation_level, state=state, fips=fips, states=active_states
    )

    for intervention in list(Intervention):
        _logger.info(f"Running intervention {intervention.name}")
        all_timeseries = api_pipeline.run_on_all_fips_for_intervention(
            us_latest, us_timeseries, intervention, input_dir
        )
        county_timeseries = [
            output
            for output in all_timeseries
            if output.aggregate_level is AggregationLevel.COUNTY
        ]
        api_pipeline.deploy_single_level(intervention, county_timeseries, summary_output, output)
        state_timeseries = [
            output for output in all_timeseries if output.aggregate_level is AggregationLevel.STATE
        ]
        api_pipeline.deploy_single_level(intervention, state_timeseries, summary_output, output)

def test_build_api_output_for_intervention(nyc_fips, nyc_model_output_path, tmp_path):
    county_output = tmp_path / "county"
    us_latest = combined_datasets.load_us_latest_dataset()
    us_timeseries = combined_datasets.load_us_timeseries_dataset()

    nyc_latest = us_latest.get_subset(None, fips=nyc_fips)
    nyc_timeseries = us_timeseries.get_subset(None, fips=nyc_fips)
    all_timeseries_api = api_pipeline.run_on_all_fips_for_intervention(
        nyc_latest, nyc_timeseries, Intervention.STRONG_INTERVENTION, nyc_model_output_path.parent
    )

    api_pipeline.deploy_single_level(
        Intervention.STRONG_INTERVENTION, all_timeseries_api, tmp_path, county_output
    )
    expected_outputs = [
        "counties.STRONG_INTERVENTION.timeseries.json",
        "counties.STRONG_INTERVENTION.csv",
        "counties.STRONG_INTERVENTION.timeseries.csv",
        "counties.STRONG_INTERVENTION.json",
        "county/36061.STRONG_INTERVENTION.json",
        "county/36061.STRONG_INTERVENTION.timeseries.json",
    ]

    output_paths = [
        str(path.relative_to(tmp_path)) for path in tmp_path.glob("**/*") if not path.is_dir()
    ]
    assert sorted(output_paths) == sorted(expected_outputs)

def generate_test_positivity(
    test_positivity_all_methods: pathlib.Path,
    final_result: pathlib.Path,
    output_dir: pathlib.Path,
    state: Optional[str],
    fips: Optional[str],
):
    if state:
        active_states = [state]
    else:
        active_states = [state.abbr for state in us.STATES]
        active_states = active_states + ["PR", "MP"]
    selected_dataset = combined_datasets.load_us_timeseries_dataset().get_subset(
        exclude_county_999=True,
        states=active_states,
        fips=fips,
    )
    test_positivity_results = test_positivity.AllMethods.run(selected_dataset)
    _write_dataset_map(
        output_dir / test_positivity_all_methods, test_positivity_results.all_methods_datasets
    )
    test_positivity_results.test_positivity.timeseries_rows().to_csv(
        output_dir / final_result, index=True, float_format="%.05g"
    )

def test_generate_timeseries_for_fips(
    include_projections, nyc_model_output_path, nyc_region, nyc_rt_dataset, nyc_icu_dataset,
):
    us_latest = combined_datasets.load_us_latest_dataset()
    us_timeseries = combined_datasets.load_us_timeseries_dataset()

    nyc_latest = us_latest.get_record_for_fips(nyc_region.fips)
    nyc_timeseries = us_timeseries.get_one_region(nyc_region)
    intervention = Intervention.OBSERVED_INTERVENTION
    model_output = CANPyseirLocationOutput.load_from_path(nyc_model_output_path)

    metrics_series, latest_metric = api_pipeline.generate_metrics_and_latest(
        nyc_timeseries, nyc_rt_dataset, nyc_icu_dataset
    )
    region_summary = generate_api.generate_region_summary(nyc_latest, latest_metric, model_output)
    region_timeseries = generate_api.generate_region_timeseries(
        region_summary, nyc_timeseries, metrics_series, model_output
    )

    summary = generate_api.generate_region_summary(nyc_latest, latest_metric, model_output)

    assert summary.dict() == region_timeseries.region_summary.dict()
    # Double checking that serialized json does not contain NaNs, all values should
    # be serialized using the simplejson wrapper.
    assert "NaN" not in region_timeseries.json()

def build_all(states, output_dir, level, fips, location_id_matches: str, generate_api_v2: bool):
    # split columns by ',' and remove whitespace
    states = [c.strip() for c in states]
    states = [us.states.lookup(state).abbr for state in states]
    states = [state for state in states if state in ALL_STATES]

    # prepare data
    _cache_global_datasets()

    regions_dataset = combined_datasets.load_us_timeseries_dataset().get_subset(
        fips=fips,
        aggregation_level=level,
        exclude_county_999=True,
        states=states,
        location_id_matches=location_id_matches,
    )
    regions = [one_region for _, one_region in regions_dataset.iter_one_regions()]

    root.info(f"Executing pipeline for {len(regions)} regions")
    region_pipelines: List[OneRegionPipeline] = parallel_utils.parallel_map(
        OneRegionPipeline.run, regions
    )
    region_pipelines = _patch_nola_infection_rate_in_pipelines(region_pipelines)

    model_output = pyseir.run.PyseirOutputDatasets.from_pipeline_output(region_pipelines)
    model_output.write(output_dir, root)

    if generate_api_v2:
        api_v2_pipeline.generate_from_loaded_data(model_output, output_dir, regions_dataset, root)

def test_combined_county_has_some_data(fips):
    region_data = combined_datasets.load_us_timeseries_dataset().get_one_region(
        Region.from_fips(fips)
    )
    assert region_data.data[CommonFields.POSITIVE_TESTS].all()
    assert region_data.data[CommonFields.NEGATIVE_TESTS].all()
    assert region_data.latest[CommonFields.DEATHS] > 1

def save_combined_csv(csv_path_format, output_dir):
    """Save the combined datasets DataFrame, cleaned up for easier comparisons."""
    csv_path = form_path_name(csv_path_format, output_dir)

    timeseries = combined_datasets.load_us_timeseries_dataset()
    timeseries_data = timeseries.data

    common_df.write_csv(timeseries_data, csv_path, structlog.get_logger())

def test_build_summary_for_fips(
    include_model_output, rt_null, nyc_region, nyc_icu_dataset, nyc_rt_dataset
):
    us_latest = combined_datasets.load_us_latest_dataset()
    us_timeseries = combined_datasets.load_us_timeseries_dataset()

    nyc_latest = us_latest.get_record_for_fips(nyc_region.fips)
    model_output = None
    expected_projections = None

    if include_model_output:
        if rt_null:
            nyc_rt_dataset = None
    else:
        nyc_icu_dataset = None
        nyc_rt_dataset = None

    fips_timeseries = us_timeseries.get_one_region(nyc_region)
    metrics_series, latest_metric = api_v2_pipeline.generate_metrics_and_latest(
        fips_timeseries, nyc_rt_dataset, nyc_icu_dataset
    )
    risk_levels = top_level_metric_risk_levels.calculate_risk_level_from_metrics(latest_metric)
    assert latest_metric
    summary = build_api_v2.build_region_summary(nyc_latest, latest_metric, risk_levels)
    expected = RegionSummary(
        population=nyc_latest["population"],
        state="NY",
        country="USA",
        level="county",
        county="New York County",
        fips="36061",
        lat=None,
        long=None,
        metrics=latest_metric,
        riskLevels=risk_levels,
        actuals=Actuals(
            cases=nyc_latest["cases"],
            deaths=nyc_latest["deaths"],
            positiveTests=nyc_latest["positive_tests"],
            negativeTests=nyc_latest["negative_tests"],
            hospitalBeds={
                "capacity": nyc_latest["max_bed_count"],
                "currentUsageCovid": None,
                "currentUsageTotal": None,
                "typicalUsageRate": nyc_latest["all_beds_occupancy_rate"],
            },
            icuBeds={
                "capacity": nyc_latest["icu_beds"],
                "totalCapacity": nyc_latest["icu_beds"],
                "currentUsageCovid": None,
                "currentUsageTotal": None,
                "typicalUsageRate": nyc_latest["icu_occupancy_rate"],
            },
            contactTracers=nyc_latest["contact_tracers_count"],
        ),
        lastUpdatedDate=datetime.datetime.utcnow(),
    )
    assert expected.dict() == summary.dict()

def il_regional_input(rt_dataset, icu_dataset):
    region = Region.from_state("IL")
    regional_data = combined_datasets.load_us_timeseries_dataset().get_regions_subset([region])
    regional_data = test_positivity.run_and_maybe_join_columns(
        regional_data, structlog.get_logger()
    )
    return api_v2_pipeline.RegionalInput.from_region_and_model_output(
        region, regional_data, rt_dataset, icu_dataset
    )

def il_regional_input(rt_dataset, icu_dataset):
    region = Region.from_state("IL")
    regional_data = combined_datasets.load_us_timeseries_dataset().get_regions_subset([region])
    # TODO(tom): add test positivity back in after PR 728 is merged.
    # test_positivity_results = test_positivity.AllMethods.run(regional_data)
    # regional_data = regional_data.join_columns(test_positivity_results.test_positivity)
    return api_v2_pipeline.RegionalInput.from_region_and_model_output(
        region, regional_data, rt_dataset, icu_dataset
    )

def test_update_and_load(tmp_path: pathlib.Path, nyc_fips, nyc_region):
    # restricting the datasets being persisted to one county to speed up tests a bit.
    multiregion_timeseries_nyc = combined_datasets.load_us_timeseries_dataset().get_regions_subset(
        [nyc_region]
    )
    one_region_nyc = multiregion_timeseries_nyc.get_one_region(nyc_region)
    assert one_region_nyc.latest[CommonFields.POPULATION] > 1_000_000
    assert one_region_nyc.region.location_id

    combined_dataset_utils.persist_dataset(
        multiregion_timeseries_nyc, tmp_path,
    )

    timeseries_loaded = combined_datasets.load_us_timeseries_dataset(pointer_directory=tmp_path)
    one_region_loaded = timeseries_loaded.get_one_region(nyc_region)
    assert one_region_nyc.latest == pytest.approx(one_region_loaded.latest)
    test_helpers.assert_dataset_like(
        timeseries_loaded, multiregion_timeseries_nyc, drop_na_timeseries=True
    )

def generate_test_positivity(test_positivity_all_methods: pathlib.Path):
    active_states = [state.abbr for state in us.STATES]
    active_states = active_states + ["PR", "MP"]
    regions = combined_datasets.get_subset_regions(
        exclude_county_999=True, states=active_states,
    )
    regions_data = combined_datasets.load_us_timeseries_dataset().get_regions_subset(regions)
    test_positivity_results = test_positivity.AllMethods.run(regions_data)
    test_positivity_results.write(test_positivity_all_methods)

def test_persist_and_load_dataset(tmp_path, nyc_fips):
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = dataset.get_subset(None, fips=nyc_fips)

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)

    downloaded_dataset = pointer.load_dataset()
    differ_l = DatasetDiff.make(downloaded_dataset.data)
    differ_r = DatasetDiff.make(timeseries_nyc.data)
    differ_l.compare(differ_r)
    assert not len(differ_l.my_ts)

def test_combined_county_has_some_timeseries_data(fips):
    region = Region.from_fips(fips)
    latest = combined_datasets.load_us_timeseries_dataset().get_one_region(region)
    df = latest.data.set_index(CommonFields.DATE)
    assert df.loc["2020-05-01", CommonFields.CASES] > 0
    assert df.loc["2020-05-01", CommonFields.DEATHS] > 0
    if fips.startswith("06"):
        # TODO(tom): Remove this condition when we have county data in TX too.
        assert df.loc["2020-05-01", CommonFields.POSITIVE_TESTS] > 0
        assert df.loc["2020-05-01", CommonFields.NEGATIVE_TESTS] > 0
        assert df.loc["2020-05-01", CommonFields.CURRENT_ICU] > 0

def test_persist_and_load_dataset(tmp_path, nyc_fips):
    region = Region.from_fips(nyc_fips)
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = TimeseriesDataset(dataset.get_one_region(region).data)

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)

    downloaded_dataset = pointer.load_dataset()
    differ_l = DatasetDiff.make(downloaded_dataset.data)
    differ_r = DatasetDiff.make(timeseries_nyc.data)
    differ_l.compare(differ_r)
    assert not len(differ_l.my_ts)

def test_update_and_load(tmp_path: pathlib.Path, nyc_fips):
    latest = combined_datasets.load_us_latest_dataset()
    timeseries_dataset = combined_datasets.load_us_timeseries_dataset()

    # restricting the datasets being persisted to one county to speed up tests a bit.
    latest_nyc = latest.get_subset(None, fips=nyc_fips)
    timeseries_nyc = timeseries_dataset.get_subset(None, fips=nyc_fips)

    combined_dataset_utils.update_data_public_head(
        tmp_path,
        latest_dataset=latest_nyc,
        timeseries_dataset=timeseries_nyc,
    )

    timeseries = combined_datasets.load_us_timeseries_dataset(pointer_directory=tmp_path)
    latest = combined_datasets.load_us_latest_dataset(pointer_directory=tmp_path)

    assert latest.get_record_for_fips(nyc_fips) == latest_nyc.get_record_for_fips(nyc_fips)

def test_persist_and_load_dataset(tmp_path, nyc_fips):
    region = Region.from_fips(nyc_fips)
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = dataset.get_regions_subset([region])

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)

    downloaded_dataset = MultiRegionDataset.read_from_pointer(pointer)
    differ_l = DatasetDiff.make(downloaded_dataset.timeseries)
    differ_r = DatasetDiff.make(timeseries_nyc.timeseries)
    differ_l.compare(differ_r)
    assert not len(differ_l.my_ts)

def test_output_no_timeseries_rows(nyc_regional_input, tmp_path):
    # Creating a new regional input with an empty timeseries dataset
    timeseries = nyc_regional_input.timeseries
    one_region = combined_datasets.load_us_timeseries_dataset().get_one_region(
        nyc_regional_input.region
    )
    regional_input = api_v2_pipeline.RegionalInput(
        nyc_regional_input.region, one_region, None, None
    )
    assert not regional_input.timeseries.empty

    all_timeseries_api = api_v2_pipeline.run_on_regions([regional_input])

    assert all_timeseries_api

def il_regional_input_empty_test_positivity_column(rt_dataset, icu_dataset):
    region = Region.from_state("IL")
    regional_data = combined_datasets.load_us_timeseries_dataset().get_regions_subset([region])
    empty_test_positivity = timeseries.MultiRegionTimeseriesDataset.from_timeseries_df(
        pd.DataFrame(
            [], columns=[CommonFields.LOCATION_ID, CommonFields.DATE, CommonFields.TEST_POSITIVITY]
        )
    )
    regional_data = regional_data.join_columns(empty_test_positivity)
    return api_v2_pipeline.RegionalInput.from_region_and_model_output(
        region, regional_data, rt_dataset, icu_dataset
    )

def calculate_case_based_weights() -> dict:
    LOOKBACK_DAYS = 31
    SUMMED_CASES_LABEL = "summed_cases"
    cutoff_date = pd.Timestamp.now() - pd.Timedelta(days=LOOKBACK_DAYS)

    us_dataset = combined_datasets.load_us_timeseries_dataset()
    region_groupby = us_dataset.get_counties_and_places(after=cutoff_date).groupby_region()
    last_month_cum_cases = region_groupby[CommonFields.CASES].apply(_quantile_range)
    last_month_cum_cases.name = SUMMED_CASES_LABEL

    df = last_month_cum_cases.reset_index().dropna()
    timeseries._add_fips_if_missing(df)
    # Example location_id value = 'iso1:us#iso2:us-ak#fips:02013'
    df["state_location_id"] = df[CommonFields.LOCATION_ID.value].str.split("#").str[1]

    # Normalize the cases based on the groupby total
    df["weight"] = df.groupby("state_location_id")[SUMMED_CASES_LABEL].transform(
        lambda x: x / x.sum()
    )
    df["weight"] = df["weight"].round(4)

    # Convert to dict mapping
    output = df.set_index(CommonFields.FIPS.value)["weight"].to_dict()

    # Set the default weight to 0 for the few counties with no cases in the window of interest
    all_county_fips = {
        region.fips
        for region, _ in combined_datasets.load_us_timeseries_dataset()
        .get_subset(aggregation_level=AggregationLevel.COUNTY, exclude_county_999=True)
        .iter_one_regions()
    }
    for fips in all_county_fips:
        if fips not in output:
            output[fips] = 0
    return output
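
# For reference, calculate_case_based_weights() returns a dict keyed by county FIPS code
# whose weights are normalized within each state (per the groupby transform above), with
# counties lacking recent cases defaulting to 0. The values below are purely hypothetical
# and only illustrate the expected shape of the result:
#
# {
#     "36061": 0.1275,  # New York County's share of recent cases within New York state
#     "36047": 0.2040,
#     "02013": 0.0,     # a county with no cases in the lookback window
#     ...
# }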