def from_region_and_model_output(
    region: pipeline.Region,
    combined_data: MultiRegionTimeseriesDataset,
    rt_data: MultiRegionTimeseriesDataset,
    icu_data: MultiRegionTimeseriesDataset,
) -> "RegionalInput":
    one_region_data = combined_data.get_one_region(region)

    # Not all regions have Rt or ICU data due to various filters in pyseir code.
    try:
        rt_data = rt_data.get_one_region(region)
    except timeseries.RegionLatestNotFound:
        rt_data = None

    try:
        icu_data = icu_data.get_one_region(region)
    except timeseries.RegionLatestNotFound:
        icu_data = None

    return RegionalInput(
        region=region,
        _combined_data=one_region_data,
        rt_data=rt_data,
        icu_data=icu_data,
    )
def from_region_and_intervention(
    region: pipeline.Region,
    intervention: Intervention,
    rt_data: MultiRegionTimeseriesDataset,
    icu_data: MultiRegionTimeseriesDataset,
) -> "RegionalInput":
    combined_data = combined_datasets.RegionalData.from_region(region)

    try:
        rt_data = rt_data.get_one_region(region)
    except timeseries.RegionLatestNotFound:
        rt_data = None

    try:
        icu_data = icu_data.get_one_region(region)
    except timeseries.RegionLatestNotFound:
        icu_data = None

    return RegionalInput(
        region=region,
        model_output=None,
        intervention=intervention,
        _combined_data=combined_data,
        rt_data=rt_data,
        icu_data=icu_data,
    )
def generate_api_v2(model_output_dir, output, aggregation_level, state, fips):
    """The entry function for invocation"""
    # Caching load of us timeseries dataset
    combined_datasets.load_us_timeseries_dataset()

    active_states = [state.abbr for state in us.STATES]
    active_states = active_states + ["PR", "MP"]

    # Load all API Regions
    regions = combined_datasets.get_subset_regions(
        aggregation_level=aggregation_level,
        exclude_county_999=True,
        state=state,
        fips=fips,
        states=active_states,
    )

    _logger.info("Loading all regional inputs.")
    icu_data_path = model_output_dir / SummaryArtifact.ICU_METRIC_COMBINED.value
    icu_data = MultiRegionTimeseriesDataset.from_csv(icu_data_path)
    icu_data_map = dict(icu_data.iter_one_regions())

    rt_data_path = model_output_dir / SummaryArtifact.RT_METRIC_COMBINED.value
    rt_data = MultiRegionTimeseriesDataset.from_csv(rt_data_path)
    rt_data_map = dict(rt_data.iter_one_regions())

    regions_data = combined_datasets.load_us_timeseries_dataset().get_regions_subset(regions)
    regional_inputs = [
        api_v2_pipeline.RegionalInput.from_one_regions(
            region,
            regional_data,
            icu_data=icu_data_map.get(region),
            rt_data=rt_data_map.get(region),
        )
        for region, regional_data in regions_data.iter_one_regions()
    ]
    _logger.info("Finished loading all regional inputs.")

    # Build all region timeseries API Output objects.
    _logger.info("Generating all API Timeseries")
    all_timeseries = api_v2_pipeline.run_on_regions(regional_inputs)

    api_v2_pipeline.deploy_single_level(all_timeseries, AggregationLevel.COUNTY, output)
    api_v2_pipeline.deploy_single_level(all_timeseries, AggregationLevel.STATE, output)
    _logger.info("Finished API generation.")
def generate_api(input_dir, output, summary_output, aggregation_level, state, fips):
    """The entry function for invocation"""
    # Caching load of us timeseries dataset
    combined_datasets.load_us_timeseries_dataset()

    active_states = [state.abbr for state in us.STATES]
    active_states = active_states + ["PR", "MP"]

    regions = combined_datasets.get_subset_regions(
        aggregation_level=aggregation_level,
        exclude_county_999=True,
        state=state,
        fips=fips,
        states=active_states,
    )

    icu_data_path = input_dir / SummaryArtifact.ICU_METRIC_COMBINED.value
    icu_data = MultiRegionTimeseriesDataset.from_csv(icu_data_path)
    rt_data_path = input_dir / SummaryArtifact.RT_METRIC_COMBINED.value
    rt_data = MultiRegionTimeseriesDataset.from_csv(rt_data_path)

    for intervention in list(Intervention):
        _logger.info(f"Running intervention {intervention.name}")
        _load_input = functools.partial(
            api_pipeline.RegionalInput.from_region_and_intervention,
            intervention=intervention,
            rt_data=rt_data,
            icu_data=icu_data,
        )
        with multiprocessing.Pool(maxtasksperchild=1) as pool:
            regional_inputs = pool.map(_load_input, regions)
        _logger.info(f"Loaded {len(regional_inputs)} regions.")
        all_timeseries = api_pipeline.run_on_all_regional_inputs_for_intervention(regional_inputs)

        county_timeseries = [
            output for output in all_timeseries if output.aggregate_level is AggregationLevel.COUNTY
        ]
        api_pipeline.deploy_single_level(intervention, county_timeseries, summary_output, output)

        state_timeseries = [
            output for output in all_timeseries if output.aggregate_level is AggregationLevel.STATE
        ]
        api_pipeline.deploy_single_level(intervention, state_timeseries, summary_output, output)
def test_pyseir_end_to_end_idaho(tmp_path):
    # This covers a lot of edge cases.
    with unittest.mock.patch("pyseir.utils.OUTPUT_DIR", str(tmp_path)):
        fips = "16001"
        region = Region.from_fips(fips)
        pipelines = cli._build_all_for_states(states=["ID"], fips=fips)
        cli._write_pipeline_output(pipelines, tmp_path)

        icu_data_path = tmp_path / SummaryArtifact.ICU_METRIC_COMBINED.value
        icu_data = MultiRegionTimeseriesDataset.from_csv(icu_data_path)
        assert icu_data.get_one_region(region)

        rt_data_path = tmp_path / SummaryArtifact.RT_METRIC_COMBINED.value
        rt_data = MultiRegionTimeseriesDataset.from_csv(rt_data_path)
        assert rt_data.get_one_region(region)
def _fips_csv_to_one_region(csv_str: str, region: Region) -> OneRegionTimeseriesDataset:
    # Make a Timeseries first because it can have a FIPS column without location_id
    ts = TimeseriesDataset.load_csv(io.StringIO(csv_str))
    # from_timeseries_and_latest adds the location_id column needed by get_one_region
    return MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, ts.latest_values_object()
    ).get_one_region(region)
def test_inference_ok_with_5_days_cases_changed():
    # 5 days with cases data isn't enough to make inference_ok; 6 days are
    # needed so that there are 5 days with a *delta* relative to a previous day.
    csv_string_io = io.StringIO(
        "location_id,country,state,county,aggregate_level,date,cases,deaths\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-01,100,1\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-02,200,2\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-03,300,3\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-04,400,4\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-05,500,5\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-01,100,1\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-02,200,2\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-03,300,3\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-04,400,4\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-05,500,5\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-06,600,6\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,,500,5\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,,100,1\n"
    )
    input_dataset = MultiRegionTimeseriesDataset.from_csv(csv_string_io)

    df = WhitelistGenerator().generate_whitelist(input_dataset)

    assert to_dict(["fips"], df) == {
        "97111": {"state": "ZZ", "county": "Bar County", "inference_ok": False},
        "97222": {"state": "ZZ", "county": "Foo County", "inference_ok": True},
    }
def test_update_and_load(tmp_path: pathlib.Path, nyc_fips, nyc_region):
    us_combined_df = combined_datasets.load_us_timeseries_dataset().combined_df

    # Restricting the datasets being persisted to one county to speed up tests a bit.
    nyc_combined_df = us_combined_df.loc[us_combined_df[CommonFields.FIPS] == nyc_fips, :]
    multiregion_timeseries_nyc = MultiRegionTimeseriesDataset.from_combined_dataframe(
        nyc_combined_df
    )
    latest_nyc = LatestValuesDataset(multiregion_timeseries_nyc.latest_data_with_fips.reset_index())
    latest_nyc_record = latest_nyc.get_record_for_fips(nyc_fips)
    assert latest_nyc_record[CommonFields.POPULATION] > 1_000_000
    assert latest_nyc_record[CommonFields.LOCATION_ID]

    combined_dataset_utils.update_data_public_head(
        tmp_path,
        latest_dataset=latest_nyc,
        timeseries_dataset=multiregion_timeseries_nyc,
    )

    timeseries_loaded = combined_datasets.load_us_timeseries_dataset(pointer_directory=tmp_path)
    latest_loaded = combined_datasets.load_us_latest_dataset(pointer_directory=tmp_path)
    assert latest_loaded.get_record_for_fips(nyc_fips) == latest_nyc_record
    assert_combined_like(timeseries_loaded, multiregion_timeseries_nyc)
def test_aggregate():
    ts = TimeseriesDataset.load_csv(
        io.StringIO(
            "fips,state,aggregate_level,county,m1,date,foo\n"
            "55005,ZZ,county,North County,1,2020-05-01,ab\n"
            "55005,ZZ,county,North County,2,2020-05-02,cd\n"
            "55005,ZZ,county,North County,3,2020-05-03,ef\n"
            "55006,ZZ,county,South County,3,2020-05-03,ef\n"
            "55006,ZZ,county,South County,4,2020-05-04,gh\n"
            "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
            "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
        )
    )
    ts_in = MultiRegionTimeseriesDataset.from_timeseries_and_latest(ts, ts.latest_values_object())
    agg = statistical_areas.CountyToCBSAAggregator(
        county_map={"55005": "10001", "55006": "10001"},
        cbsa_title_map={"10001": "Stat Area 1"},
    )
    ts_out = agg.aggregate(ts_in)

    assert ts_out.groupby_region().ngroups == 1

    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10001"))
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-01"): 1,
        pd.to_datetime("2020-05-02"): 2,
        pd.to_datetime("2020-05-03"): 6,
        pd.to_datetime("2020-05-04"): 4,
    }
def generate_whitelist(self, timeseries: MultiRegionTimeseriesDataset) -> pd.DataFrame:
    """
    Generate a county whitelist based on the cuts above.

    Returns
    -------
    df: whitelist
    """
    logging.info("Generating county level whitelist...")
    df_candidates = (
        timeseries.get_counties()
        .groupby_region()
        # Use pandarallel. It doesn't support the `name` attribute so leave FIPS as a regular
        # column so it can be read in the applied function.
        .parallel_apply(_whitelist_candidates_per_fips)
        .reset_index(drop=True)
    )

    df_candidates["inference_ok"] = (
        (df_candidates.nonzero_case_datapoints >= self.nonzero_case_datapoints)
        & (df_candidates.nonzero_death_datapoints >= self.nonzero_death_datapoints)
        & (df_candidates.total_cases >= self.total_cases)
        & (df_candidates.total_deaths >= self.total_deaths)
    )
    whitelist_df = df_candidates[["fips", "state", "county", "inference_ok"]]

    return whitelist_df
def _write_pipeline_output(
    pipelines: List[Union[SubStatePipeline, StatePipeline]],
    output_dir: str,
    output_interval_days: int = 4,
    write_webui_output: bool = False,
):
    infection_rate_metric_df = pd.concat((p.infer_df for p in pipelines), ignore_index=True)
    # TODO: Use constructors in MultiRegionTimeseriesDataset
    timeseries_dataset = TimeseriesDataset(infection_rate_metric_df)
    latest = timeseries_dataset.latest_values_object()
    multiregion_rt = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        timeseries_dataset, latest
    )
    output_path = pathlib.Path(output_dir) / pyseir.utils.SummaryArtifact.RT_METRIC_COMBINED.value
    multiregion_rt.to_csv(output_path)
    root.info(f"Saving Rt results to {output_path}")

    icu_df = pd.concat((p.icu_data.data for p in pipelines if p.icu_data), ignore_index=True)
    timeseries_dataset = TimeseriesDataset(icu_df)
    latest = timeseries_dataset.latest_values_object().data.set_index(CommonFields.LOCATION_ID)
    multiregion_icu = MultiRegionTimeseriesDataset(icu_df, latest)

    output_path = pathlib.Path(output_dir) / pyseir.utils.SummaryArtifact.ICU_METRIC_COMBINED.value
    multiregion_icu.to_csv(output_path)
    root.info(f"Saving ICU results to {output_path}")

    if write_webui_output:
        # Does not parallelize well because the web_ui mapper doesn't serialize efficiently.
        # TODO: Remove intermediate artifacts and parallelize artifact creation better.
        # Approximately 40% of the processing time is taken on this step.
        web_ui_mapper = WebUIDataAdaptorV1(
            output_interval_days=output_interval_days,
            output_dir=output_dir,
        )
        webui_inputs = [
            webui_data_adaptor_v1.RegionalInput.from_results(p.fitter, p.ensemble, p.infer_df)
            for p in pipelines
            if p.fitter
        ]

        with Pool(maxtasksperchild=1) as p:
            p.map(web_ui_mapper.write_region_safely, webui_inputs)
def update(summary_filename, wide_dates_filename, aggregate_to_msas: bool):
    """Updates latest and timeseries datasets to the currently checked out covid-data-public commit."""
    path_prefix = dataset_utils.DATA_DIRECTORY.relative_to(dataset_utils.REPO_ROOT)

    data_source_classes = set(
        chain(
            chain.from_iterable(ALL_FIELDS_FEATURE_DEFINITION.values()),
            chain.from_iterable(ALL_TIMESERIES_FEATURE_DEFINITION.values()),
        )
    )
    data_sources = {
        data_source_cls.SOURCE_NAME: data_source_cls.local()
        for data_source_cls in data_source_classes
    }
    timeseries_dataset: TimeseriesDataset = combined_datasets.build_from_sources(
        TimeseriesDataset, data_sources, ALL_TIMESERIES_FEATURE_DEFINITION, filter=US_STATES_FILTER
    )
    latest_dataset: LatestValuesDataset = combined_datasets.build_from_sources(
        LatestValuesDataset,
        data_sources,
        ALL_FIELDS_FEATURE_DEFINITION,
        filter=US_STATES_FILTER,
    )
    multiregion_dataset = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        timeseries_dataset, latest_dataset
    )
    multiregion_dataset = add_new_cases(multiregion_dataset)
    if aggregate_to_msas:
        aggregator = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
        multiregion_dataset = multiregion_dataset.append_regions(
            aggregator.aggregate(multiregion_dataset)
        )

    _, multiregion_pointer = combined_dataset_utils.update_data_public_head(
        path_prefix,
        latest_dataset,
        multiregion_dataset,
    )

    # Write DataSource objects that have provenance information, which is only set when significant
    # processing of the source data is done in this repo before it is combined. The output is not
    # used downstream; it is for debugging only.
    for data_source in data_sources.values():
        if data_source.provenance is not None:
            wide_dates_df.write_csv(
                data_source.timeseries().get_date_columns(),
                path_prefix / f"{data_source.SOURCE_NAME}-wide-dates.csv",
            )

    if wide_dates_filename:
        wide_dates_df.write_csv(
            timeseries_dataset.get_date_columns(),
            multiregion_pointer.path.with_name(wide_dates_filename),
        )

    if summary_filename:
        _save_field_summary(multiregion_dataset, path_prefix / summary_filename)
def aggregate(self, dataset_in: MultiRegionTimeseriesDataset) -> MultiRegionTimeseriesDataset:
    """Returns a dataset of CBSA regions, created by aggregating counties in the input data."""
    return MultiRegionTimeseriesDataset.from_combined_dataframe(
        pd.concat(
            [
                self._aggregate_fips_df(dataset_in.data_with_fips, groupby_date=True),
                # No need to reset latest_data_with_fips LOCATION_ID index because FIPS is used.
                self._aggregate_fips_df(dataset_in.latest_data_with_fips, groupby_date=False),
            ],
            ignore_index=True,
        )
    )
def test_load_from_local_public_data():
    agg = statistical_areas.CountyToCBSAAggregator.from_local_public_data()

    assert agg.cbsa_title_map["43580"] == "Sioux City, IA-NE-SD"
    assert agg.county_map["48187"] == "41700"

    ts = TimeseriesDataset.load_csv(
        io.StringIO(
            "fips,state,aggregate_level,county,m1,date,foo\n"
            "48059,ZZ,county,North County,3,2020-05-03,ef\n"
            "48253,ZZ,county,South County,4,2020-05-03,ef\n"
        )
    )
    ts_in = MultiRegionTimeseriesDataset.from_timeseries_and_latest(ts, ts.latest_values_object())
    ts_out = agg.aggregate(ts_in)

    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10180"))
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-03"): 7,
    }
def test_skip_gaps_in_cases_and_deaths_metrics():
    csv_string_io = io.StringIO(
        "location_id,country,state,county,aggregate_level,date,cases,deaths\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-01,10,1\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-02,,2\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-03,30,\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-04,40,4\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,,40,4\n"
    )
    input_dataset = MultiRegionTimeseriesDataset.from_csv(csv_string_io)

    df = WhitelistGenerator().generate_whitelist(input_dataset)

    assert to_dict(["fips"], df) == {
        "97111": {"state": "ZZ", "county": "Bar County", "inference_ok": False},
    }
def run(
    metrics_in: MultiRegionTimeseriesDataset,
    methods: Sequence[Method] = TEST_POSITIVITY_METHODS,
    diff_days: int = 7,
    recent_days: int = 14,
) -> "AllMethods":
    ts_value_cols = list(
        set(chain.from_iterable((method.numerator, method.denominator) for method in methods))
    )
    missing_columns = set(ts_value_cols) - set(metrics_in.data.columns)
    if missing_columns:
        raise AssertionError(f"Data missing for test positivity: {missing_columns}")

    input_long = metrics_in.timeseries_long(ts_value_cols).set_index(
        [PdFields.VARIABLE, CommonFields.LOCATION_ID, CommonFields.DATE]
    )[PdFields.VALUE]
    dates = input_long.index.get_level_values(CommonFields.DATE)
    start_date = dates.min()
    end_date = dates.max()
    input_date_range = pd.date_range(start=start_date, end=end_date)
    recent_date_range = pd.date_range(end=end_date, periods=recent_days).intersection(
        input_date_range
    )
    input_wide = (
        input_long.unstack(CommonFields.DATE)
        .reindex(columns=input_date_range)
        .rename_axis(columns=CommonFields.DATE)
    )

    # This calculates the difference only when the cumulative value is a real value `diff_days` apart.
    # It looks like our input data has few or no holes so this works well enough.
    diff_df = input_wide.diff(periods=diff_days, axis=1)

    all_wide = (
        pd.concat(
            {method.name: method.calculate(diff_df) for method in methods},
            names=[PdFields.VARIABLE],
        )
        .reorder_levels([CommonFields.LOCATION_ID, PdFields.VARIABLE])
        # Drop empty timeseries
        .dropna("index", "all")
        .sort_index()
    )

    method_cat_type = pd.CategoricalDtype(
        categories=[method.name for method in methods], ordered=True
    )

    has_recent_data = all_wide.loc[:, recent_date_range].notna().any(axis=1)
    all_recent_data = all_wide.loc[has_recent_data, :].reset_index()
    all_recent_data[PdFields.VARIABLE] = all_recent_data[PdFields.VARIABLE].astype(method_cat_type)
    first = all_recent_data.groupby(CommonFields.LOCATION_ID).first()
    provenance = first[PdFields.VARIABLE].astype(str).rename(PdFields.PROVENANCE)
    provenance.index = pd.MultiIndex.from_product(
        [provenance.index, [CommonFields.TEST_POSITIVITY]],
        names=[CommonFields.LOCATION_ID, PdFields.VARIABLE],
    )
    positivity = first.drop(columns=[PdFields.VARIABLE])

    test_positivity = MultiRegionTimeseriesDataset.from_timeseries_df(
        positivity.stack().rename(CommonFields.TEST_POSITIVITY).reset_index(),
        provenance=provenance,
    )

    return AllMethods(all_methods_timeseries=all_wide, test_positivity=test_positivity)