def get_timeseries_for_state(
    state: str, columns: List = None, min_range_with_some_value: bool = False
) -> TimeseriesDataset:
    """Gets timeseries for a specific state abbreviation.

    Args:
        state: 2-letter state code.
        columns: List of columns, apart from `TimeseriesDataset.INDEX_FIELDS`, to include.
        min_range_with_some_value: If True, removes NaNs that pad values at the beginning
            and end of the timeseries. Only applicable when columns are specified.

    Returns: Timeseries for state.
    """
    state_ts = load_us_timeseries_dataset().get_subset(AggregationLevel.STATE, state=state)

    if columns:
        subset = state_ts.data.loc[:, TimeseriesDataset.INDEX_FIELDS + columns].reset_index(
            drop=True
        )
        if min_range_with_some_value:
            subset = _remove_padded_nans(subset, columns)
        state_ts = TimeseriesDataset(subset)

    return state_ts
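# Usage sketch (illustrative, not from the source): "MA" and CommonFields.CASES are
# assumed example inputs, with CommonFields the field enum used elsewhere in these
# snippets. Trims the leading/trailing NaN padding from the cases series.
ma_cases_ts = get_timeseries_for_state(
    "MA", columns=[CommonFields.CASES], min_range_with_some_value=True
)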
def get_timeseries_for_fips(
    fips: str, columns: List = None, min_range_with_some_value: bool = False
) -> TimeseriesDataset:
    """Gets timeseries for a specific FIPS code.

    Args:
        fips: FIPS code. Can be a county (5 character) or state (2 character) code.
        columns: List of columns, apart from `TimeseriesDataset.INDEX_FIELDS`, to include.
        min_range_with_some_value: If True, removes NaNs that pad values at the beginning
            and end of the timeseries. Only applicable when columns are specified.

    Returns: Timeseries for fips.
    """
    fips_ts = load_us_timeseries_dataset().get_subset(None, fips=fips)

    if columns:
        subset = fips_ts.data.loc[:, TimeseriesDataset.INDEX_FIELDS + columns].reset_index(
            drop=True
        )
        if min_range_with_some_value:
            subset = _remove_padded_nans(subset, columns)
        fips_ts = TimeseriesDataset(subset)

    return fips_ts
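# Usage sketch (hypothetical FIPS value): the same call shape works for a 5-character
# county code or a 2-character state code.
county_ts = get_timeseries_for_fips(
    "36061", columns=[CommonFields.CASES], min_range_with_some_value=True
)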
def _write_pipeline_output(
    pipelines: List[Union[SubStatePipeline, StatePipeline]],
    output_dir: str,
    output_interval_days: int = 4,
    write_webui_output: bool = False,
):
    infection_rate_metric_df = pd.concat((p.infer_df for p in pipelines), ignore_index=True)
    # TODO: Use constructors in MultiRegionTimeseriesDataset
    timeseries_dataset = TimeseriesDataset(infection_rate_metric_df)
    latest = timeseries_dataset.latest_values_object()
    multiregion_rt = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        timeseries_dataset, latest
    )
    output_path = pathlib.Path(output_dir) / pyseir.utils.SummaryArtifact.RT_METRIC_COMBINED.value
    multiregion_rt.to_csv(output_path)
    root.info(f"Saving Rt results to {output_path}")

    icu_df = pd.concat((p.icu_data.data for p in pipelines if p.icu_data), ignore_index=True)
    timeseries_dataset = TimeseriesDataset(icu_df)
    latest = timeseries_dataset.latest_values_object().data.set_index(CommonFields.LOCATION_ID)
    multiregion_icu = MultiRegionTimeseriesDataset(icu_df, latest)
    output_path = pathlib.Path(output_dir) / pyseir.utils.SummaryArtifact.ICU_METRIC_COMBINED.value
    multiregion_icu.to_csv(output_path)
    root.info(f"Saving ICU results to {output_path}")

    if write_webui_output:
        # Does not parallelize well because the web UI mapper doesn't serialize efficiently.
        # TODO: Remove intermediate artifacts and parallelize artifact creation better.
        # Approximately 40% of the processing time is taken on this step.
        web_ui_mapper = WebUIDataAdaptorV1(
            output_interval_days=output_interval_days, output_dir=output_dir,
        )
        webui_inputs = [
            webui_data_adaptor_v1.RegionalInput.from_results(p.fitter, p.ensemble, p.infer_df)
            for p in pipelines
            if p.fitter
        ]

        with Pool(maxtasksperchild=1) as p:
            p.map(web_ui_mapper.write_region_safely, webui_inputs)
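# Usage sketch (hypothetical; assumes `pipelines` is a list of already-run
# StatePipeline/SubStatePipeline results built elsewhere in pyseir):
_write_pipeline_output(pipelines, output_dir="output/pyseir", write_webui_output=False)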
def load_combined_timeseries(
    sources: Dict[str, TimeseriesDataset], timeseries: TimeseriesDataset
) -> TimeseriesDataset:
    """Concatenates `timeseries`, tagged with source "Combined Data", with each dataset in `sources`."""
    timeseries_data = timeseries.data.copy()
    timeseries_data["source"] = "Combined Data"

    combined_timeseries = TimeseriesDataset(
        pd.concat([timeseries_data] + [source.data for source in sources.values()])
    )
    return combined_timeseries
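# Usage sketch (hypothetical names; assumes `base_ts`, `cdc_ts`, and `other_ts` are
# TimeseriesDataset instances loaded elsewhere):
combined = load_combined_timeseries(
    sources={"cdc": cdc_ts, "other": other_ts}, timeseries=base_ts
)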
def get_hospitalization_data():
    data = combined_datasets.build_us_timeseries_with_all_fields().data
    # Since we're using this data for hospitalizations only, return only rows that
    # have hospitalization data. As the use cases of this data source expand, we may
    # not want to drop rows. For context, as of 4/8, 607/1821 rows contained
    # hospitalization data.
    has_current_hospital = data[TimeseriesDataset.Fields.CURRENT_HOSPITALIZED].notnull()
    has_cumulative_hospital = data[TimeseriesDataset.Fields.CUMULATIVE_HOSPITALIZED].notnull()
    return TimeseriesDataset(data[has_current_hospital | has_cumulative_hospital])
def test_make_latest_from_timeseries_simple():
    data = read_csv_and_index_fips_date(
        "fips,county,state,country,date,aggregate_level,m1,m2\n"
        "97123,Smith County,ZZ,USA,2020-04-01,county,1,\n"
        "97123,Smith County,ZZ,USA,2020-04-02,county,,2\n"
    ).reset_index()
    ts = TimeseriesDataset(data)
    assert to_dict(["fips"], ts.latest_values()[["fips", "m1", "m2"]]) == {
        "97123": {"m1": 1, "m2": 2}
    }
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    region = Region.from_fips(nyc_fips)
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = TimeseriesDataset(dataset.get_one_region(region).data)

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)

    downloaded_dataset = pointer.load_dataset()
    differ_l = DatasetDiff.make(downloaded_dataset.data)
    differ_r = DatasetDiff.make(timeseries_nyc.data)
    differ_l.compare(differ_r)

    assert not len(differ_l.my_ts)
def test_get_subset():
    # CSV with a unique FIPS value for every region, even countries. In production
    # countries are removed before TimeseriesDataset is created. A future change may
    # replace FIPS with a more general identifier.
    input_df = pd.read_csv(
        StringIO(
            "city,county,state,fips,country,aggregate_level,date,metric\n"
            "Smithville,,ZZ,97123,USA,city,2020-03-23,smithville-march23\n"
            "New York City,,ZZ,97324,USA,city,2020-03-22,march22-nyc\n"
            "New York City,,ZZ,97324,USA,city,2020-03-24,march24-nyc\n"
            ",North County,ZZ,97001,USA,county,2020-03-23,county-metric\n"
            ",,ZZ,97,USA,state,2020-03-23,mystate\n"
            ",,XY,96,USA,state,2020-03-23,other-state\n"
            ",,,iso2:uk,UK,country,2020-03-23,you-kee\n"
            ",,,iso2:us,US,country,2020-03-23,you-ess-hey\n"
        )
    )
    ts = TimeseriesDataset(input_df)

    assert set(ts.get_subset(AggregationLevel.COUNTRY).data["metric"]) == {
        "you-kee",
        "you-ess-hey",
    }
    assert set(ts.get_subset(AggregationLevel.COUNTRY, country="UK").data["country"]) == {"UK"}
    assert set(ts.get_subset(AggregationLevel.STATE).data["metric"]) == {
        "mystate",
        "other-state",
    }
    assert set(ts.get_subset(state="ZZ", after="2020-03-23").data["metric"]) == {"march24-nyc"}
    assert set(ts.get_subset(state="ZZ", after="2020-03-22").data["metric"]) == {
        "smithville-march23",
        "county-metric",
        "mystate",
        "march24-nyc",
    }
    assert set(ts.get_subset(AggregationLevel.STATE, states=["ZZ", "XY"]).data["metric"]) == {
        "mystate",
        "other-state",
    }
    assert set(ts.get_subset(states=["ZZ"], on="2020-03-23").data["metric"]) == {
        "smithville-march23",
        "county-metric",
        "mystate",
    }
    assert set(ts.get_subset(states=["ZZ"], before="2020-03-23").data["metric"]) == {
        "march22-nyc"
    }
def get_hospitalization_data():
    """Returns the subset of rows that have hospitalization data.

    Since we're using this data for hospitalizations only, only rows with
    hospitalization data are returned. As the use cases of this data source expand,
    we may not want to drop rows. For context, as of 4/8, 607/1821 rows contained
    hospitalization data.

    Returns
    -------
    TimeseriesDataset
    """
    data = combined_datasets.load_us_timeseries_dataset().data
    has_current_hospital = data[CommonFields.CURRENT_HOSPITALIZED].notnull()
    has_cumulative_hospital = data[CommonFields.CUMULATIVE_HOSPITALIZED].notnull()
    return TimeseriesDataset(data[has_current_hospital | has_cumulative_hospital])
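# Minimal self-contained sketch of the notnull-OR row filter used above, on a toy
# frame (pandas only; column names are illustrative):
import pandas as pd

toy = pd.DataFrame(
    {"current_hospitalized": [3.0, None], "cumulative_hospitalized": [None, None]}
)
mask = toy["current_hospitalized"].notnull() | toy["cumulative_hospitalized"].notnull()
print(toy[mask])  # keeps only the first row, which has some hospitalization data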
def test_get_subset_and_get_data():
    input_df = pd.read_csv(
        StringIO(
            "city,county,state,fips,country,aggregate_level,date,metric\n"
            "Smithville,,ZZ,97123,USA,city,2020-03-23,smithville-march23\n"
            "New York City,,ZZ,97324,USA,city,2020-03-22,march22-nyc\n"
            "New York City,,ZZ,97324,USA,city,2020-03-24,march24-nyc\n"
            ",North County,ZZ,97001,USA,county,2020-03-23,county-metric\n"
            ",,ZZ,97001,USA,state,2020-03-23,mystate\n"
            ",,XY,96001,USA,state,2020-03-23,other-state\n"
            ",,,,UK,country,2020-03-23,you-kee\n"
            ",,,,US,country,2020-03-23,you-ess-hey\n"
        )
    )
    ts = TimeseriesDataset(input_df)

    assert set(ts.get_subset(AggregationLevel.COUNTRY).data["metric"]) == {
        "you-kee",
        "you-ess-hey",
    }
    assert set(ts.get_subset(AggregationLevel.COUNTRY, country="UK").data["country"]) == {"UK"}
    assert set(ts.get_subset(AggregationLevel.STATE).data["metric"]) == {
        "mystate",
        "other-state",
    }
    assert set(ts.get_data(None, state="ZZ", after="2020-03-23")["metric"]) == {"march24-nyc"}
    assert set(ts.get_data(None, state="ZZ", after="2020-03-22")["metric"]) == {
        "smithville-march23",
        "county-metric",
        "mystate",
        "march24-nyc",
    }
    assert set(ts.get_data(AggregationLevel.STATE, states=["ZZ", "XY"])["metric"]) == {
        "mystate",
        "other-state",
    }
    assert set(ts.get_data(None, states=["ZZ"], on="2020-03-23")["metric"]) == {
        "smithville-march23",
        "county-metric",
        "mystate",
    }
    assert set(ts.get_data(None, states=["ZZ"], before="2020-03-23")["metric"]) == {
        "march22-nyc"
    }
def test_write_csv():
    df = pd.DataFrame(
        {
            CommonFields.DATE: pd.to_datetime(["2020-04-01", "2020-04-02"]),
            CommonFields.FIPS: ["06045", "45123"],
            CommonFields.CASES: [234, 456],
        }
    )
    ts = TimeseriesDataset(df)

    expected_csv = """,,summary,summary,summary,summary,summary,summary,summary,summary,summary,value,value
date,,has_value,min_date,max_date,max_value,min_value,latest_value,num_observations,largest_delta,largest_delta_date,2020-04-01 00:00:00,2020-04-02 00:00:00
fips,variable,,,,,,,,,,,
06045,cases,True,2020-04-01,2020-04-01,234,234,234,1,,,234,
45123,cases,True,2020-04-02,2020-04-02,456,456,456,1,,,,456
"""
    # Call wide_dates_df.write_csv; ["fips", "date"] is the expected normal index.
    with temppathlib.NamedTemporaryFile("w+") as tmp:
        wide_dates_df.write_csv(ts.get_date_columns(), tmp.path)
        assert expected_csv == tmp.file.read()
def test_wide_dates():
    input_df = read_csv_and_index_fips_date(
        "fips,county,aggregate_level,date,m1,m2\n"
        "97111,Bar County,county,2020-04-01,1,\n"
        "97111,Bar County,county,2020-04-02,2,\n"
        "97222,Foo County,county,2020-04-01,,10\n"
        "97222,Foo County,county,2020-04-03,3,30\n"
    )
    provenance = provenance_wide_metrics_to_series(
        read_csv_and_index_fips_date(
            "fips,date,m1,m2\n"
            "97111,2020-04-01,src11,\n"
            "97111,2020-04-02,src11,\n"
            "97222,2020-04-01,,src22\n"
            "97222,2020-04-03,src21,src22\n"
        ),
        structlog.get_logger(),
    )
    ts = TimeseriesDataset(input_df.reset_index(), provenance=provenance)

    date_columns = ts.get_date_columns()
    assert to_dict(["fips", "variable"], date_columns["value"]) == {
        ("97111", "m1"): {
            pd.to_datetime("2020-04-01"): 1.0,
            pd.to_datetime("2020-04-02"): 2.0,
        },
        ("97222", "m1"): {pd.to_datetime("2020-04-03"): 3.0},
        ("97222", "m2"): {
            pd.to_datetime("2020-04-01"): 10.0,
            pd.to_datetime("2020-04-03"): 30.0,
        },
    }
    assert to_dict(["fips", "variable"], date_columns["provenance"]) == {
        ("97111", "m1"): {"value": "src11"},
        ("97222", "m1"): {"value": "src21"},
        ("97222", "m2"): {"value": "src22"},
    }
def test_make_latest_from_timeseries_dont_touch_county():
    data = read_csv_and_index_fips_date(
        "fips,county,state,country,date,aggregate_level,m1,m2\n"
        "95123,Smith Countyy,YY,USA,2020-04-01,county,1,\n"
        "97123,Smith Countzz,ZZ,USA,2020-04-01,county,2,\n"
        "97,,ZZ,USA,2020-04-01,state,3,\n"
    ).reset_index()
    ts = TimeseriesDataset(data)
    assert to_dict(["fips"], ts.latest_values()[["fips", "county", "m1", "m2"]]) == {
        "95123": {"m1": 1, "county": "Smith Countyy"},
        "97123": {"m1": 2, "county": "Smith Countzz"},
        "97": {"m1": 3},
    }