def test_dataclass_include_exclude():
    """Tests datasource_regions include/exclude filtering against the real USAFacts provider.

    NOTE(review): a second test with the same name (using mock data) appears later in this
    source; if both live in one module the later definition shadows this one under pytest.
    """
    orig_data_source_cls = CANScraperUSAFactsProvider
    orig_ds = orig_data_source_cls.make_dataset()
    # Sanity-check that the unfiltered dataset contains both TX and NY state rows.
    assert "iso1:us#iso2:us-tx" in orig_ds.static.index
    assert "iso1:us#iso2:us-ny" in orig_ds.static.index
    # Restrict to NY only; class-level attributes must be preserved by the wrapper.
    ny_source = combined_datasets.datasource_regions(
        orig_data_source_cls, RegionMask(states=["NY"])
    )
    assert ny_source.SOURCE_NAME == orig_data_source_cls.SOURCE_NAME
    assert ny_source.EXPECTED_FIELDS == orig_data_source_cls.EXPECTED_FIELDS
    ny_ds = ny_source.make_dataset()
    assert "iso1:us#iso2:us-tx" not in ny_ds.static.index
    assert "iso1:us#iso2:us-ny" in ny_ds.static.index
    # CA counties with Los Angeles County (FIPS 06037) explicitly excluded.
    ca_counties_without_la_source = combined_datasets.datasource_regions(
        orig_data_source_cls,
        RegionMask(AggregationLevel.COUNTY, states=["CA"]),
        exclude=Region.from_fips("06037"),
    )
    ds = ca_counties_without_la_source.make_dataset()
    assert "iso1:us#iso2:us-tx" not in ds.static.index
    # The state-level CA row is dropped because only COUNTY aggregation is included.
    assert "iso1:us#iso2:us-ca" not in ds.static.index
    assert "iso1:us#iso2:us-ca#fips:06045" in ds.static.index
    assert "iso1:us#iso2:us-ca#fips:06037" not in ds.static.index
    # Just Cook County, IL
    ds = combined_datasets.datasource_regions(
        orig_data_source_cls, include=Region.from_fips("17031")
    ).make_dataset()
    assert ds.static.index.to_list() == ["iso1:us#iso2:us-il#fips:17031"]
def test_multi_region_get_one_region():
    """Tests get_one_region: timeseries rows (with a date) vs latest rows (empty date)."""
    ts = timeseries.MultiRegionTimeseriesDataset.from_csv(
        io.StringIO(
            "location_id,county,aggregate_level,date,m1,m2\n"
            "iso1:us#fips:97111,Bar County,county,2020-04-02,2,\n"
            "iso1:us#fips:97222,Foo County,county,2020-04-01,,10\n"
            # Rows with an empty date hold the "latest" values for the region.
            "iso1:us#fips:97111,Bar County,county,,3,\n"
            "iso1:us#fips:97222,Foo County,county,,,11\n"
        )
    )
    region_97111_ts = ts.get_one_region(Region.from_fips("97111"))
    # Only the dated row appears in the timeseries data; NaN columns are dropped by to_dict.
    assert to_dict(["date"], region_97111_ts.data[["date", "m1", "m2"]]) == {
        pd.to_datetime("2020-04-02"): {"m1": 2}
    }
    assert region_97111_ts.latest["m1"] == 3
    region_97222_ts = ts.get_one_region(Region.from_fips("97222"))
    assert to_dict(["date"], region_97222_ts.data) == {
        pd.to_datetime("2020-04-01"): {
            "m2": 10,
            "county": "Foo County",
            "fips": "97222",
            "location_id": "iso1:us#fips:97222",
            "aggregate_level": "county",
        }
    }
    assert region_97222_ts.latest["m2"] == 11
def test_multi_region_to_from_timeseries_and_latest_values(tmp_path: pathlib.Path):
    """Round-trips a dataset built from timeseries + latest values through a CSV file."""
    ts = timeseries.TimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,aggregate_level,date,m1,m2\n"
            "97111,Bar County,county,2020-04-02,2,\n"
            "97222,Foo County,county,2020-04-01,,10\n"
            "01,,state,2020-04-01,,20\n"
        ).reset_index()
    )
    latest_values = timeseries.LatestValuesDataset(
        read_csv_and_index_fips(
            "fips,county,aggregate_level,c1,c2\n"
            "97111,Bar County,county,3,\n"
            "97222,Foo County,county,4,10.5\n"
            "01,,state,,123.4\n"
        ).reset_index()
    )
    multiregion = timeseries.MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, latest_values
    )
    # Values from both inputs are accessible on the combined dataset.
    region_97111 = multiregion.get_one_region(Region.from_fips("97111"))
    assert region_97111.date_indexed.at["2020-04-02", "m1"] == 2
    assert region_97111.latest["c1"] == 3
    assert multiregion.get_one_region(Region.from_fips("01")).latest["c2"] == 123.4
    # Write to CSV and read back; the same values must survive the round trip.
    csv_path = tmp_path / "multiregion.csv"
    multiregion.to_csv(csv_path)
    multiregion_loaded = timeseries.MultiRegionTimeseriesDataset.from_csv(csv_path)
    region_97111 = multiregion_loaded.get_one_region(Region.from_fips("97111"))
    assert region_97111.date_indexed.at["2020-04-02", "m1"] == 2
    assert region_97111.latest["c1"] == 3
    assert multiregion_loaded.get_one_region(Region.from_fips("01")).latest["c2"] == 123.4
def test_regions_in_states_basic():
    """regions_in_states returns the whitelisted county regions for the given states."""
    whitelist_df = read_csv_and_index_fips(
        "fips,state,county,inference_ok\n"
        "45111,TX,Bar County,True\n"
        "06222,CA,Foo County,True\n"
    ).reset_index()
    regions = regions_in_states(
        [pipeline.Region.from_state(s) for s in ["CA", "TX"]], whitelist_df
    )
    # Order is not checked, only membership.
    assert set(regions) == {Region.from_fips("45111"), Region.from_fips("06222")}
def test_top_level_metrics_with_rt():
    """Builds case/test timeseries, Rt data and latest values for NY state (FIPS 36).

    NOTE(review): this fragment only constructs fixtures and makes no assertions — it
    appears truncated; confirm the remainder of the test body exists in the full file.
    """
    region = Region.from_fips("36")
    # Case/test counts with a gap on 2020-08-19 (empty cases/tests that day).
    data = (
        "date,fips,cases,positive_tests,negative_tests,contact_tracers_count"
        ",current_icu,current_icu_total,icu_beds\n"
        "2020-08-17,36,10,10,90,1,,,\n"
        "2020-08-18,36,20,20,180,2,,,\n"
        "2020-08-19,36,,,,3,,,\n"
        "2020-08-20,36,40,40,360,4,,,\n"
    )
    one_region = _fips_csv_to_one_region(data, region)
    # Separate Rt estimate timeseries for the same region.
    data = (
        "date,fips,Rt_MAP_composite,Rt_ci95_composite\n"
        "2020-08-17,36,1.1,1.2\n"
        "2020-08-18,36,1.2,1.3\n"
        "2020-08-19,36,1.1,1.3\n"
        "2020-08-20,36,1.1,1.2\n"
    )
    rt_data = _fips_csv_to_one_region(data, region)
    latest = {
        CommonFields.POPULATION: 100_000,
        CommonFields.FIPS: "36",
        CommonFields.STATE: "NY",
        CommonFields.ICU_TYPICAL_OCCUPANCY_RATE: 0.5,
        CommonFields.ICU_BEDS: 25,
    }
def test_combined_county_has_some_data(fips):
    """Smoke test: the combined dataset has testing and death data for the given county FIPS."""
    region_data = combined_datasets.load_us_timeseries_dataset().get_one_region(
        Region.from_fips(fips)
    )
    # .all() requires every value in the series to be truthy (non-zero, non-NaN-free mask).
    assert region_data.data[CommonFields.POSITIVE_TESTS].all()
    assert region_data.data[CommonFields.NEGATIVE_TESTS].all()
    assert region_data.latest[CommonFields.DEATHS] > 1
def test_calculate_icu_capacity():
    """Sets up latest values for an ICU-capacity calculation on NY state (FIPS 36).

    NOTE(review): this fragment only builds fixtures and makes no assertions — it appears
    truncated; confirm the remainder of the test body exists in the full file.
    """
    region = Region.from_fips("36")
    latest = {
        CommonFields.POPULATION: 100_000,
        CommonFields.FIPS: "36",
        CommonFields.STATE: "NY",
    }
def test_dataclass_include_exclude():
    """Tests datasource_regions using mock data for speed.

    NOTE(review): an earlier test in this source shares this name; if both live in one
    module the earlier definition is shadowed by this one under pytest collection.
    """
    region_data = {CommonFields.CASES: [100, 200, 300], CommonFields.DEATHS: [0, 1, 2]}
    # Five states plus two CA counties, one IL county and one extra county.
    regions_orig = [Region.from_state(state) for state in "AZ CA NY IL TX".split()] + [
        Region.from_fips(fips) for fips in "06037 06045 17031 17201".split()
    ]
    dataset_orig = test_helpers.build_dataset({region: region_data for region in regions_orig})

    # Make a new subclass to keep this test separate from others in the make_dataset lru_cache.
    class DataSourceForTest(data_source.DataSource):
        EXPECTED_FIELDS = [CommonFields.CASES, CommonFields.DEATHS]
        SOURCE_TYPE = "DataSourceForTest"

        @classmethod
        def make_dataset(cls) -> timeseries.MultiRegionDataset:
            return dataset_orig

    orig_data_source_cls = DataSourceForTest
    orig_ds = orig_data_source_cls.make_dataset()
    assert "iso1:us#iso2:us-tx" in orig_ds.location_ids
    assert "iso1:us#iso2:us-ny" in orig_ds.location_ids
    # Restrict to NY only.
    ny_source = combined_datasets.datasource_regions(
        orig_data_source_cls, RegionMask(states=["NY"])
    )
    ny_ds = ny_source.make_dataset()
    assert "iso1:us#iso2:us-tx" not in ny_ds.location_ids
    assert "iso1:us#iso2:us-ny" in ny_ds.location_ids
    # CA counties with Los Angeles County (FIPS 06037) explicitly excluded.
    ca_counties_without_la_source = combined_datasets.datasource_regions(
        orig_data_source_cls,
        RegionMask(AggregationLevel.COUNTY, states=["CA"]),
        exclude=Region.from_fips("06037"),
    )
    ds = ca_counties_without_la_source.make_dataset()
    assert "iso1:us#iso2:us-tx" not in ds.location_ids
    # The state-level CA row is dropped because only COUNTY aggregation is included.
    assert "iso1:us#iso2:us-ca" not in ds.location_ids
    assert "iso1:us#iso2:us-ca#fips:06045" in ds.location_ids
    assert "iso1:us#iso2:us-ca#fips:06037" not in ds.location_ids
    # Just Cook County, IL
    ds = combined_datasets.datasource_regions(
        orig_data_source_cls, include=Region.from_fips("17031")
    ).make_dataset()
    assert ds.location_ids.to_list() == ["iso1:us#iso2:us-il#fips:17031"]
def test_top_level_metrics_with_rt():
    """Sets up latest values for a top-level-metrics/Rt test on NY state (FIPS 36).

    NOTE(review): this fragment only builds fixtures and makes no assertions — it appears
    truncated, and an earlier fragment in this source shares the same test name.
    """
    region = Region.from_fips("36")
    latest = {
        CommonFields.POPULATION: 100_000,
        CommonFields.FIPS: "36",
        CommonFields.STATE: "NY",
        CommonFields.ICU_TYPICAL_OCCUPANCY_RATE: 0.5,
        CommonFields.ICU_BEDS: 25,
    }
def test_load_hospitalization_data():
    """Hospitalization data for NH (FIPS 33) loads and is classified as cumulative."""
    t0 = datetime(year=2020, month=1, day=1)
    region = Region.from_fips("33")
    hospitalization_df = load_data.get_hospitalization_data_for_region(region)
    _, _, hosp_type = load_data.calculate_hospitalization_data(
        hospitalization_df, t0, category=HospitalizationCategory.ICU
    )
    # Double check that data loads and it went through the cumulative hosps path.
    assert hosp_type is HospitalizationDataType.CUMULATIVE_HOSPITALIZATIONS
def test_make_latest_from_timeseries_simple():
    """latest picks the most recent non-null value per column across dates."""
    data = read_csv_and_index_fips_date(
        "fips,county,state,country,date,aggregate_level,m1,m2\n"
        "97123,Smith County,ZZ,USA,2020-04-01,county,1,\n"
        "97123,Smith County,ZZ,USA,2020-04-02,county,,2\n"
    ).reset_index()
    ds = timeseries.MultiRegionDataset.from_fips_timeseries_df(data)
    region = ds.get_one_region(Region.from_fips("97123"))
    # Compare 2 values in region.latest
    # m1 comes from 2020-04-01 (the 04-02 value is empty); m2 comes from 2020-04-02.
    expected = {"m1": 1, "m2": 2}
    actual = {key: region.latest[key] for key in expected.keys()}
    assert actual == expected
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    """Persisting a TimeseriesDataset and loading it back yields identical timeseries.

    NOTE(review): a later fragment in this source defines a test with the same name using
    MultiRegionDataset; if both are in one module the later one shadows this.
    """
    region = Region.from_fips(nyc_fips)
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = TimeseriesDataset(dataset.get_one_region(region).data)
    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)
    downloaded_dataset = pointer.load_dataset()
    differ_l = DatasetDiff.make(downloaded_dataset.data)
    differ_r = DatasetDiff.make(timeseries_nyc.data)
    differ_l.compare(differ_r)
    # No timeseries rows should differ between the persisted and original datasets.
    assert not len(differ_l.my_ts)
def update_test_combined_data(truncate_dates: bool, state: List[str]): us_dataset = combined_datasets.load_us_timeseries_dataset() # Keep only a small subset of the regions so we have enough to exercise our code in tests. test_subset = us_dataset.get_regions_subset([ RegionMask(states=[s.strip() for s in state]), Region.from_fips("48201"), Region.from_fips("48301"), Region.from_fips("20161"), Region.from_state("TX"), Region.from_state("KS"), ]) if truncate_dates: dates = test_subset.timeseries_bucketed.index.get_level_values( CommonFields.DATE) date_range_mask = (dates >= "2021-01-01") & (dates < "2021-04-01") test_subset = dataclasses.replace( test_subset, timeseries_bucketed=test_subset.timeseries_bucketed. loc[date_range_mask]) test_subset.write_to_wide_dates_csv( dataset_utils.TEST_COMBINED_WIDE_DATES_CSV_PATH, dataset_utils.TEST_COMBINED_STATIC_CSV_PATH)
def test_combined_county_has_some_timeseries_data(fips):
    """The combined dataset has case/death (and, for CA, testing/ICU) data on 2020-05-01.

    NOTE(review): a later fragment in this source defines a test with the same name that
    checks 2020-09-04 instead; if both are in one module the later one shadows this.
    """
    region = Region.from_fips(fips)
    latest = combined_datasets.load_us_timeseries_dataset().get_one_region(region)
    df = latest.data.set_index(CommonFields.DATE)
    assert df.loc["2020-05-01", CommonFields.CASES] > 0
    assert df.loc["2020-05-01", CommonFields.DEATHS] > 0
    if fips.startswith(
        "06"
    ):  # TODO(tom): Remove this condition when we have county data in TX too.
        assert df.loc["2020-05-01", CommonFields.POSITIVE_TESTS] > 0
        assert df.loc["2020-05-01", CommonFields.NEGATIVE_TESTS] > 0
        assert df.loc["2020-05-01", CommonFields.CURRENT_ICU] > 0
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    """Persisting a MultiRegionDataset and reading it back yields identical timeseries."""
    region = Region.from_fips(nyc_fips)
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = dataset.get_regions_subset([region])
    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)
    downloaded_dataset = MultiRegionDataset.read_from_pointer(pointer)
    differ_l = DatasetDiff.make(downloaded_dataset.timeseries)
    differ_r = DatasetDiff.make(timeseries_nyc.timeseries)
    differ_l.compare(differ_r)
    # No timeseries rows should differ between the persisted and original datasets.
    assert not len(differ_l.my_ts)
def test_make_latest_from_timeseries_dont_touch_county():
    """latest keeps each region's own county name; state rows have no county at all."""
    data = read_csv_and_index_fips_date(
        "fips,county,state,country,date,aggregate_level,m1,m2\n"
        "95123,Smith Countyy,YY,USA,2020-04-01,county,1,\n"
        "97123,Smith Countzz,ZZ,USA,2020-04-01,county,2,\n"
        "56,,WY,USA,2020-04-01,state,3,\n"
    ).reset_index()
    ds = timeseries.MultiRegionDataset.from_fips_timeseries_df(data)

    def get_latest(region) -> dict:
        """Returns an interesting subset of latest for given region"""
        latest = ds.get_one_region(region).latest
        # m2 is always empty in the input so it drops out of the result.
        return {key: latest[key] for key in ["county", "m1", "m2"] if latest.get(key) is not None}

    assert get_latest(Region.from_fips("95123")) == {
        "m1": 1,
        "county": "Smith Countyy",
    }
    assert get_latest(Region.from_fips("97123")) == {
        "m1": 2,
        "county": "Smith Countzz",
    }
    assert get_latest(Region.from_state("WY")) == {"m1": 3}
def test_top_level_metrics_no_test_positivity():
    """Builds input data with cases but no positive/negative test counts.

    NOTE(review): this fragment only builds fixtures and makes no assertions — it appears
    truncated; confirm the remainder of the test body exists in the full file.
    """
    # positive_tests and negative_tests columns are intentionally empty.
    data = (
        "date,fips,cases,positive_tests,negative_tests,contact_tracers_count,current_icu,icu_beds\n"
        "2020-08-17,36,10,,,1,,\n"
        "2020-08-18,36,20,,,2,,\n"
        "2020-08-19,36,30,,,3,,\n"
        "2020-08-20,36,40,,,4,,\n"
    )
    one_region = _fips_csv_to_one_region(data, Region.from_fips("36"))
    latest = {
        CommonFields.POPULATION: 100_000,
        CommonFields.FIPS: "36",
        CommonFields.STATE: "NY",
        CommonFields.ICU_BEDS: 10,
    }
def test_pyseir_end_to_end_idaho(tmp_path):
    """End-to-end pyseir run for one Idaho county, checking the written artifacts.

    NOTE(review): a later fragment in this source defines a test with the same name using
    OneRegionPipeline; if both are in one module the later one shadows this.
    """
    # This covers a lot of edge cases.
    with unittest.mock.patch("pyseir.utils.OUTPUT_DIR", str(tmp_path)):
        fips = "16001"
        region = Region.from_fips(fips)
        pipelines = cli._build_all_for_states(states=["ID"], fips=fips)
        cli._write_pipeline_output(pipelines, tmp_path)
        # Both combined output artifacts must contain data for the region.
        icu_data_path = tmp_path / SummaryArtifact.ICU_METRIC_COMBINED.value
        icu_data = MultiRegionTimeseriesDataset.from_csv(icu_data_path)
        assert icu_data.get_one_region(region)
        rt_data_path = tmp_path / SummaryArtifact.RT_METRIC_COMBINED.value
        rt_data = MultiRegionTimeseriesDataset.from_csv(rt_data_path)
        assert rt_data.get_one_region(region)
def test_pyseir_end_to_end_idaho(tmp_path):
    """End-to-end pyseir run for one Idaho county via OneRegionPipeline."""
    # This covers a lot of edge cases.
    with unittest.mock.patch("pyseir.utils.OUTPUT_DIR", str(tmp_path)):
        fips = "16001"
        region = Region.from_fips(fips)
        # prepare data
        one_region_input = combined_datasets.load_us_timeseries_dataset().get_one_region(region)
        region_pipelines = [OneRegionPipeline.run(one_region_input)]
        region_pipelines = _patch_nola_infection_rate_in_pipelines(region_pipelines)
        model_output = pyseir.run.PyseirOutputDatasets.from_pipeline_output(region_pipelines)
        # Both output datasets must contain data for the region.
        assert model_output.icu.get_one_region(region)
        assert model_output.infection_rate.get_one_region(region)
def test_top_level_metrics_basic():
    """Builds a full set of input columns for the top-level metrics calculation.

    NOTE(review): this fragment only builds fixtures and makes no assertions — it appears
    truncated; confirm the remainder of the test body exists in the full file.
    """
    # 2020-08-19 has empty cases/tests to exercise gap handling.
    data = (
        "date,fips,cases,positive_tests,negative_tests,contact_tracers_count"
        ",current_icu,current_icu_total,icu_beds\n"
        "2020-08-17,36,10,10,90,1,10,20,\n"
        "2020-08-18,36,20,20,180,2,10,20,\n"
        "2020-08-19,36,,,,3,10,20,\n"
        "2020-08-20,36,40,40,360,4,10,20,\n"
    )
    one_region = _fips_csv_to_one_region(data, Region.from_fips("36"))
    latest = {
        CommonFields.POPULATION: 100_000,
        CommonFields.FIPS: "36",
        CommonFields.STATE: "NY",
        CommonFields.ICU_TYPICAL_OCCUPANCY_RATE: 0.5,
        CommonFields.ICU_BEDS: 30,
    }
def test_combined_county_has_some_timeseries_data(fips):
    """The combined dataset has cases, deaths, testing and ICU data on 2020-09-04."""
    region = Region.from_fips(fips)
    latest = combined_datasets.load_us_timeseries_dataset().get_one_region(region)
    date = "2020-09-04"  # Arbitrary date when both regions have data
    df = latest.data.set_index(CommonFields.DATE)
    one_date = df.loc[date]
    assert one_date[CommonFields.CASES] > 0
    assert one_date[CommonFields.DEATHS] > 0
    # Check that there is some testing data, either positive and negative tests or a test
    # positivity ratio.
    assert (
        one_date[CommonFields.POSITIVE_TESTS] > 0 and one_date[CommonFields.NEGATIVE_TESTS] > 0
    ) or (
        one_date[CommonFields.TEST_POSITIVITY_7D] > 0
        or one_date[CommonFields.TEST_POSITIVITY_14D] > 0
    )
    assert one_date[CommonFields.CURRENT_ICU] > 0
def _transform_one_override(
    override: Mapping, cbsa_to_counties_map: Mapping[Region, List[Region]]
) -> Filter:
    """Build a Filter from one raw override mapping.

    Args:
        override: mapping with keys "region", "include", "metric", "context", "blocked"
            and optionally "disclaimer".
        cbsa_to_counties_map: maps a CBSA Region to the county Regions it contains.

    Raises:
        ValueError: if "region" or "include" has an unrecognized value, or the
            include mode is not valid for the region's aggregation level.
    """
    region_str = override["region"]
    # Two uppercase letters -> state abbreviation; five digits -> FIPS code.
    if re.fullmatch(r"[A-Z][A-Z]", region_str):
        region = Region.from_state(region_str)
    elif re.fullmatch(r"\d{5}", region_str):
        region = Region.from_fips(region_str)
    else:
        raise ValueError(f"Invalid region: {region_str}")
    include_str = override["include"]
    if include_str == "region":
        regions_included = [region]
    elif include_str == "region-and-subregions":
        if region.is_state():
            # A state mask covers the state itself and everything within it.
            regions_included = [RegionMask(states=[region.state])]
        elif region.level == AggregationLevel.CBSA:
            regions_included = [region] + cbsa_to_counties_map[region]
        else:
            raise ValueError("region-and-subregions only valid for a state and CBSA")
    elif include_str == "subregions":
        if not region.is_state():
            raise ValueError("subregions only valid for a state")
        regions_included = [RegionMask(AggregationLevel.COUNTY, states=[region.state])]
    else:
        raise ValueError(f"Invalid include: {include_str}")
    return Filter(
        regions_included=regions_included,
        fields_included=_METRIC_TO_FIELDS[override["metric"]],
        internal_note=override["context"],
        public_note=override.get("disclaimer", ""),
        drop_observations=bool(override["blocked"]),
    )
def test_basic():
    """drop_all_zero_timeseries removes all-zero series (treating NaN-padded zeros as zeros)
    and logs which (location, field) pairs were dropped, while preserving tags."""
    region_tx = Region.from_state("TX")
    region_sf = Region.from_fips("06075")
    region_hi = Region.from_state("HI")
    # Add a timeseries with a tag to make sure they are preserved.
    ts_with_tag = TimeseriesLiteral(
        [0, 0, 0], annotation=[test_helpers.make_tag(date="2020-04-01")]
    )
    ds_in = test_helpers.build_dataset(
        {
            region_tx: {CommonFields.VACCINES_DISTRIBUTED: [0, 0, 0]},
            region_sf: {CommonFields.VACCINES_DISTRIBUTED: [0, 0, 1]},
            region_hi: {
                CommonFields.VACCINES_DISTRIBUTED: [0, 0, None],
                CommonFields.CASES: ts_with_tag,
            },
        }
    )
    with structlog.testing.capture_logs() as logs:
        ds_out = zeros_filter.drop_all_zero_timeseries(ds_in, [CommonFields.VACCINES_DISTRIBUTED])
    # SF keeps its series (has a non-zero value); HI keeps only the untouched CASES series.
    ds_expected = test_helpers.build_dataset(
        {
            region_sf: {CommonFields.VACCINES_DISTRIBUTED: [0, 0, 1]},
            region_hi: {CommonFields.CASES: ts_with_tag},
        }
    )
    log = more_itertools.one(logs)
    assert log["event"] == zeros_filter.DROPPING_TIMESERIES_WITH_ONLY_ZEROS
    assert pd.MultiIndex.from_tuples(
        [
            (region_hi.location_id, CommonFields.VACCINES_DISTRIBUTED),
            (region_tx.location_id, CommonFields.VACCINES_DISTRIBUTED),
        ]
    ).equals(log["dropped"])
    test_helpers.assert_dataset_like(ds_expected, ds_out)
def test_load_hospitalization_data_not_found():
    """An unknown FIPS ("98") yields an empty hospitalization DataFrame, not an error."""
    region = Region.from_fips("98")
    hospitalization_df = load_data.get_hospitalization_data_for_region(region)
    assert hospitalization_df.empty
def test_get_county_name():
    """get_county_name resolves county FIPS codes to county names."""
    assert combined_datasets.get_county_name(Region.from_fips("06059")) == "Orange County"
    assert combined_datasets.get_county_name(Region.from_fips("48201")) == "Harris County"
def test_pr_aggregation():
    """Puerto Rico (FIPS 72) has latest values with occupancy rates below 100%."""
    dataset = combined_datasets.load_us_timeseries_dataset()
    data = dataset.get_one_region(Region.from_fips("72")).latest
    assert data
    assert data["all_beds_occupancy_rate"] < 1
    assert data["icu_occupancy_rate"] < 1
from covidactnow.datapublic.common_fields import CommonFields
from covidactnow.datapublic.common_fields import FieldName
from covidactnow.datapublic.common_fields import PdFields
from libs.datasets import taglib
from libs.datasets import timeseries
from libs.datasets.taglib import TagField
from libs.datasets.taglib import TagType
from libs.datasets.taglib import UrlStr
from libs.pipeline import Region

# This is a totally bogus fips/region/location that we've been using as a default in some test
# cases. It is factored out here in an attempt to reduce how much it is hard-coded into our source.
DEFAULT_FIPS = "97222"
DEFAULT_REGION = Region.from_fips(DEFAULT_FIPS)

T = TypeVar("T")


def _to_list(list_or_scalar: Union[None, T, List[T]]) -> List[T]:
    """Normalize an optional scalar-or-list value to a list.

    Returns the argument unchanged if it is already a list, an empty list for
    None, and a one-element list wrapping any other value.

    Fix: the previous truthiness check (`elif list_or_scalar:`) silently turned
    falsy scalars such as 0 or "" into [], contradicting the declared
    Union[None, T, List[T]] contract where only None maps to the empty list.
    Also use isinstance with the builtin `list` rather than the typing alias.
    """
    if isinstance(list_or_scalar, list):
        return list_or_scalar
    elif list_or_scalar is not None:
        return [list_or_scalar]
    else:
        return []
def get_subset_regions(exclude_county_999: bool, **kwargs) -> List[Region]:
    """Return the Regions (one per unique FIPS) in a filtered subset of the latest US dataset.

    Args:
        exclude_county_999: forwarded to get_subset; presumably drops the synthetic
            "999" unknown-county rows — TODO confirm against get_subset.
        **kwargs: additional filter criteria forwarded to get_subset.
    """
    us_latest = load_us_latest_dataset()
    us_subset = us_latest.get_subset(exclude_county_999=exclude_county_999, **kwargs)
    return [Region.from_fips(fips) for fips in us_subset.data[CommonFields.FIPS].unique()]
import dataclasses
from functools import lru_cache

from libs.datasets import AggregationLevel
from libs.datasets import data_source
import pandas as pd
from covidactnow.datapublic.common_fields import CommonFields
from libs.datasets.sources import can_scraper_helpers as ccd_helpers
from libs.datasets.timeseries import MultiRegionDataset
from libs.pipeline import Region

# Washington DC is both a single county and a state; these two location ids refer to it.
DC_COUNTY_LOCATION_ID = Region.from_fips("11001").location_id
DC_STATE_LOCATION_ID = Region.from_state("DC").location_id


def _remove_trailing_zeros(series: pd.Series) -> pd.Series:
    """Return a copy of `series` with values after the last non-zero observation set to None.

    The input is expected to have a MultiIndex containing a DATE level; the result is
    indexed by date only.
    """
    # Copy the values onto a date-only index so the caller's Series is not mutated.
    series = pd.Series(series.values.copy(), index=series.index.get_level_values(CommonFields.DATE))
    # Date of the last observation that is neither zero nor NaN.
    index = series.loc[series != 0].last_valid_index()
    if index is None:
        # If test positivity is 0% the entire time, consider the data inaccurate and
        # return an all-None series.
        series[:] = None
        return series
    # Null out everything from the day after the last non-zero observation onward.
    series[index + pd.DateOffset(1):] = None
    return series