def test_aggregate():
    """Aggregating two counties into one CBSA sums each metric per date."""
    csv_rows = (
        "fips,state,aggregate_level,county,m1,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,11\n"
        "55005,ZZ,county,North County,2,2020-05-02,22\n"
        "55005,ZZ,county,North County,3,2020-05-03,33\n"
        "55005,ZZ,county,North County,0,2020-05-04,0\n"
        "55006,ZZ,county,South County,0,2020-05-01,0\n"
        "55006,ZZ,county,South County,0,2020-05-02,0\n"
        "55006,ZZ,county,South County,3,2020-05-03,44\n"
        "55006,ZZ,county,South County,4,2020-05-04,55\n"
        "55,ZZ,state,Grand State,41,2020-05-01,66\n"
        "55,ZZ,state,Grand State,43,2020-05-03,77\n"
    )
    input_df = read_csv_and_index_fips_date(csv_rows).reset_index()
    dataset_in = MultiRegionDataset.from_fips_timeseries_df(input_df)
    aggregator = statistical_areas.CountyToCBSAAggregator(
        county_map={"55005": "10001", "55006": "10001"},
        cbsa_title_map={"10001": "Stat Area 1"},
        aggregations=[],
    )
    dataset_out = aggregator.aggregate(dataset_in)
    # Both counties map to one CBSA; the state row is dropped by aggregation.
    assert dataset_out.groupby_region().ngroups == 1
    cbsa_ts = dataset_out.get_one_region(Region.from_cbsa_code("10001"))
    expected_m1 = {
        pd.to_datetime("2020-05-01"): 1,
        pd.to_datetime("2020-05-02"): 2,
        pd.to_datetime("2020-05-03"): 6,
        pd.to_datetime("2020-05-04"): 4,
    }
    assert cbsa_ts.date_indexed["m1"].to_dict() == expected_m1
def make_dataset(cls) -> timeseries.MultiRegionDataset:
    """Default implementation of make_dataset that loads timeseries data from a CSV."""
    # The class must declare where its CSV lives under the public data checkout.
    assert cls.COMMON_DF_CSV_PATH, f"No path in {cls}"
    csv_path = dataset_utils.LOCAL_PUBLIC_DATA_PATH / cls.COMMON_DF_CSV_PATH
    checked = cls._check_data(common_df.read_csv(csv_path, set_index=False))
    dataset = MultiRegionDataset.from_fips_timeseries_df(checked)
    return dataset.add_provenance_all(cls.SOURCE_NAME)
def _fips_csv_to_one_region(
    csv_str: str, region: Region, latest=None
) -> OneRegionTimeseriesDataset:
    """Parse a FIPS-keyed CSV string and return the timeseries for a single region.

    When `latest` is given it replaces the `latest` attribute of the result.
    """
    parsed = read_csv_and_index_fips_date(csv_str).reset_index()
    # from_fips_timeseries_df adds the location_id column needed by get_one_region.
    one_region = MultiRegionDataset.from_fips_timeseries_df(parsed).get_one_region(region)
    if not latest:
        return one_region
    return dataclasses.replace(one_region, latest=latest)
def test_load_from_local_public_data():
    """CBSA aggregation works with mappings loaded from the local public data checkout."""
    aggregator = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
    # Disable scaled aggregations
    aggregator = dataclasses.replace(aggregator, aggregations=[])
    assert aggregator.cbsa_title_map["43580"] == "Sioux City, IA-NE-SD"
    assert aggregator.county_map["48187"] == "41700"
    csv_rows = (
        "fips,state,aggregate_level,county,m1,date,foo\n"
        "48059,ZZ,county,North County,3,2020-05-03,33\n"
        "48253,ZZ,county,South County,4,2020-05-03,77\n"
        "48441,ZZ,county,Other County,2,2020-05-03,41\n"
    )
    input_df = read_csv_and_index_fips_date(csv_rows).reset_index()
    dataset_in = MultiRegionDataset.from_fips_timeseries_df(input_df)
    dataset_out = aggregator.aggregate(dataset_in)
    cbsa_ts = dataset_out.get_one_region(Region.from_cbsa_code("10180"))
    # The three counties' m1 values sum to 9 on the single shared date.
    assert cbsa_ts.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-03"): 9,
    }
def make_dataset(cls) -> timeseries.MultiRegionDataset:
    """Default implementation of make_dataset that loads data from the parquet file."""
    assert cls.VARIABLES
    ccd = CanScraperBase._get_covid_county_dataset()
    raw_data, source_urls = ccd.query_multiple_variables(
        cls.VARIABLES, log_provider_coverage_warnings=True
    )
    checked = cls._check_data(cls.transform_data(raw_data))
    dataset = MultiRegionDataset.from_fips_timeseries_df(checked).add_provenance_all(
        cls.SOURCE_NAME
    )
    if source_urls.empty:
        return dataset
    # For each FIPS-VARIABLE pair keep the source_url row with the last DATE.
    latest_urls = (
        source_urls.sort_values(CommonFields.DATE)
        .groupby([CommonFields.FIPS, PdFields.VARIABLE], sort=False)
        .last()
        .reset_index()
        .drop(columns=[CommonFields.DATE])
    )
    latest_urls[taglib.TagField.TYPE] = taglib.TagType.SOURCE_URL
    return dataset.append_fips_tag_df(latest_urls)
def make_dataset(cls) -> timeseries.MultiRegionDataset:
    """Default implementation of make_dataset that loads timeseries data from a CSV."""
    cleaned = cls._check_and_removed_unexpected_data(cls._load_data())
    dataset = MultiRegionDataset.from_fips_timeseries_df(cleaned)
    return dataset.add_tag_all_bucket(cls.source_tag())