def add_county_pop(df: pd.DataFrame, gmpr: GeoMapper):
    """
    Attach a population column to county-level data, including US territories.

    Per the data source notes, Guam, the Northern Mariana Islands, American
    Samoa, and the Virgin Islands are reported by JHU under megafips codes
    rather than real county FIPS codes, so a plain county-level population
    lookup would leave them without a population. Rows carrying one of those
    four megafips codes are therefore looked up through their "state_id"
    (i.e. the whole territory's population); all remaining rows are looked up
    directly by "fips".

    Parameters
    ----------
    df
        DataFrame with county-level information and a county column named
        "fips".
    gmpr
        GeoMapper used for the geocode conversion and population lookups.

    Returns
    -------
    DataFrame with the population column added. Non-territory rows come
    first, followed by the territory rows, with a fresh 0..n-1 index.
    """
    territory_megafips = ["78000", "69000", "66000", "60000"]
    territory_mask = df.fips.isin(territory_megafips)

    # Territories: convert fips -> state_id, look up population by state_id,
    # then remove the helper column so both halves share the same columns.
    territory_rows = gmpr.add_population_column(
        gmpr.add_geocode(df[territory_mask], "fips", "state_id"),
        "state_id",
        dropna=False,
    )
    territory_rows.drop(columns="state_id", inplace=True)

    # Everything else: population is looked up directly by county fips.
    county_rows = gmpr.add_population_column(
        df[~territory_mask], "fips", dropna=False
    )

    return pd.concat([county_rows, territory_rows], ignore_index=True)
def aggregate(df, metric, geo_res):
    """
    Aggregate a zip-level signal to the requested geographic resolution.

    The input is copied, given a population column keyed on "zip", and then
    re-keyed from "zip" to the target geography, carrying along the
    "<metric>_num" and "population" columns. The "<metric>_prop" column is
    computed afterwards as count / population * INCIDENCE_BASE.

    Parameters
    ----------
    df: pd.DataFrame
        Zip code-level data with prepared metrics (output of
        construct_metrics()). Must contain "zip", "timestamp" and
        "<metric>_num" columns.
    metric: str
        Name of the metric to be exported; used as the prefix of the
        "_num" and "_prop" column names.
    geo_res: str
        Geographic resolution; must be a key of GEO_KEY_DICT
        (e.g. 'county', 'hrr', 'msa', 'state', 'hhs', 'nation').

    Returns
    -------
    pd.DataFrame
        The aggregated data with the geography column renamed to "geo_id"
        and the "<metric>_prop" column added.
    """
    frame = df.copy()
    count_col = metric + "_num"
    prop_col = metric + "_prop"

    mapper = GeoMapper()
    target_geo = GEO_KEY_DICT[geo_res]

    frame = mapper.add_population_column(frame, "zip")
    frame = mapper.replace_geocode(
        frame,
        "zip",
        target_geo,
        date_col="timestamp",
        data_cols=[count_col, "population"],
    )

    # Proportion is derived after aggregation so that it reflects the
    # aggregated count over the aggregated population.
    counts = frame[count_col]
    population = frame["population"]
    frame[prop_col] = counts / population * INCIDENCE_BASE

    return frame.rename(columns={target_geo: "geo_id"})
def test_msa_hrr(self, jhu_confirmed_test_data):
    """Check geo_map's MSA and HRR output against a direct GeoMapper aggregation."""
    count_columns = ["new_counts", "population", "cumulative_counts"]

    for geo in ["msa", "hrr"]:
        actual = geo_map(jhu_confirmed_test_data, geo, "cumulative_prop")

        # Build the reference result straight from the GeoMapper.
        mapper = GeoMapper()
        expected = mapper.add_population_column(jhu_confirmed_test_data, "fips")
        expected = mapper.replace_geocode(
            expected, "fips", geo, date_col="timestamp"
        )

        # Align both frames on (geo, timestamp) so they compare row by row.
        actual = actual.set_index(["geo_id", "timestamp"]).sort_index()
        expected = expected.set_index([geo, "timestamp"]).sort_index()

        # The non-proportional columns must match exactly.
        matches = actual.eq(expected)
        assert matches[count_columns].all().all()

        # The proportional signals must equal count / population * base.
        population = expected["population"]
        expected_incidence = expected["new_counts"] / population * INCIDENCE_BASE
        expected_cumulative_prop = (
            expected["cumulative_counts"] / population * INCIDENCE_BASE
        )
        assert actual["incidence"].eq(expected_incidence).all()
        assert actual["cumulative_prop"].eq(expected_cumulative_prop).all()

        # The proportional signals must never contain inf values.
        assert not actual["incidence"].eq(np.inf).any()
        assert not actual["cumulative_prop"].eq(np.inf).any()
def pull_jhu_data(base_url: str, metric: str, gmpr: GeoMapper) -> pd.DataFrame:
    """Pull the latest Johns Hopkins CSSE data, and conform it into a dataset.

    The output dataset has:

    - Each row corresponds to (County, Date), denoted (FIPS, timestamp)
    - Each row additionally has a column `new_counts` corresponding to the
      new counts (either `confirmed` cases or `deaths`), and a column
      `cumulative_counts`, corresponding to the aggregate metric from
      January 22nd (as of April 27th) until the latest date.

    Note that the raw dataset gives the `cumulative_counts` metric, from
    which we compute `new_counts` by taking first differences. Hence,
    `new_counts` may be negative. This is wholly dependent on the quality
    of the raw dataset.

    We filter the data such that we only keep rows with valid FIPS or "FIPS"
    codes defined under the exceptions of the README.

    Parameters
    ----------
    base_url: str
        Base URL for pulling the JHU CSSE data.
    metric: str
        One of 'confirmed' or 'deaths'.
    gmpr: GeoMapper
        An instance of the geomapping utility. It is used as given; a new
        GeoMapper is only created when None is passed.

    Returns
    -------
    pd.DataFrame
        Dataframe as described above, with columns "fips", "timestamp",
        "population", "new_counts" and "cumulative_counts", in that order.
    """
    df = download_data(base_url, metric)

    # Honor the caller-supplied mapper instead of unconditionally replacing
    # it; previously the argument was ignored and a fresh GeoMapper was
    # built on every call.
    if gmpr is None:
        gmpr = GeoMapper()

    # Translate JHU's UID column into county FIPS codes.
    df = gmpr.replace_geocode(
        df, "jhu_uid", "fips", from_col="UID", date_col="timestamp"
    )

    # Merge in population by FIPS.
    # NOTE(review): the original comment says population is set to NaN for
    # fake FIPS; confirm against add_population_column's default dropna.
    df = gmpr.add_population_column(df, "fips")
    df = create_diffs_column(df)

    # Final sanity checks
    sanity_check_data(df)

    # Reorder columns
    return df[["fips", "timestamp", "population", "new_counts", "cumulative_counts"]]