def convert_geo(df: pd.DataFrame, geo: str, gmpr: GeoMapper) -> pd.DataFrame:
    """
    Map a DataFrame to desired regions.

    The HHS facility level data contains columns for zip, state, and fips. For state
    and fips, we use them as given. For all other geos, we map from zip (the smallest
    of the regions) to the desired geo.

    Parameters
    ----------
    df: pd.DataFrame
        Input DataFrame containing zip, state, and fips columns.
    geo:
        Desired new geographic resolution.
    gmpr:
        GeoMapper object.

    Returns
    -------
    DataFrame containing new geography column `geo_id` in the `geo` resolution.
    """
    if geo == "county":
        output_df = df.copy()
        output_df["geo_id"] = output_df["fips_code"]
    elif geo == "state":
        output_df = df.copy()
        output_df["geo_id"] = output_df["state"]
    elif geo == "hrr":
        # use zip for HRR since zips nest within HRR while FIPS split across HRRs.
        output_df = gmpr.add_geocode(df, "zip", geo)
        output_df["geo_id"] = output_df[geo]
    else:
        output_df = gmpr.add_geocode(df, "fips", geo, from_col="fips_code")
        output_df["geo_id"] = output_df[geo]
    return output_df
def aggregate(df, signal_names, geo_resolution='county'):
    """Aggregate signals to appropriate resolution and produce standard errors.

    Parameters
    ----------
    df: pd.DataFrame
        County block group-level data with prepared signals
        (output of construct_signals()).
    signal_names: List[str]
        Names of signals to be exported.
    geo_resolution: str
        One of ('county', 'state', 'msa', 'hrr', 'hhs', 'nation')

    Returns
    -------
    pd.DataFrame:
        DataFrame with one row per geo_id, with columns for the individual
        signals, standard errors, and sample sizes.
    """
    # Prepare geo resolution
    gmpr = GeoMapper()
    if geo_resolution == 'county':
        geo_transformed_df = df.copy()
        geo_transformed_df['geo_id'] = df['county_fips']
    elif geo_resolution == 'state':
        geo_transformed_df = gmpr.add_geocode(df,
                                              from_col='county_fips',
                                              from_code='fips',
                                              new_code='state_id',
                                              new_col='geo_id',
                                              dropna=False)
    elif geo_resolution in ['msa', 'nation', 'hrr', 'hhs']:
        geo_transformed_df = gmpr.add_geocode(df,
                                              from_col='county_fips',
                                              from_code='fips',
                                              new_code=geo_resolution,
                                              new_col='geo_id',
                                              dropna=False)
    else:
        raise ValueError(
            f'`geo_resolution` must be one of {GEO_RESOLUTIONS}.')

    # Aggregation and signal creation
    grouped_df = geo_transformed_df.groupby(['geo_id'])[signal_names]
    df_mean = grouped_df.mean()
    df_sd = grouped_df.std()
    df_n = grouped_df.count()
    agg_df = pd.DataFrame.join(df_mean, df_sd,
                               lsuffix='_mean',
                               rsuffix='_sd')
    agg_df = pd.DataFrame.join(agg_df, df_n.rename({
        signal: signal + '_n' for signal in signal_names
    }, axis=1))
    for signal in signal_names:
        agg_df[f'{signal}_se'] = (agg_df[f'{signal}_sd']
                                  / np.sqrt(agg_df[f'{signal}_n']))
    return agg_df.reset_index()
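# The standard error produced by aggregate() is the per-geo standard deviation divided
# by the square root of the per-geo count. A minimal usage sketch, assuming aggregate()
# and its dependencies (GeoMapper, numpy, pandas) are importable from the indicator
# module; the signal column name and values below are fabricated for illustration only.

import pandas as pd

toy = pd.DataFrame({
    "county_fips": ["01001", "01001", "01003"],
    "example_signal": [0.2, 0.4, 0.3],  # hypothetical signal column
})
out = aggregate(toy, ["example_signal"], geo_resolution="county")
# Expected columns: geo_id, example_signal_mean, example_signal_sd,
# example_signal_n, example_signal_se, where
# example_signal_se = example_signal_sd / sqrt(example_signal_n).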
def run_module(params):
    """
    Run the indicator.

    Arguments
    ---------
    params: Dict[str, Any]
        Nested dictionary of parameters.
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    run_stats = []
    ## build the base version of the signal at the most detailed geo level you can get.
    ## compute stuff here or farm out to another function or file
    all_data = pd.DataFrame(columns=["timestamp", "val", "zip", "sample_size", "se"])
    ## aggregate & smooth
    ## TODO: add num/prop variations if needed
    for sensor, smoother, geo in product(SIGNALS, SMOOTHERS, GEOS):
        df = mapper.replace_geocode(
            all_data, "zip", geo,
            new_col="geo_id",
            date_col="timestamp")
        ## TODO: recompute sample_size, se here if not NA
        df["val"] = df[["geo_id", "val"]].groupby("geo_id")["val"].transform(
            smoother[0].smooth)
        sensor_name = sensor + smoother[1]  ## TODO: +num/prop variation if used
        # don't export first 6 days for smoothed signals since they'll be nan.
        start_date = min(df.timestamp) + timedelta(6) if smoother[1] else min(df.timestamp)
        dates = create_export_csv(
            df,
            params["common"]["export_dir"],
            geo,
            sensor_name,
            start_date=start_date)
        if len(dates) > 0:
            run_stats.append((max(dates), len(dates)))

    ## log this indicator run
    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    min_max_date = run_stats and min(s[0] for s in run_stats)
    csv_export_count = sum(s[-1] for s in run_stats)
    max_lag_in_days = min_max_date and (datetime.now() - min_max_date).days
    formatted_min_max_date = min_max_date and min_max_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_min_max_date)
def run_module(params) -> None:
    """
    Run entire hhs_facilities indicator.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    raw_df = pull_data()
    gmpr = GeoMapper()
    filled_fips_df = fill_missing_fips(raw_df, gmpr)
    for geo, (sig_name, sig_cols, sig_func, sig_offset) in product(GEO_RESOLUTIONS, SIGNALS):
        mapped_df = convert_geo(filled_fips_df, geo, gmpr)
        output_df = generate_signal(mapped_df, sig_cols, sig_func, sig_offset)
        create_export_csv(output_df, params["common"]["export_dir"], geo, sig_name)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
def test_hrr_msa(self):
    """Tests that values are correctly aggregated at the HRR and MSA level."""
    df = pd.DataFrame({
        "fips": ["13009", "13017", "13021", "09015"],
        "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"],
        "new_counts": [10, 15, 2, 13],
        "cumulative_counts": [100, 20, 45, 60],
    })
    hrr_df = geo_map(df, "hrr", SENSOR)
    msa_df = geo_map(df, "msa", SENSOR)
    assert msa_df.shape == (2, 7)
    gmpr = GeoMapper()
    df = gmpr.add_population_column(df, "fips")
    assert np.isclose(hrr_df.new_counts.sum(), df.new_counts.sum())
    assert np.isclose(hrr_df.population.sum(), df.population.sum())
    assert hrr_df.shape == (5, 7)
def test_good_file(self):
    df = pull_nchs_mortality_data(TOKEN, "test_data.csv")

    # Test columns
    assert (df.columns.values == [
        'covid_19_deaths', 'total_deaths', 'percent_of_expected_deaths',
        'pneumonia_deaths', 'pneumonia_and_covid_19_deaths',
        'influenza_deaths', 'pneumonia_influenza_or_covid_19_deaths',
        "timestamp", "geo_id", "population"]).all()

    # Test aggregation for NYC and NY
    raw_df = pd.read_csv("./test_data/test_data.csv", parse_dates=["start_week"])
    raw_df = standardize_columns(raw_df)
    for metric in METRICS:
        ny_list = raw_df.loc[(raw_df["state"] == "New York")
                             & (raw_df[metric].isnull()), "timestamp"].values
        nyc_list = raw_df.loc[(raw_df["state"] == "New York City")
                              & (raw_df[metric].isnull()), "timestamp"].values
        final_list = df.loc[(df["geo_id"] == "ny")
                            & (df[metric].isnull()), "timestamp"].values
        assert set(final_list) == set(ny_list).intersection(set(nyc_list))

    # Test missing value
    gmpr = GeoMapper()
    # Build an aligned state_id -> state_name mapping so names and ids stay paired
    # when iterating (zipping two separate DataFrames iterates column labels, not rows).
    states = gmpr.add_geocode(
        pd.DataFrame({"state_id": sorted(gmpr.get_geo_values("state_id"))}),
        "state_id", "state_name")
    for state, geo_id in zip(states["state_name"], states["state_id"]):
        if state in set(["New York", "New York City"]):
            continue
        for metric in METRICS:
            test_list = raw_df.loc[(raw_df["state"] == state)
                                   & (raw_df[metric].isnull()), "timestamp"].values
            final_list = df.loc[(df["geo_id"] == geo_id)
                                & (df[metric].isnull()), "timestamp"].values
            assert set(final_list) == set(test_list)
def test_county(self):
    """Tests that values are correctly aggregated at the county level."""
    df = pd.DataFrame({
        "fips": ["53003", "48027", "50103"],
        "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
        "new_counts": [10, 15, 2],
        "cumulative_counts": [100, 20, 45],
    })
    new_df = geo_map(df, "county", SENSOR)
    gmpr = GeoMapper()
    df = gmpr.add_population_column(df, "fips")
    exp_incidence = df["new_counts"] / df["population"] * 100000
    exp_cprop = df["cumulative_counts"] / df["population"] * 100000
    assert set(new_df["geo_id"].values) == set(df["fips"].values)
    assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
    assert set(new_df["incidence"].values) == set(exp_incidence.values)
    assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
def test_fill_missing_fips(self):
    gmpr = GeoMapper()
    test_input = pd.DataFrame({
        "hospital_pk": ["test", "test2", "test3"],
        "fips_code": ["fakefips", np.nan, np.nan],
        "zip": ["01001", "01001", "00601"],
        "val1": [1.0, 5.0, 10.0],
        "val2": [2.0, 25.0, 210.0]
    })
    expected = pd.DataFrame({
        "hospital_pk": ["test", "test2", "test3", "test3"],
        "fips_code": ["fakefips", "25013", "72001", "72141"],
        "zip": ["01001", "01001", "00601", "00601"],
        "val1": [1.0, 5.0, 0.994345718901454 * 10, 0.005654281098546042 * 10],
        "val2": [2.0, 25.0, 0.994345718901454 * 210.0, 0.005654281098546042 * 210.0]
    })
    pd.testing.assert_frame_equal(fill_missing_fips(test_input, gmpr), expected)

    # test all nans stay as nan
    test_input = pd.DataFrame({
        "hospital_pk": ["test", "test2", "test3"],
        "fips_code": ["fakefips", np.nan, np.nan],
        "zip": ["01001", "01001", "00601"],
        "val1": [1.0, 5.0, np.nan],
        "val2": [2.0, 25.0, 210.0]
    })
    expected = pd.DataFrame({
        "hospital_pk": ["test", "test2", "test3", "test3"],
        "fips_code": ["fakefips", "25013", "72001", "72141"],
        "zip": ["01001", "01001", "00601", "00601"],
        "val1": [1.0, 5.0, np.nan, np.nan],
        "val2": [2.0, 25.0, 0.994345718901454 * 210.0, 0.005654281098546042 * 210.0]
    })
    pd.testing.assert_frame_equal(fill_missing_fips(test_input, gmpr), expected)

    # test that populated fips or both nan is no-op
    test_input_no_missing = pd.DataFrame({
        "hospital_pk": ["test", "test2", "test3", "test4"],
        "fips_code": ["fakefips", "testfips", "pseudofips", np.nan],
        "zip": ["01001", "01001", "00601", np.nan],
        "val": [1.0, 5.0, 10.0, 0.0]
    })
    pd.testing.assert_frame_equal(fill_missing_fips(test_input_no_missing, gmpr),
                                  test_input_no_missing)
def run_module() -> None:
    """Run entire hhs_facilities indicator."""
    params = read_params()
    raw_df = pull_data()
    gmpr = GeoMapper()
    filled_fips_df = fill_missing_fips(raw_df, gmpr)
    for geo, (sig_name, sig_cols, sig_func, sig_offset) in product(GEO_RESOLUTIONS, SIGNALS):
        mapped_df = convert_geo(filled_fips_df, geo, gmpr)
        output_df = generate_signal(mapped_df, sig_cols, sig_func, sig_offset)
        create_export_csv(output_df, params["export_dir"], geo, sig_name)
def run_module():
    """Generate ground truth HHS hospitalization data."""
    params = read_params()
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))
    today = date.today()
    past_reference_day = date(year=2020, month=1, day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)
    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        if response['result'] != 1:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)
    for sig in SIGNALS:
        create_export_csv(make_signal(all_columns, sig), params["export_dir"], "state", sig)
def pull_data() -> pd.DataFrame:
    """
    Pull HHS data from Epidata API for all states and dates and convert to a DataFrame.

    Returns
    -------
    DataFrame of HHS data.
    """
    today = int(date.today().strftime("%Y%m%d"))
    past_reference_day = int(date(2020, 1, 1).strftime("%Y%m%d"))  # first available date in DB
    all_states = GeoMapper().get_geo_values("state_id")
    responses = pull_data_iteratively(all_states, Epidata.range(past_reference_day, today))
    all_columns = pd.DataFrame(responses).replace(NAN_VALUES, np.nan)
    all_columns["timestamp"] = pd.to_datetime(all_columns["collection_week"], format="%Y%m%d")
    return all_columns
def fill_missing_fips(df: pd.DataFrame, gmpr: GeoMapper) -> pd.DataFrame:
    """
    Fill in missing FIPS code if zip is present.

    Maps rows that have the FIPS missing but zip present. The rest of the rows,
    including those where both FIPS and zip are nan, are kept as is and appended back at the end.
    Rows with a zip which fail to map to a FIPS are also kept so that column totals remain equal.
    This means that column sums before and after imputation should be identical, and any dropping
    of values is handled by downstream geomapping.

    TODO #636 Generalize this function to geomapper.

    Parameters
    ----------
    df: pd.DataFrame
        Input DataFrame containing zip and fips columns.
    gmpr:
        GeoMapper object.

    Returns
    -------
    DataFrame with missing FIPS imputed with zip.
    """
    mask = pd.isna(df["fips_code"]) & ~pd.isna(df["zip"])
    no_fips = df[mask]
    fips_present = df[~mask]
    no_data_cols = [c for c in df.columns
                    if df[c].dtypes not in (dtype("int64"), dtype("float64"))]
    data_cols = list(set(df.columns) - set(no_data_cols))
    added_fips = gmpr.add_geocode(no_fips, "zip", "fips", dropna=False)
    added_fips["fips_code"] = added_fips["fips"]
    # set weight of unmapped zips to 1 so they don't zero out all the values when multiplied
    added_fips.weight.fillna(1, inplace=True)
    added_fips[data_cols] = added_fips[data_cols].multiply(added_fips["weight"], axis=0)
    fips_filled = added_fips.groupby(no_data_cols, dropna=False, as_index=False).sum(min_count=1)
    fips_filled.drop(columns="weight", inplace=True)
    return pd.concat([fips_present, fips_filled]).reset_index(drop=True)
def run_module(params):
    """
    Generate ground truth HHS hospitalization data.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_filename" (optional): str, name of file to write logs
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))
    today = date.today()
    past_reference_day = date(year=2020, month=1, day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)
    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        # The last date range might only have recent days that don't have any data, so don't error.
        if response["result"] != 1 and r != date_range[-1]:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        if response["result"] == -2 and r == date_range[-1]:  # -2 code means no results
            continue
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)
    geo_mapper = GeoMapper()
    for sig in SIGNALS:
        state = geo_mapper.add_geocode(make_signal(all_columns, sig),
                                       "state_id",
                                       "state_code",
                                       from_col="state")
        for geo in GEOS:
            create_export_csv(make_geo(state, geo, geo_mapper),
                              params["common"]["export_dir"], geo, sig)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
def test_convert_geo(self):
    gmpr = GeoMapper()
    test_input = pd.DataFrame({
        "state": ["test"],
        "fips_code": ["01001"],
        "zip": ["01001"],
    })
    test_state_output = convert_geo(test_input, "state", gmpr)
    pd.testing.assert_series_equal(
        test_state_output.geo_id, pd.Series(["test"]), check_names=False)
    test_county_output = convert_geo(test_input, "county", gmpr)
    pd.testing.assert_series_equal(
        test_county_output.geo_id, pd.Series(["01001"]), check_names=False)
    test_msa_output = convert_geo(test_input, "msa", gmpr)
    pd.testing.assert_series_equal(
        test_msa_output.geo_id, pd.Series(["33860"]), check_names=False)
    test_hrr_output = convert_geo(test_input, "hrr", gmpr)
    pd.testing.assert_series_equal(
        test_hrr_output.geo_id, pd.Series(["230"]), check_names=False)
def test_make_geo():
    """Check that geographies transform correctly."""
    test_timestamp = datetime(year=2020, month=1, day=1)
    geo_mapper = GeoMapper()

    data = pd.DataFrame({
        'state': ['PA', 'WV', 'OH'],
        'state_code': [42, 54, 39],
        'timestamp': [test_timestamp] * 3,
        'val': [1, 2, 4],
    })

    template = {
        'se': np.nan,
        'sample_size': np.nan,
    }
    expecteds = {
        "state": pd.DataFrame(
            dict(template,
                 geo_id=data.state,
                 timestamp=data.timestamp,
                 val=data.val)),
        "hhs": pd.DataFrame(
            dict(template,
                 geo_id=['3', '5'],
                 timestamp=[test_timestamp] * 2,
                 val=[3, 4])),
        "nation": pd.DataFrame(
            dict(template,
                 geo_id=['us'],
                 timestamp=[test_timestamp],
                 val=[7]))
    }
    for geo, expected in expecteds.items():
        result = make_geo(data, geo, geo_mapper)
        for series in ["geo_id", "timestamp", "val", "se", "sample_size"]:
            pd.testing.assert_series_equal(expected[series], result[series], obj=f"{geo}:{series}")
def geomapper():
    return GeoMapper()
class GeoMaps:
    """Class to map counties to other geographic resolutions."""

    def __init__(self):
        """Create the underlying GeoMapper."""
        self.gmpr = GeoMapper()
        self.geo_func = {
            "county": partial(self.county_to_megacounty,
                              threshold_visits=Config.MIN_RECENT_VISITS,
                              threshold_len=Config.RECENT_LENGTH),
            "state": self.county_to_state,
            "msa": self.county_to_msa,
            "hrr": self.county_to_hrr,
            "hhs": self.county_to_hhs,
            "nation": self.county_to_nation
        }

    @staticmethod
    def convert_fips(x):
        """Ensure fips is a string of length 5."""
        return str(x).zfill(5)

    def county_to_msa(self, data):
        """Aggregate county data to the msa resolution.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns:
            tuple of dataframe at the daily-msa resolution, and the geo_id column name
        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "msa",
                                     from_col="PatCountyFIPS",
                                     new_col="cbsa_id")
        data.drop(columns="PatCountyFIPS", inplace=True)
        data = data.groupby(["ServiceDate", "cbsa_id"]).sum().reset_index()
        return data.groupby("cbsa_id"), "cbsa_id"

    def county_to_state(self, data):
        """Aggregate county data to the state resolution.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns:
            tuple of dataframe at the daily-state resolution, and geo_id column name
        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "state_id",
                                     from_col="PatCountyFIPS")
        data.drop(columns="PatCountyFIPS", inplace=True)
        data = data.groupby(["ServiceDate", "state_id"]).sum().reset_index()
        return data.groupby("state_id"), "state_id"

    def county_to_hhs(self, data):
        """Aggregate county data to the HHS region resolution.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns:
            tuple of dataframe at the daily-HHS resolution, and geo_id column name
        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "hhs",
                                     from_col="PatCountyFIPS")
        data.drop(columns="PatCountyFIPS", inplace=True)
        data = data.groupby(["ServiceDate", "hhs"]).sum().reset_index()
        return data.groupby("hhs"), "hhs"

    def county_to_nation(self, data):
        """Aggregate county data to the nation resolution.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns:
            tuple of dataframe at the daily-nation resolution, and geo_id column name
        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "nation",
                                     from_col="PatCountyFIPS")
        data.drop(columns="PatCountyFIPS", inplace=True)
        data = data.groupby(["ServiceDate", "nation"]).sum().reset_index()
        return data.groupby("nation"), "nation"

    def county_to_hrr(self, data):
        """Aggregate county data to the HRR resolution.

        Note that counties are not strictly contained within HRRs. When a county
        spans boundaries, we report it with the same rate in each containing HRR,
        but with a sample size weighted by how much it overlaps that HRR.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns:
            tuple of (data frame at daily-HRR resolution, geo_id column name)
        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "hrr",
                                     from_col="PatCountyFIPS")
        data.drop(columns="PatCountyFIPS", inplace=True)
        ## do a weighted sum by the weight column to get each HRR's contribution
        tmp = data.groupby(["ServiceDate", "hrr"])
        wtsum = lambda g: g["weight"].values @ g[Config.COUNT_COLS]
        data = tmp.apply(wtsum).reset_index()
        return data.groupby("hrr"), "hrr"

    def county_to_megacounty(self, data, threshold_visits, threshold_len):
        """Convert to megacounty and groupby FIPS using GeoMapper package.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)
            threshold_visits: count threshold to determine when to convert to megacounty.
            threshold_len: number of days to use when thresholding.

        Returns:
            tuple of dataframe at the daily-county (megacounty) resolution, and geo_id column name
        """
        all_data = self.gmpr.fips_to_megacounty(data,
                                                threshold_visits,
                                                threshold_len,
                                                fips_col="PatCountyFIPS",
                                                thr_col="Denominator",
                                                date_col="ServiceDate")
        all_data.rename({"megafips": "PatCountyFIPS"}, axis=1, inplace=True)
        megacounties = all_data[all_data.PatCountyFIPS.str.endswith("000")]
        data = pd.concat([data, megacounties])
        return data.groupby("PatCountyFIPS"), "PatCountyFIPS"
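# To make the HRR weighting used by county_to_hrr concrete, here is a minimal sketch of
# what the wtsum lambda computes for one (ServiceDate, hrr) group. The count column names
# below are assumptions standing in for Config.COUNT_COLS, and the weights are fabricated.

import pandas as pd

# Toy group: two county rows overlapping one HRR with weights 0.6 and 1.0.
group = pd.DataFrame({
    "weight": [0.6, 1.0],
    "Covid_like": [10, 20],     # hypothetical count column
    "Denominator": [100, 50],   # hypothetical count column
})
# Same operation as wtsum: weight vector times the count-column matrix.
weighted = group["weight"].values @ group[["Covid_like", "Denominator"]]
# weighted["Covid_like"] == 0.6 * 10 + 1.0 * 20 == 26.0
# weighted["Denominator"] == 0.6 * 100 + 1.0 * 50 == 110.0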
def pull_nchs_mortality_data(token: str, test_file: Optional[str] = None):
    """Pull the latest NCHS Mortality data, and conform it into a dataset.

    The output dataset has:

    - Each row corresponds to (State, Week), denoted (geo_id, timestamp).
    - Each row additionally has columns 'covid_19_deaths', 'total_deaths',
      'percent_of_expected_deaths', 'pneumonia_deaths',
      'pneumonia_and_covid_19_deaths', 'influenza_deaths', and
      'pneumonia_influenza_or_covid_19_deaths' corresponding to the aggregate
      metrics from Feb. 1st until the latest date.
    - New York City is included in New York State.

    Parameters
    ----------
    token: str
        My App Token for pulling the NCHS mortality data
    test_file: Optional[str]
        When not null, name of file from which to read test data

    Returns
    -------
    pd.DataFrame
        Dataframe as described above.
    """
    # Constants
    keep_columns = METRICS.copy()
    type_dict = {key: float for key in keep_columns}
    type_dict["timestamp"] = 'datetime64[ns]'

    if test_file:
        df = pd.read_csv("./test_data/%s" % test_file)
    else:
        # Pull data from Socrata API
        client = Socrata("data.cdc.gov", token)
        results = client.get("r8kw-7aab", limit=10**10)
        df = pd.DataFrame.from_records(results)
        # drop "By Total" rows
        df = df[df["group"].transform(str.lower) == "by week"]

    df = standardize_columns(df)

    if "end_date" in df.columns:
        # Check missing week_ending_date == end_date
        try:
            assert all(df["week_ending_date"] == df["end_date"])
        except AssertionError as exc:
            raise ValueError(
                "week_ending_date is not always the same as end_date, check the raw file"
            ) from exc
    else:
        # Check missing start_week == end_week
        try:
            assert all(df["timestamp"] == df["end_week"])
        except AssertionError as exc:
            raise ValueError(
                "end_week is not always the same as start_week, check the raw file"
            ) from exc

    try:
        df = df.astype(type_dict)
    except KeyError as exc:
        raise ValueError(f"""
Expected column(s) missing; the dataset schema may
have changed. Please investigate and amend the code.

Columns needed:
{NEWLINE.join(type_dict.keys())}

Columns available:
{NEWLINE.join(df.columns)}
""") from exc

    # Drop rows for locations outside US
    df = df[df["state"] != "United States"]
    df = df.loc[:, keep_columns + ["timestamp", "state"]].set_index("timestamp")

    # NCHS considers NYC as an individual state, however, we want it included
    # in NY. If values are nan for both NYC and NY, the aggregation should
    # also have NAN.
    df_ny = df.loc[df["state"] == "New York", :].drop("state", axis=1)
    df_nyc = df.loc[df["state"] == "New York City", :].drop("state", axis=1)
    # Get mask df to ignore cells where both of them have NAN values
    mask = (df_ny[keep_columns].isnull().values
            & df_nyc[keep_columns].isnull().values)
    df_ny = df_ny.append(df_nyc).groupby("timestamp").sum().where(~mask, np.nan)
    df_ny["state"] = "New York"
    # Drop NYC and NY in the full dataset
    df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
    df = df.append(df_ny).reset_index().sort_values(["state", "timestamp"])

    # Add population info
    keep_columns.extend(["timestamp", "geo_id", "population"])
    gmpr = GeoMapper()
    df = gmpr.add_population_column(df, "state_name", geocode_col="state")
    df = gmpr.add_geocode(df, "state_name", "state_id",
                          from_col="state", new_col="geo_id")
    return df[keep_columns]
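# To illustrate the NY/NYC combination rule above (a cell stays NaN only when both
# New York and New York City are NaN for that week), here is a minimal sketch with
# fabricated values; it mirrors the append/groupby/where steps rather than calling
# pull_nchs_mortality_data itself.

import numpy as np
import pandas as pd

weeks = pd.to_datetime(["2020-10-03", "2020-10-10"])
ny = pd.DataFrame({"covid_19_deaths": [np.nan, 5.0]}, index=weeks)
nyc = pd.DataFrame({"covid_19_deaths": [np.nan, np.nan]}, index=weeks)
# Mask cells where both NY and NYC are missing.
mask = ny.isnull().values & nyc.isnull().values
combined = pd.concat([ny, nyc]).groupby(level=0).sum().where(~mask, np.nan)
# 2020-10-03 stays NaN (both missing); 2020-10-10 is 5.0 (the NYC NaN is skipped in the sum).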
is only used for visualization. It sources Puerto Rico from jhu-csse and
everything else from usa-facts.
"""
from datetime import date, timedelta, datetime
from itertools import product
import re
import time

import covidcast
import pandas as pd

from delphi_utils import add_prefix, get_structured_logger
from delphi_utils.geomap import GeoMapper

from .constants import METRICS, SMOOTH_TYPES, SENSORS, GEO_RESOLUTIONS

GMPR = GeoMapper()

COLUMN_MAPPING = {
    "time_value": "timestamp",
    "geo_value": "geo_id",
    "value": "val",
    "stderr": "se",
    "sample_size": "sample_size"
}

EMPTY_FRAME = pd.DataFrame({}, columns=COLUMN_MAPPING.values())

covidcast.covidcast._ASYNC_CALL = True  # pylint: disable=protected-access


def check_none_data_frame(data_frame, label, date_range):