class ALCountyVaccineAge(ALCountyVaccineSex):
    variable_columns = ["AGECAT"]
    sheet_num = 5
    variables = {
        "16-54": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="16-54",
        ),
        "55-64": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="55-64",
        ),
        "65-74": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="65-74",
        ),
        "75+": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="75_plus",
        ),
    }

def pre_normalize(self, data) -> pd.DataFrame:
    df = self.arcgis_jsons_to_df(data)

    # Make column names all-lowercase
    df.columns = [x.lower() for x in list(df)]
    df = df.rename(columns={"county": "location_name"})

    crename = {
        "med_total": CMU(
            category="hospital_beds_capacity", measurement="current", unit="beds"
        ),
        "covid_patients": CMU(
            category="hospital_beds_in_use_covid",
            measurement="current",
            unit="beds",
        ),
        "icu_total": CMU(
            category="icu_beds_capacity", measurement="current", unit="beds"
        ),
        "icu_avail": CMU(
            category="icu_beds_available", measurement="current", unit="beds"
        ),
    }

    df["dt"] = df["date"].map(self._esri_ts_to_dt)
    out = df.melt(
        id_vars=["location_name", "dt"], value_vars=crename.keys()
    ).dropna()

    # Drop the statewide row and health care coalition (HCC) regions, which
    # are not counties
    non_county_regions = [
        "Pennsylvania",
        "East Central HCC",
        "HCC of Southwest PA",
        "Keystone HCC",
        "North Central HCC",
        "Northeast",
        "Northcentral",
        "Northeast HCC",
        "Northern Tier HCC",
        "Northwest",
        "Southcentral",
        "Southeast HCC",
        "Southeast",
        "Southwest",
    ]
    out = out[~out["location_name"].isin(non_county_regions)]
    out.loc[:, "value"] = pd.to_numeric(out["value"])
    out = self.extract_CMU(out, crename)

    return out.loc[:, self.cols_to_keep]

def normalize(self, data):
    df = self.arcgis_jsons_to_df(data)
    df.columns = [x.lower() for x in list(df)]
    df["location"] = (self.state_fips * 1000) + df["county"].astype(int)

    # 12025 is the OLD (retired in 1997) FIPS code for Dade County. It is now
    # known as Miami-Dade County with FIPS code 12086
    df.loc[:, "location"] = df["location"].replace(12025, 12086)

    crename = {
        "casesall": CMU(category="cases", measurement="cumulative", unit="people"),
        "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
        "newpos": CMU(
            category="unspecified_tests_positive",
            measurement="new",
            unit="test_encounters",
        ),
        "newneg": CMU(
            category="unspecified_tests_negative",
            measurement="new",
            unit="test_encounters",
        ),
        "newtested": CMU(
            category="unspecified_tests_total",
            measurement="new",
            unit="test_encounters",
        ),
    }

    out = (
        df.melt(id_vars=["location"], value_vars=crename.keys())
        .assign(dt=self._retrieve_dt("US/Eastern"), vintage=self._retrieve_vintage())
        .query("location not in (12998, 12999)")
        .dropna()
    )
    out.loc[:, "value"] = pd.to_numeric(out["value"])

    # Extract category information and add other variable context
    out = self.extract_CMU(out, crename)

    cols_to_keep = [
        "vintage",
        "dt",
        "location",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep]

def normalize(self, data: pd.DataFrame) -> pd.DataFrame:
    df = data.rename(columns={"Test Date": "dt", "County": "location_name"})

    crename = {
        "New Positives": CMU(
            category="unspecified_tests_positive",
            measurement="new",
            unit="test_encounters",
        ),
        "Total Number of Tests Performed": CMU(
            category="unspecified_tests_total",
            measurement="new",
            unit="test_encounters",
        ),
        "Cumulative Number of Positives": CMU(
            category="unspecified_tests_positive",
            measurement="cumulative",
            unit="test_encounters",
        ),
        "Cumulative Number of Tests Performed": CMU(
            category="unspecified_tests_total",
            measurement="cumulative",
            unit="test_encounters",
        ),
    }

    out = df.melt(
        id_vars=["dt", "location_name"], value_vars=crename.keys()
    ).dropna()
    out.loc[:, "value"] = pd.to_numeric(out["value"])

    # Determine the category and demographics of each observation
    out = self.extract_CMU(out, crename)
    out["vintage"] = self._retrieve_vintage()

    cols_to_keep = [
        "vintage",
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep]

def normalize(self, data):
    df = self.arcgis_jsons_to_df(data)
    df.columns = [x.lower() for x in list(df)]
    df["location"] = (self.state_fips * 1000) + df["county_fip"].astype(int)

    crename = {
        "totalcasecount": CMU(category="cases", measurement="cumulative", unit="people"),
        "totaldeathcount": CMU(category="deaths", measurement="cumulative", unit="people"),
        "total_pop_tested": CMU(
            category="pcr_tests_total",
            measurement="cumulative",
            unit="unique_people",
        ),
        "total_testing_vol": CMU(
            category="pcr_tests_total",
            measurement="cumulative",
            unit="specimens",
        ),
        "daily_testing_vol": CMU(
            category="pcr_tests_total",
            measurement="new",
            unit="specimens",
        ),
    }

    out = (
        df.melt(id_vars=["location"], value_vars=crename.keys())
        .assign(dt=self._retrieve_dt("US/Eastern"), vintage=self._retrieve_vintage())
        .dropna()
    )
    out.loc[:, "value"] = pd.to_numeric(out["value"])

    # Extract category information and add other variable context
    out = self.extract_CMU(out, crename)

    cols_to_keep = [
        "vintage",
        "dt",
        "location",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep]

def normalize(self, data):
    df = self.arcgis_jsons_to_df(data)
    df.columns = [x.lower() for x in list(df)]
    df["location"] = df["geoid"].astype(int)

    crename = {
        "positive": CMU(category="cases", measurement="cumulative", unit="people"),
        "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
        "neg_new": CMU(
            category="pcr_tests_negative",
            measurement="new",
            unit="unique_people",
        ),
        "pos_new": CMU(
            category="pcr_tests_positive",
            measurement="new",
            unit="unique_people",
        ),
        "test_new": CMU(
            category="pcr_tests_total",
            measurement="new",
            unit="unique_people",
        ),
    }

    out = (
        df.melt(id_vars=["location"], value_vars=crename.keys())
        .assign(dt=self._retrieve_dt("US/Central"), vintage=self._retrieve_vintage())
        .dropna()
    )
    out.loc[:, "value"] = pd.to_numeric(out["value"])

    # Extract category information and add other variable context
    out = self.extract_CMU(out, crename)

    cols_to_keep = [
        "vintage",
        "dt",
        "location",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep]

class MSCountyVaccine(StateDashboard):
    has_location = False
    source = "https://msdh.ms.gov/msdhsite/_static/resources/12130.pdf"
    location_type = "county"
    state_fips = int(us.states.lookup("Mississippi").fips)
    fetch_url = "https://msdh.ms.gov/msdhsite/_static/resources/12130.pdf"
    source_name = "Mississippi State Department of Health"

    variable_map = {
        "People Receiving at least One Dose**": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "People Fully Vaccinated***": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
        "Total Doses Administered": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
    }

    def fetch(self):
        return camelot.read_pdf(self.fetch_url, pages="2", flavor="stream")

    def normalize(self, data):
        # Clean up dataframe from PDF.
        data = data[0].df
        header = data.iloc[1, :].reset_index(drop=True)
        data = data.iloc[2:].reset_index(drop=True)
        data.columns = header.to_list()

        data = self._rename_or_add_date_and_location(
            data,
            location_name_column="County of Residence",
            location_names_to_drop=["Total"],
            timezone="US/Central",
        )
        data = self._reshape_variables(data, self.variable_map)
        data = data.replace({"location_name": {"Desoto": "DeSoto"}})

        return data.dropna(subset=["value"])

def normalize(self, data):
    # retrieve and convert excel object to df, re-structure df
    data = pd.ExcelFile(data.content)
    df = self._wrangle(data.parse("Overal Stats"))

    crename = {
        "Total Overall Number of Tests": CMU(
            category="unspecified_tests_total",
            measurement="cumulative",
            unit="test_encounters",
        ),
        "Total Number of DC Residents Tested": CMU(
            category="unspecified_tests_total",
            measurement="cumulative",
            unit="unique_people",
        ),
        "Total ICU Beds in Hospitals": CMU(
            category="icu_beds_capacity",
            measurement="current",
            unit="beds",
        ),
        "ICU Beds Available": CMU(
            category="icu_beds_available",
            measurement="current",
            unit="beds",
        ),
        "Total Reported Ventilators in Hospitals": CMU(
            category="ventilators_capacity",
            measurement="current",
            unit="people",
        ),
        "In-Use Ventilators in Hospitals": CMU(
            category="ventilators_in_use",
            measurement="current",
            unit="people",
        ),
        "Available Ventilators in Hospitals": CMU(
            category="ventilators_available",
            measurement="current",
            unit="people",
        ),
        "Total COVID-19 Patients in ICU": CMU(  # check
            category="icu_beds_in_use_covid",
            measurement="current",
            unit="beds",
        ),
    }

    # return df in correct format for put() with new/renamed cols
    return self._reshape(df, crename)

def normalize(self, data):
    data = pd.ExcelFile(data.content)
    df = self._wrangle(data.parse("Total Cases by Race"))

    crename = {
        "All": CMU(
            category="cases",
            measurement="cumulative",
            unit="people",
        ),
        "Unknown": CMU(
            category="cases",
            measurement="cumulative",
            unit="people",
            race="unknown",
        ),
        "White": CMU(
            category="cases", measurement="cumulative", unit="people", race="white"
        ),
        "Black/African American": CMU(
            category="cases", measurement="cumulative", unit="people", race="black"
        ),
        "Asian": CMU(
            category="cases", measurement="cumulative", unit="people", race="asian"
        ),
        "American Indian/Alaska Native": CMU(  # question?
            category="cases",
            measurement="cumulative",
            unit="people",
            race="native_american",
        ),
        "Native Hawaiin Pacific Islander": CMU(  # question?
            category="cases",
            measurement="cumulative",
            unit="people",
            race="pacific_islander",
        ),
        "Other/Multi-Racial": CMU(  # question?
            category="cases",
            measurement="cumulative",
            unit="people",
            race="multiple_other",
        ),
    }

    return self._reshape(df, crename)

class ALCountyVaccineSex(ALCountyVaccine):
    variables = {
        "F": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            sex="female",
        ),
        "M": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            sex="male",
        ),
        "U": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            sex="unknown",
        ),
    }
    variable_columns = ["RECIP_SEX"]
    sheet_num = 6
    service = "Vaccination_Dashboard_AGOL_v4_PUBLIC_VIEW"

    def fetch(self):
        return self.get_all_jsons(self.service, self.sheet_num, "7")

    def normalize(self, data):
        df = self.arcgis_jsons_to_df(data)
        df = df.pivot_table(
            index="CNTYFIPS", columns=self.variable_columns, values="COUNTS"
        ).reset_index()
        df = df.rename_axis(None, axis=1)
        df = self._rename_or_add_date_and_location(
            df, location_column="CNTYFIPS", timezone="US/Central"
        )
        df = self._reshape_variables(df, self.variables)

        locations_to_drop = [0, 99999]
        df = df.query("location != @locations_to_drop")
        return df

class MontanaStateVaccine(MontanaCountyVaccine):
    location_type = "state"
    has_location = True
    crename = {
        "Total_Montanans_Immunized": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
        "Total_Doses_Administered": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
    }

    def fetch(self):
        return self.get_all_jsons("COVID_Vaccination_PRD_View", 1, "")

    def normalize(self, data):
        df = (
            self.arcgis_jsons_to_df(data)
            .fillna(0)
            .rename(columns={"Report_Date": "dt"})
        )
        df["dt"] = df["dt"].map(self._esri_ts_to_dt)
        df["location"] = self.state_fips
        out = self._transform_df(df)

        # this scraper has some duplicates. Drop them here
        return out.drop_duplicates(
            subset=[
                "vintage",
                "dt",
                "location",
                "category",
                "measurement",
                "unit",
                "age",
                "race",
                "ethnicity",
                "sex",
            ],
            keep="last",
        )

class ALCountyVaccineRace(ALCountyVaccineSex):
    variable_columns = ["RACE_LBL"]
    sheet_num = 4
    variables = {
        "Native Hawaiian or other Pacific Islander": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="pacific_islander",
        ),
        "Two or More Races": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="multiple",
        ),
        "Other Race": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="other",
        ),
        "American Indian or Alaskan Native": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="ai_an",
        ),
        "White": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="white",
        ),
        "Unknown": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="unknown",
        ),
        "Black or African American": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="black",
        ),
        "Asian": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="asian",
        ),
    }

def pre_normalize(self, data) -> pd.DataFrame:
    df = self.arcgis_jsons_to_df(data)

    # Make column names all-lowercase
    df.columns = [x.lower() for x in list(df)]
    df = df.rename(columns={"county": "location_name"})

    crename = {
        "cases": CMU(category="cases", measurement="cumulative", unit="people"),
        "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
        # "probable": CMU(
        #     category="cases_probable",
        #     measurement="cumulative",
        #     unit="people",
        # ),
        "negative": CMU(
            category="pcr_tests_negative",
            measurement="cumulative",
            unit="unique_people",
        ),
        "confirmed": CMU(
            category="pcr_tests_positive",
            measurement="cumulative",
            unit="unique_people",
        ),
    }

    out = (
        df.melt(id_vars=["location_name"], value_vars=crename.keys())
        .assign(dt=self._retrieve_dt("US/Eastern"))
        .dropna()
        .replace(dict(location_name=dict(Mckean="McKean")))
    )
    out.loc[:, "value"] = pd.to_numeric(out["value"])

    # Extract category information and add other variable context
    out = self.extract_CMU(out, crename)

    return out.loc[:, self.cols_to_keep].query("location_name != 'Pennsylvania'")

class TexasTests(TexasCasesDeaths):
    """
    Get testing data on all TX counties from the TX ArcGIS dashboard
    """

    service: str = "DSHS_COVID19_TestData_Service"
    crename: Dict[str, CMU] = {
        "ViralTest": CMU(
            category="pcr_tests_total", measurement="cumulative", unit="specimens"
        ),
        "AntibodyTe": CMU(
            category="antibody_tests_total",
            measurement="cumulative",
            unit="specimens",
        ),
        "Cumulative": CMU(
            category="unspecified_tests_total",
            measurement="cumulative",
            unit="unknown",
        ),
    }

class WisconsinVaccineStateAge(TableauDashboard):
    has_location = False
    source = "https://www.dhs.wisconsin.gov/covid-19/vaccine-data.htm#summary"
    source_name = "Wisconsin Department of Health Services"
    state_fips = int(us.states.lookup("Wisconsin").fips)

    baseurl = "https://bi.wisconsin.gov/t/DHS"
    viewPath = (
        "VaccinesAdministeredtoWIResidents_16212677845310/VaccinatedWisconsin-County"
    )
    timezone = "US/Central"

    data_tableau_table = "Age vax/unvax County"
    # age does not report missing/unknown entries
    missing_tableau_table = ""
    location_name_col = "AGG(Geography TT)-alias"
    location_type = "state"

    # map wide-form column names into CMUs
    cmus = {
        "SUM(Initiation or completed count for TT)-alias": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        )
    }

    def _get_demographic(
        self, df: pd.DataFrame, demo: str, demo_col_name: str
    ) -> pd.DataFrame:
        """
        A shared "normalize" helper used by each demographic subclass in its
        own normalize method, to avoid copied code.

        Parameters
        ----------
        demo: the demographic as labeled according to CMU (age, sex, race, etc.)
        demo_col_name: the name of the demographic column in the fetched data

        Returns
        -------
        Normalized data in long format
        """
        # county names (converted to title case)
        df["location_name"] = df[self.location_name_col].str.title()
        # fix county names
        df = df.replace(
            {"location_name": {"St Croix": "St. Croix", "Fond Du Lac": "Fond du Lac"}}
        )

        # parse out data columns
        value_cols = list(set(df.columns) & set(self.cmus.keys()))
        assert len(value_cols) == len(self.cmus)

        df = (
            df.melt(id_vars=[demo_col_name, "location_name"], value_vars=value_cols)
            .dropna()
            .assign(
                dt=self._retrieve_dt(self.timezone),
                vintage=self._retrieve_vintage(),
                value=lambda x: pd.to_numeric(
                    x["value"].astype(str).str.replace(",", "")
                ),
            )
            .pipe(self.extract_CMU, cmu=self.cmus)
        )
        df[demo] = df[demo_col_name]
        return df.drop(["variable", demo_col_name], axis=1)

    def fetch(self) -> pd.DataFrame:
        if self.missing_tableau_table:
            # extract both the data table and the missing-data table
            dfs = [
                self.get_tableau_view().get(table)
                for table in [self.data_tableau_table, self.missing_tableau_table]
            ]
            return pd.concat(dfs)
        else:
            return self.get_tableau_view()[self.data_tableau_table]

    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        df = self._get_demographic(df, "age", "Age-value")
        return df.replace({"age": {"65+": "65_plus"}})

class WisconsinArcGIS(ArcGIS, ABC):
    """
    ArcGIS scraper that retrieves dashboard information for the state of
    Wisconsin (which has its own self-hosted ArcGIS instance)
    """

    has_location = True
    state_fips = int(us.states.lookup("Wisconsin").fips)
    source = "https://www.dhs.wisconsin.gov/covid-19/data.htm"
    location_type: str
    SERVICE: str = "DHS_COVID19/COVID19_WI"
    SHEET: int

    crename = {
        "positive": CMU(
            category="cases",
            measurement="cumulative",
            unit="people",
        ),
        "negative": CMU(
            category="pcr_tests_negative",
            measurement="cumulative",
            unit="unique_people",
        ),
        "pos_new": CMU(
            category="cases",
            measurement="new",
            unit="people",
        ),
        "neg_new": CMU(
            category="pcr_tests_negative",
            measurement="new",
            unit="unique_people",
        ),
        "test_new": CMU(
            category="pcr_tests_total",
            measurement="new",
            unit="unique_people",
        ),
        "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
        "dth_new": CMU(category="deaths", measurement="new", unit="people"),
        "hosp_yes": CMU(
            category="hospital_beds_in_use_covid",
            measurement="cumulative",
            unit="people",
        ),
        # sex
        "pos_fem": case_cmu(sex="female"),
        "pos_male": case_cmu(sex="male"),
        "dths_fem": deaths_cmu(sex="female"),
        "dths_male": deaths_cmu(sex="male"),
        # age
        "pos_0_9": case_cmu(age="0-9"),
        "pos_10_19": case_cmu(age="10-19"),
        "pos_20_29": case_cmu(age="20-29"),
        "pos_30_39": case_cmu(age="30-39"),
        "pos_40_49": case_cmu(age="40-49"),
        "pos_50_59": case_cmu(age="50-59"),
        "pos_60_69": case_cmu(age="60-69"),
        "pos_70_79": case_cmu(age="70-79"),
        "pos_80_89": case_cmu(age="80-89"),
        "pos_90": case_cmu(age="90_plus"),
        "dths_0_9": deaths_cmu(age="0-9"),
        "dths_10_19": deaths_cmu(age="10-19"),
        "dths_20_29": deaths_cmu(age="20-29"),
        "dths_30_39": deaths_cmu(age="30-39"),
        "dths_40_49": deaths_cmu(age="40-49"),
        "dths_50_59": deaths_cmu(age="50-59"),
        "dths_60_69": deaths_cmu(age="60-69"),
        "dths_70_79": deaths_cmu(age="70-79"),
        "dths_80_89": deaths_cmu(age="80-89"),
        "dths_90": deaths_cmu(age="90_plus"),
        # race and ethnicity
        "pos_aian": case_cmu(race="ai_an"),
        "pos_asn": case_cmu(race="asian"),
        "pos_blk": case_cmu(race="black"),
        "pos_wht": case_cmu(race="white"),
        "pos_mltoth": case_cmu(race="multiple_other"),
        "pos_unk": case_cmu(race="unknown"),
        "pos_e_hsp": case_cmu(ethnicity="hispanic"),
        "pos_e_nhsp": case_cmu(ethnicity="non-hispanic"),
        "pos_e_unk": case_cmu(ethnicity="unknown"),
        "dths_aian": deaths_cmu(race="ai_an"),
        "dths_asn": deaths_cmu(race="asian"),
        "dths_blk": deaths_cmu(race="black"),
        "dths_wht": deaths_cmu(race="white"),
        "dths_mltoth": deaths_cmu(race="multiple_other"),
        "dths_unk": deaths_cmu(race="unknown"),
        "dths_e_hsp": deaths_cmu(ethnicity="hispanic"),
        "dths_e_nhsp": deaths_cmu(ethnicity="non-hispanic"),
        "dths_e_unk": deaths_cmu(ethnicity="unknown"),
    }

    @abstractmethod
    def get_location(self, df: pd.DataFrame):
        pass

    def fetch(self):
        return self.get_all_jsons(self.SERVICE, self.SHEET, "server")

    def arcgis_query_url(
        self,
        service="DHS_COVID19/COVID19_WI",
        sheet=1,
        srvid="server",
    ):
        out = f"https://dhsgis.wi.gov/{srvid}/rest/services/{service}/MapServer/{sheet}/query"
        return out

    def normalize(self, data):
        df = self.arcgis_jsons_to_df(data)
        df.columns = [x.lower() for x in list(df)]
        df["location"] = self.get_location(df)

        value_cols = list(set(df.columns) & set(self.crename.keys()))
        out = (
            df.melt(id_vars=["location"], value_vars=value_cols)
            .assign(
                dt=self._retrieve_dt("US/Central"), vintage=self._retrieve_vintage()
            )
            .dropna()
        )
        out.loc[:, "value"] = pd.to_numeric(out["value"])

        # Extract category information and add other variable context
        out = self.extract_CMU(out, self.crename)

        cols_to_keep = [
            "vintage",
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]

        return out.loc[:, cols_to_keep]

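# The class above is abstract: a concrete Wisconsin scraper only picks a layer
# and defines how to build a FIPS code from it. A minimal sketch of a
# county-level subclass, for illustration only -- the SHEET index and the
# "geoid" column name are assumptions, not taken from the actual service.
class WisconsinCountiesSketch(WisconsinArcGIS):
    location_type = "county"
    SHEET = 1  # assumed index of the county-level layer

    def get_location(self, df: pd.DataFrame):
        # assumes the layer exposes a 5-digit county FIPS in a "geoid" column
        return df["geoid"].astype(int)
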
def case_cmu(**kw):
    kwargs = dict(category="cases", measurement="cumulative", unit="people")
    kwargs.update(kw)
    return CMU(**kwargs)

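# WisconsinArcGIS.crename above also calls a deaths_cmu helper that is not
# shown in this section. A minimal sketch, assuming it mirrors case_cmu with
# the "deaths" category:
def deaths_cmu(**kw):
    kwargs = dict(category="deaths", measurement="cumulative", unit="people")
    kwargs.update(kw)
    return CMU(**kwargs)
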
def normalize(self, data):
    # retrieve data and convert dataframe structure
    data = pd.ExcelFile(data.content)
    df_age = self._wrangle(data.parse("Lives Lost by Age"))
    df_sex = self._wrangle(data.parse("Lives Lost by Sex"))
    df_race = self._wrangle(data.parse("Lives Lost by Race"))

    # maps for each df
    crename_age = {
        "<19": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="0-19",
        ),
        "20-29": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="20-29",
        ),
        "30-39": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="30-39",
        ),
        "40-49": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="40-49",
        ),
        "50-59": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="50-59",
        ),
        "60-69": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="60-69",
        ),
        "70-79": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="70-79",
        ),
        "80+": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="80_plus",
        ),
    }
    crename_sex = {
        "Female": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            sex="female",
        ),
        "Male": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            sex="male",
        ),
    }
    crename_race = {
        "Asian": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="asian",
        ),
        "Black/African American": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="black",
        ),
        "Hispanic/Latinx": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="all",
            ethnicity="hispanic",
        ),
        "Non-Hispanic White": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="white",
            ethnicity="non-hispanic",
        ),
        "Unknown": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="unknown",
        ),
        "All": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
        ),
    }

    # rename and add columns according to map
    df_age = self._reshape(df_age, crename_age)
    df_sex = self._reshape(df_sex, crename_sex)
    df_race = self._reshape(df_race, crename_race)

    # combine all into one df
    df = pd.DataFrame()
    df = (
        df.append(df_age, ignore_index=True)
        .append(df_sex, ignore_index=True)
        .append(df_race, ignore_index=True)
    )

    # we have two dt that are pretty much identical
    # they are:
    #   numpy.datetime64('2020-06-07T00:00:00.100000000')
    #   numpy.datetime64('2020-06-07T00:00:00.000000000')
    # We drop one of them
    bad = df["dt"] == np.datetime64("2020-06-07T00:00:00.100000000")
    return df.loc[~bad, :]

"""Commonly used variables""" from can_tools.scrapers import CMU INITIATING_VACCINATIONS_ALL = CMU( category="total_vaccine_initiated", measurement="cumulative", unit="people", ) FULLY_VACCINATED_ALL = CMU( category="total_vaccine_completed", measurement="cumulative", unit="people", ) TOTAL_DOSES_ADMINISTERED_ALL = CMU( category="total_vaccine_doses_administered", measurement="cumulative", unit="doses", ) PERCENTAGE_PEOPLE_INITIATING_VACCINE = CMU( category="total_vaccine_initiated", measurement="current", unit="percentage", ) PERCENTAGE_PEOPLE_COMPLETING_VACCINE = CMU( category="total_vaccine_completed", measurement="current",
def normalize(self, resjson: dict) -> pd.DataFrame:
    # Extract components we care about from json
    foo = resjson["results"][0]["result"]["data"]
    descriptor = foo["descriptor"]["Select"]
    data = foo["dsr"]["DS"][0]["PH"][1]["DM1"]

    # Build dict of dicts with relevant info
    col_names = [x["N"] for x in data[0]["S"]]
    col_mapping = {x["Value"]: x["Name"] for x in descriptor}

    # Iterate through all of the rows and store relevant data
    data_rows = []
    for row in data:
        data_rows.append(row["C"])

    # Dump records into a DataFrame
    df = (
        pd.DataFrame.from_records(data_rows, columns=col_names)
        .rename(columns=col_mapping)
        .rename(columns={"county": "location_name"})
    )

    # Reshape
    crename = {
        "doses_initiated": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "doses_completed": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
        "doses_administered": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
    }
    out = df.melt(id_vars=["location_name"], value_vars=crename.keys())

    # Add CMU, dt, vintage
    out = self.extract_CMU(out, crename)
    out["dt"] = self._retrieve_dt("US/Pacific")
    out["vintage"] = self._retrieve_vintage()

    cols_to_keep = [
        "vintage",
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep]

def normalize(self, resjson: dict) -> pd.DataFrame:
    # Extract components we care about from json
    foo = resjson["results"][0]["result"]["data"]
    descriptor = foo["descriptor"]["Select"]
    data = foo["dsr"]["DS"][0]["PH"][1]["DM1"]

    # Build dict of dicts with relevant info
    col_mapping = {x["Value"]: x["Name"] for x in descriptor}
    col_keys = list(col_mapping.keys())

    # Iterate through all of the rows and store relevant data
    data_rows = []
    row_names = [col_mapping[desc["N"]] for desc in data[0]["S"]]
    for record in data:
        Crecord = record["C"]
        if "County" not in str(Crecord[0]):
            continue
        data_rows.append(record["C"])

    # Dump records into a DataFrame
    df = pd.DataFrame.from_records(data_rows, columns=row_names)

    # Strip the "County, ME" suffix from county names
    df["location_name"] = df["county"].str.replace("County, ME", "").str.strip()

    # Convert fractions into percentages
    for col in [
        "total_vaccine_initiated_percent",
        "total_vaccine_completed_percent",
    ]:
        df.loc[:, col] = 100 * df.loc[:, col]

    # Reshape
    crename = {
        "total_vaccine_administered": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
        "total_vaccine_initiated": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "total_vaccine_completed": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
        "total_vaccine_initiated_percent": CMU(
            category="total_vaccine_initiated",
            measurement="current",
            unit="percentage",
        ),
        "total_vaccine_completed_percent": CMU(
            category="total_vaccine_completed",
            measurement="current",
            unit="percentage",
        ),
    }
    out = df.melt(id_vars=["location_name"], value_vars=crename.keys()).dropna()

    # Add CMU, dt, vintage
    out = self.extract_CMU(out, crename)
    out["dt"] = self._retrieve_dt("US/Eastern")
    out["vintage"] = self._retrieve_vintage()

    cols_to_keep = [
        "vintage",
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep]

def normalize(self, data):
    df = self.arcgis_jsons_to_df(data)
    df.columns = [x.lower() for x in list(df)]
    df["location"] = self.state_fips

    # Keep only the most recent report
    df = (
        df.sort_values(by=["reportdate"], ascending=True)
        .tail(1)
        .reset_index(drop=True)
    )
    df = df[
        [
            "location",
            "totalcases",
            "casedelta",
            "negativetests",
            "negdelta",
            "bedstotal",
            "bedsicu",
            "bedsdelta",
            "deaths",
            "deathsdelta",
            "totaltests",
            "testsdelta",
            "postestpercent",
        ]
    ]

    crename = {
        "totalcases": CMU(category="cases", measurement="cumulative", unit="people"),
        "casedelta": CMU(category="cases", measurement="new", unit="people"),
        "negativetests": CMU(
            category="pcr_tests_negative",
            measurement="cumulative",
            unit="specimens",
        ),
        "negdelta": CMU(
            category="pcr_tests_negative", measurement="new", unit="specimens"
        ),
        "bedstotal": CMU(
            category="hospital_beds_in_use_covid",
            measurement="current",
            unit="beds",
        ),
        "bedsicu": CMU(
            category="icu_beds_in_use_covid", measurement="current", unit="beds"
        ),
        "bedsdelta": CMU(
            category="hospital_beds_in_use_covid", measurement="new", unit="beds"
        ),
        "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
        "deathsdelta": CMU(category="deaths", measurement="new", unit="people"),
        "totaltests": CMU(
            category="pcr_tests_total",
            measurement="cumulative",
            unit="specimens",
        ),
        "testsdelta": CMU(
            category="pcr_tests_total",
            measurement="new",
            unit="specimens",
        ),
        "postestpercent": CMU(
            category="pcr_tests_positive",
            measurement="rolling_average_7_day",
            unit="percentage",
        ),
    }

    out = (
        df.melt(id_vars=["location"], value_vars=crename.keys())
        .assign(dt=self._retrieve_dt("US/Eastern"), vintage=self._retrieve_vintage())
        .dropna()
    )
    out.loc[:, "value"] = pd.to_numeric(out["value"])

    # Extract category information and add other variable context
    out = self.extract_CMU(out, crename)

    cols_to_keep = [
        "vintage",
        "dt",
        "location",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep]

def normalize(self, resjson: dict) -> pd.DataFrame:
    # Extract components we care about from json
    foo = resjson["results"][0]["result"]["data"]
    descriptor = foo["descriptor"]["Select"]
    data = foo["dsr"]["DS"][0]["PH"][1]["DM1"]

    # Build dict of dicts with relevant info
    col_mapping = {x["Value"]: x["Name"] for x in descriptor}
    col_keys = list(col_mapping.keys())

    # Iterate through all of the rows and store relevant data
    data_rows = []
    row_names = [col_mapping[desc["N"]] for desc in data[0]["S"]]
    for record in data:
        data_rows.append(record["C"])

    # Dump records into a DataFrame
    df = pd.DataFrame.from_records(data_rows, columns=row_names)

    # Title case and remove the word county
    df["location_name"] = (
        df["county"].str.title().str.replace("County", "").str.strip()
    )
    df = df.query("~location_name.str.contains('Unknown')")

    # Rename certain counties
    df = df.replace(
        {
            "location_name": {
                "Lac Qui Parle": "Lac qui Parle",
                "Mcleod": "McLeod",
                "Lake Of The Woods": "Lake of the Woods",
            }
        }
    )

    # Turn strings into numbers
    df["total_vaccine_initiated"] = pd.to_numeric(
        df["total_vaccine_initiated_display"].str.replace("L", "")
    )
    df["total_vaccine_completed"] = pd.to_numeric(
        df["total_vaccine_completed_display"].str.replace("L", "")
    )

    # Reshape
    crename = {
        "total_vaccine_initiated": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "total_vaccine_completed": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
    }
    out = df.melt(id_vars=["location_name"], value_vars=crename.keys())

    # Add CMU, dt, vintage
    out = self.extract_CMU(out, crename)
    out["dt"] = self._retrieve_dt("US/Central")
    out["vintage"] = self._retrieve_vintage()

    cols_to_keep = [
        "vintage",
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep].dropna()

def normalize(self, resjson):
    df, vds = self.normalize_preprocess(resjson)

    # Replace indexes with values
    county_replacer = {i: vd for i, vd in enumerate(vds["D0"])}
    coverage_replacer = {i: vd for i, vd in enumerate(vds["D1"])}
    dem_replacer = {i: vd for i, vd in enumerate(vds["D2"])}

    df = (
        df.query("coverage < 2")
        .replace(
            {
                "county": county_replacer,
                "coverage": coverage_replacer,
                self.demographic: dem_replacer,
            }
        )
        .rename(
            columns={
                "county": "location_name",
                "coverage": "variable",
                "count": "value",
            }
        )
        .replace(
            {
                "variable": {
                    "Partially Covered": "total_vaccine_initiated",
                    "Fully Covered": "total_vaccine_completed",
                },
                self.demographic: self.value_renamer,
            }
        )
        .pivot_table(
            index=["location_name", self.demographic],
            columns="variable",
            values="value",
        )
        .reset_index()
    )
    df = self.clean_pa_location_names(df)

    # Initiated is not at least one dose for PA
    df["total_vaccine_initiated"] = df.eval(
        "total_vaccine_initiated + total_vaccine_completed"
    )
    df = df.melt(id_vars=["location_name", self.demographic])

    # Reshape
    crename = {
        "total_vaccine_initiated": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "total_vaccine_completed": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
    }
    categories = [
        "category",
        "measurement",
        "unit",
        "age",
        "sex",
        "race",
        "ethnicity",
    ]
    categories.remove(self.demographic)

    return self.normalize_postprocess(df, categories, crename)

def normalize(self, resjson):
    # Extract components we care about from json
    foo = resjson["results"][0]["result"]["data"]
    descriptor = foo["descriptor"]["Select"]
    data = foo["dsr"]["DS"][0]["PH"][0]["DM0"]

    # Build dict of dicts with relevant info
    col_mapping = {x["Value"]: x["Name"] for x in descriptor}
    col_keys = list(col_mapping.keys())

    # Iterate through all of the rows and store relevant data
    data_rows = []
    for record in data:
        flat_record = flatten_dict(record)
        row = {}
        for k in col_keys:
            flat_record_key = [frk for frk in flat_record.keys() if k in frk]
            if len(flat_record_key) > 0:
                row[col_mapping[k]] = flat_record[flat_record_key[0]]
        data_rows.append(row)

    # Dump records into a DataFrame
    df = pd.DataFrame.from_records(data_rows).dropna()
    df = df.query("location_name != '' & location_name != 'Out-of-State'")

    # Initiated is not at least one dose for PA -- it is a count of
    # individuals that are currently partially covered by a vaccine
    df["total_vaccine_initiated"] = df.eval(
        "total_vaccine_initiated + total_vaccine_completed"
    )

    # Make sure McKean follows capitalization in db
    df = df.replace({"location_name": {"Mckean": "McKean"}})

    # Reshape
    crename = {
        "total_vaccine_initiated": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "total_vaccine_completed": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
    }
    out = df.melt(id_vars=["location_name"])

    # Add CMU, dt, vintage
    out = self.extract_CMU(out, crename)
    out["dt"] = self._retrieve_dt("US/Eastern")
    out["vintage"] = self._retrieve_vintage()

    cols_to_keep = [
        "vintage",
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep]

class TXVaccineCountyAge(TexasVaccineParent):
    location_type = "county"
    has_location = False

    cmus = {
        "Doses Administered": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
        "People Vaccinated with at least One Dose": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "People Fully Vaccinated": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
    }
    cmu_id_vars = ["age"]
    sheet_name = "By County, Age"

    replacers = {
        "age": {
            "16-49 years": "16-49",
            "50-64 years": "50-64",
            "65-79 years": "65-79",
            "80+ years": "80_plus",
            "Unknown": "unknown",
            "Total": "all",
        },
        "race": {
            "American Indian/Alaskan Native": "ai_an",
            "Asian": "asian",
            "Black": "black",
            "Multiple Races": "multiple",
            "Native Hawaiian/Other Pacific Islander": "pacific_islander",
            "Other": "other",
            "Unknown Race": "unknown",
            "Unknown": "unknown",
            "White": "white",
        },
        "ethnicity": {
            "Hispanic": "hispanic",
            "Not Hispanic": "non-hispanic",
            "Unknown": "unknown",
        },
        "sex": {
            "Female": "female",
            "Male": "male",
            "Unknown": "unknown",
        },
    }

    @property
    def cmu_columns(self):
        return list(
            set(["category", "measurement", "unit", "age", "race", "sex", "ethnicity"])
            - set(self.cmu_id_vars)
        )

    def normalize(self, data) -> pd.DataFrame:
        # Read in data, set location, and drop totals
        non_counties = ["Other", "Grand Total"]
        df = (
            self.excel_to_dataframe(data, self.sheet_name)
            .rename(columns=str.strip)
            .rename(
                columns={
                    "Age Group": "age",
                    "Race/Ethnicity": "race",
                    "County Name": "location_name",
                }
            )
            .melt(
                id_vars=["dt", "location_name"] + self.cmu_id_vars,
                value_vars=list(self.cmus.keys()),
            )
            .replace(self.replacers)
            .pipe(self.extract_CMU, cmu=self.cmus, columns=self.cmu_columns)
            .pipe(lambda x: x.loc[~x["location_name"].isin(["*Other", "Total"]), :])
            .assign(vintage=self._retrieve_vintage())
            .query("location_name not in @non_counties")
            .dropna(subset=["value"])
        )
        return df

class TexasCasesDeaths(ArcGIS):
    """
    Get cases and deaths data on all TX counties from the TX ArcGIS dashboard
    """

    ARCGIS_ID = "ACaLB9ifngzawspq"
    source = (
        "https://txdshs.maps.arcgis.com/apps/opsdashboard/index.html"
        "#/ed483ecd702b4298ab01e8b9cafc8b83"
    )
    state_fips = int(us.states.lookup("Texas").fips)
    has_location = False
    service: str = "DSHS_COVID19_Cases_Service"
    crename: Dict[str, CMU] = {
        "Positive": CMU(category="cases", measurement="cumulative", unit="people"),
        "Fatalities": CMU(category="deaths", measurement="cumulative", unit="people"),
    }
    location_type = "county"

    def fetch(self) -> Any:
        return self.get_all_jsons(self.service, 0, 5)

    def normalize(self, data: Any) -> pd.DataFrame:
        """
        Normalize county level cases and deaths data

        Returns
        -------
        df: pd.DataFrame
            pandas DataFrame containing data on cases and deaths for all
            counties in TX
        """
        # Load data and rename county/convert date
        df = self.arcgis_jsons_to_df(data).rename(columns={"County": "location_name"})
        df["dt"] = self._retrieve_dt("US/Central")

        # Put into long format
        out = df.melt(
            id_vars=["location_name", "dt"], value_vars=self.crename.keys()
        ).dropna()
        out["value"] = out["value"].astype(int)
        out["vintage"] = self._retrieve_vintage()

        # Extract category information and add other variable context
        out = self.extract_CMU(out, self.crename)

        cols_to_keep = [
            "vintage",
            "dt",
            "location_name",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]

        return out.loc[:, cols_to_keep]

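# Usage sketch for the Texas ArcGIS scrapers: TexasTests (defined earlier)
# inherits fetch/normalize from TexasCasesDeaths and only swaps the service
# name and the crename map. Assumes the ArcGIS base class can be constructed
# with no arguments; adjust if your runner passes configuration.
if __name__ == "__main__":
    scraper = TexasTests()
    raw = scraper.fetch()          # list of JSON pages from the ArcGIS service
    tidy = scraper.normalize(raw)  # long-format DataFrame with CMU columns
    print(tidy.head())
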
def normalize(self, data):
    data = (
        self.arcgis_jsons_to_df(data)
        .fillna(0)
        .rename(columns={"County": "location_name"})
    )
    data = self._get_clean_data(data)
    newest_date = data["dt"].max()

    # get cumulative data
    df = data.melt(
        id_vars=["dt", "location_name"], value_vars=self.var_columns
    ).dropna()

    # sum total values by county for first and second dose
    # NOTE: This currently uses a sum to get the total number of
    #       vaccines administered -- This depends on the fact
    #       that we're only working with sex for now
    df1 = (
        df.query('variable.str.contains("dose_1")')
        .groupby("location_name", as_index=False)["value"]
        .sum()
    )
    df1["category"] = "total_vaccine_initiated"
    df2 = (
        df.query('variable.str.contains("dose_2")')
        .groupby("location_name", as_index=False)["value"]
        .sum()
    )
    df2["category"] = "total_vaccine_completed"

    # combine dfs and fill needed columns
    df = pd.concat([df1, df2], axis=0, ignore_index=True)
    cumulative_df = self._populate_cols(df, newest_date)

    # get weekly snapshots
    # create "total" 1st and 2nd dose columns by week
    weekly_df = self._get_clean_data(data)
    weekly_df["dose_1"] = (
        weekly_df["dose_1_male"]
        + weekly_df["dose_1_female"]
        + weekly_df["dose_1_sex_unknown"]
    )
    weekly_df["dose_2"] = (
        weekly_df["dose_2_male"]
        + weekly_df["dose_2_female"]
        + weekly_df["dose_2_sex_unknown"]
    )

    crename = {
        "dose_1": CMU(
            category="total_vaccine_initiated",
            measurement="new_7_day",
            unit="people",
        ),
        "dose_2": CMU(
            category="total_vaccine_completed",
            measurement="new_7_day",
            unit="people",
        ),
        "dose_1_male": CMU(
            category="total_vaccine_initiated",
            measurement="new_7_day",
            unit="people",
            sex="male",
        ),
        "dose_2_male": CMU(
            category="total_vaccine_completed",
            measurement="new_7_day",
            unit="people",
            sex="male",
        ),
        "dose_1_female": CMU(
            category="total_vaccine_initiated",
            measurement="new_7_day",
            unit="people",
            sex="female",
        ),
        "dose_2_female": CMU(
            category="total_vaccine_completed",
            measurement="new_7_day",
            unit="people",
            sex="female",
        ),
        "dose_1_sex_unknown": CMU(
            category="total_vaccine_initiated",
            measurement="new_7_day",
            unit="people",
            sex="unknown",
        ),
        "dose_2_sex_unknown": CMU(
            category="total_vaccine_completed",
            measurement="new_7_day",
            unit="people",
            sex="unknown",
        ),
    }

    weekly_df = weekly_df.melt(
        id_vars=["location_name", "dt"], value_vars=crename.keys()
    ).dropna()
    weekly_df["value"] = weekly_df["value"].astype(int)
    weekly_df["vintage"] = self._retrieve_vintage()

    # Extract category information and add other variable context
    weekly_df = self.extract_CMU(weekly_df, crename)
    weekly_df = weekly_df.drop(columns={"variable"})

    return pd.concat([cumulative_df, weekly_df], ignore_index=True)

def normalize(self, data):
    # read in data, remove extra header cols, rename column names
    dfs = []
    for el in data:
        dfs.append(self._truncate_data(el.df))
    df = pd.concat(dfs)

    # Ignore data from unknown regions (no fips code), out-of-state rows, and
    # statewide totals; then fix naming for problem counties
    df = df.query(
        "location_name != 'Unknown' &"
        "location_name != 'Out-Of-State' &"
        "location_name != 'Total'"
    )
    df = df.replace({"location_name": {"Desoto": "DeSoto", "Dade": "Miami-Dade"}})

    # Make all columns (except location) numeric
    for col in df.columns:
        if col == "location_name":
            continue
        else:
            df.loc[:, col] = pd.to_numeric(df.loc[:, col].str.replace(",", ""))

    # First dose and second dose need to be added together to get the number
    # of people with at least one dose (vaccine initiated)
    df.loc[:, "first_dose_total"] = df.eval(
        "first_dose_total + series_complete_total"
    )

    crename = {
        "first_dose_new": CMU(
            category="total_vaccine_initiated",
            measurement="new",
            unit="people",
        ),
        "series_complete_new": CMU(
            category="total_vaccine_completed",
            measurement="new",
            unit="people",
        ),
        "total_people_vaccinated_new": CMU(
            category="total_vaccine_doses_administered",
            measurement="new",
            unit="doses",
        ),
        "first_dose_total": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "series_complete_total": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
        "total_people_vaccinated_total": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
    }

    out = df.melt(id_vars=["location_name"], value_vars=crename.keys()).dropna()
    out = self.extract_CMU(out, crename)
    out["vintage"] = self._retrieve_vintage()
    out["dt"] = self._get_date()

    cols_to_keep = [
        "vintage",
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    return out.loc[:, cols_to_keep]

def normalize(self, data):
    # retrieve data and convert dataframe structure
    data = pd.ExcelFile(data.content)
    df_age = self._wrangle(data.parse("Lives Lost by Age"))
    df_sex = self._wrangle(data.parse("Lives Lost by Sex"))
    df_race = self._wrangle(data.parse("Lives Lost by Race"))

    # maps for each df
    crename_age = {
        "<19": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="0-19",
        ),
        "20-29": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="20-29",
        ),
        "30-39": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="30-39",
        ),
        "40-49": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="40-49",
        ),
        "50-59": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="50-59",
        ),
        "60-69": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="60-69",
        ),
        "70-79": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="70-79",
        ),
        "80+": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            age="80_plus",
        ),
    }
    crename_sex = {
        "Female": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            sex="female",
        ),
        "Male": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            sex="male",
        ),
    }
    crename_race = {
        "Asian": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="asian",
        ),
        "Black/African American": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="black",
        ),
        "Hispanic/Latinx": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="hispanic",
        ),
        "Non-Hispanic White": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="white",
        ),
        "Unknown": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
            race="unknown",
        ),
        "All": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
        ),
    }

    # rename and add columns according to map
    df_age = self._reshape(df_age, crename_age)
    df_sex = self._reshape(df_sex, crename_sex)
    df_race = self._reshape(df_race, crename_race)

    # combine all into one df
    df = pd.DataFrame()
    df = (
        df.append(df_age, ignore_index=True)
        .append(df_sex, ignore_index=True)
        .append(df_race, ignore_index=True)
    )

    return df