def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Average life expectancy per county-level subregion code.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        One row per ``subregion2_code`` with the mean of the remaining
        columns and a constant ``country_code`` of "US".
    """
    column_adapter = {
        "e(0)": "life_expectancy",
        "STATE2KX": "state_code",
        "CNTY2KX": "county_code",
    }
    records = table_rename(dataframes[0], column_adapter, drop=True)

    # The FIPS subregion code is the state code (width 2) followed by the
    # county code (width 3), both formatted as strings first
    for column, width in (("state_code", 2), ("county_code", 3)):
        records[column] = records[column].apply(
            lambda code, width=width: numeric_code_as_string(code, width))
    records["subregion2_code"] = records["state_code"] + records["county_code"]

    # The source is finer-grained than counties, so a plain mean over each
    # county is used as a crude estimate
    per_county = records.drop(columns=["state_code", "county_code"])
    per_county = per_county.groupby("subregion2_code").mean()
    per_county = per_county.reset_index()

    # Every record belongs to the same country
    per_county["country_code"] = "US"
    return per_county
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Merge the confirmed and deceased tables into keyed municipality records.

    Args:
        dataframes: Input tables; the "confirmed" and "deceased" entries are read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        The municipality records concatenated with the output of
        ``_extract_cities``.
    """
    confirmed = table_rename(
        dataframes["confirmed"],
        {
            "Fecha": "date",
            "Casos confirmados": "new_confirmed",
            "Codigo region": "subregion1_code",
            "Codigo comuna": "subregion2_code",
        },
        drop=True,
    )
    deceased = table_rename(
        dataframes["deceased"],
        {
            "Fecha": "date",
            "Casos fallecidos": "total_deceased",
            "Codigo region": "subregion1_code",
            "Codigo comuna": "subregion2_code",
        },
        drop=True,
    )
    merged = table_multimerge([confirmed, deceased], how="outer")

    # Dates are kept as plain strings
    merged["date"] = merged["date"].astype(str)

    # Region codes become fixed-width strings: 2 for regions, 5 for municipalities
    for column, width in (("subregion1_code", 2), ("subregion2_code", 5)):
        merged[column] = merged[column].apply(
            lambda code, width=width: numeric_code_as_string(code, width)
        )

    # Swap the numeric region code for its ISO counterpart
    merged["subregion1_code"] = merged["subregion1_code"].apply(_SUBREGION1_CODE_MAP.get)

    # Cities are derived from the municipality rows before keys are attached
    cities = _extract_cities(merged)

    # The record key follows directly from both subregion codes
    region_part = merged["subregion1_code"]
    municipality_part = merged["subregion2_code"]
    merged["key"] = "CL_" + region_part + "_" + municipality_part

    # Rows missing either code are bogus and removed
    merged.dropna(subset=["subregion1_code", "subregion2_code"], inplace=True)

    return concat([merged, cities])
def _process_partition(cases: DataFrame) -> DataFrame:
    """Turn a partition of vaccination case records into keyed time series.

    Args:
        cases: Case-level records; the input frame is copied, not modified.

    Returns:
        Country, state and municipality level records concatenated in that order.
    """
    cases = cases.copy()

    # The first character of the dose information tells whether the dose was
    # the first (partial immunization) or the second (full immunization)
    vaccinated_col = "date_new_persons_vaccinated"
    fully_vaccinated_col = "date_new_persons_fully_vaccinated"
    cases[vaccinated_col] = None
    cases[fully_vaccinated_col] = None
    dose_prefix = cases["_dose_information"].str.strip().str.slice(0, 1)
    for prefix, target in (("1", vaccinated_col), ("2", fully_vaccinated_col)):
        dose_mask = dose_prefix == prefix
        cases.loc[dose_mask, target] = cases.loc[
            dose_mask, "date_new_vaccine_doses_administered"]

    # Underscore-prefixed columns are of no further use
    kept_columns = [name for name in cases.columns if not name.startswith("_")]
    cases = cases[kept_columns]

    # Region codes must be strings
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda code: numeric_code_as_string(code, 6))

    # Ages become ints; sex is translated (no "other" sex/gender reported)
    sex_labels = {"m": "male", "f": "female"}
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply(sex_labels.get)

    # Switch to the time series layout
    data = convert_cases_to_time_series(
        cases, index_columns=["subregion1_code", "subregion2_code"])

    # Keep only the date portion and convert it to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(
        lambda value: datetime_isoformat(value, "%Y-%m-%d"))

    # Bogus records: missing dates, dates before 2020 or beyond tomorrow
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Country level aggregation
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # State level aggregation
    state_index = ["date", "subregion1_code", "age", "sex"]
    state = data.drop(columns=["subregion2_code"])
    state = state.groupby(state_index).sum().reset_index()
    state["key"] = "BR_" + state["subregion1_code"]

    # Municipality keys come from subregion1 + subregion2
    has_municipality = data["subregion2_code"].notna() & (
        data["subregion2_code"] != "")
    data = data[has_municipality]
    data["key"] = (
        "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"])

    return concat([country, state, data])
def _process_state(data: DataFrame) -> DataFrame:
    """Derive daily county values from 7-day rolling averages.

    The input frame is modified in place (columns added, dropped and rows
    re-sorted) before being passed through ``table_rename``.

    Args:
        data: Records with date, state, FIPS code and rolling-average columns.

    Returns:
        The result of renaming the daily columns to their output names.
    """
    data["date"] = data["date"].apply(lambda value: str(value)[:10])
    data["subregion2_code"] = data["fips_code"].apply(
        lambda code: numeric_code_as_string(code, 5))
    data["key"] = "US_" + data["state"] + "_" + data["subregion2_code"]

    unused_columns = [
        "subregion2_code",
        "state",
        "fips_code",
        "county",
        "report_date_window_end",
        "report_date_window_start",
    ]
    data.drop(columns=unused_columns, inplace=True)

    # Diffs are computed below, which requires properly sorted data
    data.sort_values(["key", "date"], inplace=True)

    # Map each rolling average column name to its daily counterpart
    rolling_suffix = "_7_day_rolling_average"
    daily_names = (
        "new_cases",
        "new_deaths",
        "new_test_results_reported",
        "admissions_covid_confirmed",
    )
    rolling_to_daily = {f"{name}{rolling_suffix}": name for name in daily_names}

    # Start the daily columns out empty
    for daily_col in rolling_to_daily.values():
        data[daily_col] = None

    # Recover daily values key by key. A grouping function could probably do
    # this too, but iterating is fast enough and works reliably.
    unique_keys = data["key"].unique()
    for key in pbar(unique_keys, desc="Computing daily values from rolling means"):
        key_mask = data["key"] == key
        for rolling_col, daily_col in rolling_to_daily.items():
            known = data.loc[key_mask, rolling_col].dropna()
            data.loc[known.index, daily_col] = recover_from_rolling_mean(known, 7)

    # The rolling columns are no longer needed once daily values exist
    data.drop(columns=list(rolling_to_daily), inplace=True)

    output_names = {
        "new_cases": "new_confirmed",
        "new_deaths": "new_deceased",
        "new_test_results_reported": "new_tested",
        "admissions_covid_confirmed": "new_hospitalized",
    }
    return table_rename(data, output_names)
def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Convert case records into country, state and municipality time series.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        Country, state and municipality level records concatenated in that order.
    """
    cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)

    # Keep only rows classified as 5 with a known prognosis other than 9.
    # The comparison must be parenthesized: `&` binds tighter than `!=`, so
    # without parentheses the mask would compare `(notna & prognosis)` to 9.
    covid_mask = cases["_classification"] == 5
    valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
    # Copy so the column assignments below do not target a filtered slice
    cases = cases[covid_mask & valid_mask].copy()

    # Record the date of death
    cases["date_new_deceased"] = None
    deceased_mask = cases["_prognosis"] == 2
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[
        deceased_mask, "_date_prognosis"]

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

    # Convert all dates to ISO format
    date_columns = [col for col in cases.columns if col.startswith("date")]
    for col in date_columns:
        cases[col] = cases[col].apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Parse subregion codes
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 5))

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion2_code"])
    data["country_code"] = "BR"

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)].copy()

    # Aggregate by country level
    country = (data.drop(columns=["subregion2_code"]).groupby(
        ["date", "age", "sex"]).sum().reset_index())
    country["key"] = "BR"

    # Aggregate by state level; the state is looked up from the first two
    # characters of the municipality code. Non-string codes yield no state
    # instead of raising on the slice.
    data["subregion1_code"] = data["subregion2_code"].apply(
        lambda x: _IBGE_STATES.get(safe_int_cast(x[:2]))
        if isinstance(x, str) else None)
    state = (data.drop(columns=["subregion2_code"]).dropna(
        subset=["subregion1_code"]).groupby(
            ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
    state["key"] = "BR_" + state["subregion1_code"]

    # Derive the key from subregion codes
    data = data[data["subregion2_code"].notna()].copy()
    data["key"] = (
        "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"])

    return concat([country, state, data])
def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Build municipality and community-wide confirmed case records.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        The "ES_CT" aggregate rows followed by the municipality rows.
    """
    # Source columns left unmapped: ComarcaCodi, ComarcaDescripcio and
    # SexeDescripcio
    column_adapter = {
        "TipusCasData": "date",
        "MunicipiCodi": "subregion2_code",
        "MunicipiDescripcio": "subregion2_name",
        "SexeCodi": "sex",
        "TipusCasDescripcio": "_case_type",
        "NumCasos": "new_confirmed",
    }
    records = table_rename(dataframes[0], column_adapter, drop=True)

    # "Suspect" cases are not counted
    not_suspect = records["_case_type"] != "Sospitós"
    records = records[not_suspect].drop(columns=["_case_type"])

    # Unknown municipalities get a placeholder code
    missing_code = records["subregion2_code"].isna()
    records.loc[missing_code, "subregion2_code"] = "00000"

    # Clean up the region codes into 5-wide strings
    records["subregion2_code"] = records["subregion2_code"].apply(
        lambda code: numeric_code_as_string(code, 5))

    # The key follows from the subregion code
    records["key"] = "ES_CT_" + records["subregion2_code"]

    # Parse sex, date and numeric values
    sex_labels = {"0": "male", "1": "female"}
    records["sex"] = records["sex"].apply(
        lambda code: sex_labels.get(code, "sex_unknown"))
    records["date"] = records["date"].apply(
        lambda value: datetime_isoformat(value, "%d/%m/%Y"))
    records["new_confirmed"] = records["new_confirmed"].apply(safe_int_cast)

    # Aggregate manually, since municipalities that are too small are
    # clumped together in the source
    community = records.drop(columns=["subregion2_code"])
    community = community.groupby(["date", "sex"]).sum().reset_index()
    community["key"] = "ES_CT"

    # Placeholder rows only mattered for the aggregate
    municipalities = records[records["key"] != "ES_CT_00000"]
    return concat([community, municipalities])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Assign country, state or municipality keys to each record.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        The renamed records with a ``key`` column, minus rows whose
        municipality code ends in "0000".
    """
    column_adapter = {
        "data": "date",
        "estado": "subregion1_code",
        "codmun": "subregion2_code",
        "municipio": "subregion2_name",
        "casosNovos": "new_confirmed",
        "obitosNovos": "new_deceased",
        "casosAcumulado": "total_confirmed",
        "obitosAcumulado": "total_deceased",
        "Recuperadosnovos": "total_recovered",
    }
    records = table_rename(dataframes[0], column_adapter, drop=True)

    # Dates are kept as plain strings
    records["date"] = records["date"].astype(str)

    # Municipality codes become 6-wide strings
    records["subregion2_code"] = records["subregion2_code"].apply(
        lambda code: numeric_code_as_string(code, 6)
    )

    # A null state marks country-level rows; a null municipality (with a
    # state present) marks state-level rows; everything else is municipal
    no_state = records["subregion1_code"].isna()
    no_municipality = records["subregion2_code"].isna()
    state_key = "BR_" + records["subregion1_code"]
    municipality_key = state_key + "_" + records["subregion2_code"]

    records["key"] = None
    records.loc[no_state, "key"] = "BR"
    records.loc[~no_state & no_municipality, "key"] = state_key
    records.loc[~no_state & ~no_municipality, "key"] = municipality_key

    # Codes ending in "0000" are bogus
    code_tail = records["subregion2_code"].str.slice(-4)
    return records[code_tail != "0000"]
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Build municipality and community-wide total confirmed records.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        The municipality rows followed by the "ES_MD" aggregate rows.
    """

    def _recode(code):
        # Codes starting with "079" keep that prefix plus everything from
        # position 4; other codes keep characters 2-4 plus anything past 6
        if code.startswith("079"):
            tail = "079" + code[4:]
        else:
            tail = code[2:5] + code[6:]
        return "28" + tail

    column_adapter = {
        "fecha_informe": "date",
        "municipio_distrito": "subregion2_name",
        "codigo_geometria": "subregion2_code",
        "casos_confirmados_totales": "total_confirmed",
    }
    records = table_rename(dataframes[0], column_adapter, drop=True)

    # Unknown locations get a placeholder code
    missing_code = records["subregion2_code"].isna()
    records.loc[missing_code, "subregion2_code"] = "000000"

    # Clean up the region codes: 6-wide strings first, then recoded
    records["subregion2_code"] = records["subregion2_code"].apply(
        lambda code: numeric_code_as_string(code, 6)
    )
    records["subregion2_code"] = records["subregion2_code"].apply(_recode)

    # The code is only needed to build the key
    records["key"] = "ES_MD_" + records["subregion2_code"]
    records = records.drop(columns=["subregion2_code"])

    records["date"] = records["date"].apply(
        lambda value: datetime_isoformat(value[:10], "%Y/%m/%d")
    )
    records["total_confirmed"] = records["total_confirmed"].apply(safe_int_cast)

    # Aggregate the entire autonomous community
    community = records.drop(columns=["key", "subregion2_name"])
    community = community.groupby("date").sum().reset_index()
    community["key"] = "ES_MD"

    # The subregion code is sometimes badly formatted, so the name and the
    # parent codes are kept to allow matching by string
    records["country_code"] = "ES"
    records["subregion1_code"] = "MD"
    records["subregion2_name"] = records["subregion2_name"].str.replace("Madrid-", "")
    placeholder_rows = records["key"] == "ES_MD_28000"
    records.loc[placeholder_rows, "key"] = None

    return concat([records, community])
def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Combine per-sheet regional tables into a single keyed table.

    Args:
        dataframes: Input tables; the entry under key ``0`` is a mapping of
            sheets, of which all but the first are processed.
        aux: Auxiliary tables (not used here).
        **parse_opts: Must provide "columns" (value columns to keep) and
            "country" (prefix used to build the key).

    Returns:
        Records with ``date``, ``key`` and the requested value columns.
    """
    index_columns = ["date", "subregion1_code"]
    sheets = list(dataframes[0].values())

    # The data is spread over multiple sheets
    tables = []
    for sheet in sheets[1:]:
        # The two header rows are ignored in favor of our own column names
        sheet.columns = _columns
        table = sheet.iloc[2:].copy()

        # Subregion codes become 2-wide strings
        table["subregion1_code"] = table["subregion1_code"].apply(
            lambda code: numeric_code_as_string(code, 2))

        # Keep only the index columns and the requested value columns
        table = table[index_columns + parse_opts["columns"]]

        # Rows need both index columns; this source is "complete", so any
        # remaining null is a zero
        table = table.dropna(subset=index_columns).fillna(0)

        # Add to the tables including all subregions
        tables.append(table.iloc[1:])

    # Put all sheets together into a single DataFrame
    data = concat(tables)

    # The key is the country followed by the region code
    data["key"] = parse_opts["country"] + "_" + data["subregion1_code"]
    data.drop(columns=["subregion1_code"], inplace=True)

    # Dates are cut down to their first 10 characters
    data["date"] = data["date"].apply(lambda value: str(value)[:10])

    # Everything that is not an index column must be numeric
    value_columns = [col for col in data.columns if col not in ("date", "key")]
    for col in value_columns:
        data[col] = data[col].apply(safe_int_cast)

    return data
def _rename_columns(data: DataFrame) -> DataFrame:
    """Rename index columns and prefix every other column with "search_trends_".

    Args:
        data: Table using the source column names.

    Returns:
        A renamed copy of the table in which:
        - ``subregion1_code`` keeps only the part after the last "-";
        - ``subregion2_code`` is passed through ``numeric_code_as_string``
          with width 5;
        - every non-index column is lower-cased, stripped of "symptom:" and
          apostrophes, has spaces replaced by underscores and is prefixed
          with "search_trends_".
    """
    column_adapter = {
        "date": "date",
        "country_region_code": "country_code",
        "sub_region_1": "subregion1_name",
        "sub_region_2": "subregion2_name",
        "sub_region_1_code": "subregion1_code",
        "sub_region_2_code": "subregion2_code",
    }
    data = data.rename(columns=column_adapter)

    # Only non-empty strings can be split. A plain truthiness check would let
    # float NaN through (NaN is truthy) and fail on `.split`.
    data["subregion1_code"] = data["subregion1_code"].apply(
        lambda x: x.split("-")[-1] if isinstance(x, str) and x else None)
    data["subregion2_code"] = data["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 5))

    # Build the set of index column names once instead of per column
    index_columns = set(column_adapter.values())
    data.columns = [
        col if col in index_columns else "search_trends_" +
        col.lower().replace("symptom:", "").replace(" ", "_").replace("'", "")
        for col in data.columns
    ]
    return data
def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Convert case records into country, state, municipality and city records.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        Country, state, municipality and city records concatenated in that
        order.
    """
    # Source columns that are intentionally left unmapped:
    # FECHA_ACTUALIZACION, ID_REGISTRO, ORIGEN, SECTOR, ENTIDAD_UM,
    # ENTIDAD_NAC, FECHA_SINTOMAS, INTUBADO, NEUMONIA, NACIONALIDAD,
    # EMBARAZO, HABLA_LENGUA_INDIG, DIABETES, EPOC, ASMA, INMUSUPR,
    # HIPERTENSION, OTRA_COM, CARDIOVASCULAR, OBESIDAD, RENAL_CRONICA,
    # TABAQUISMO, OTRO_CASO, MIGRANTE, PAIS_NACIONALIDAD, PAIS_ORIGEN
    cases = table_rename(
        dataframes[0],
        {
            "SEXO": "sex",
            "ENTIDAD_RES": "subregion1_code",
            "MUNICIPIO_RES": "subregion2_code",
            "TIPO_PACIENTE": "_type",
            "FECHA_INGRESO": "date_new_confirmed",
            "FECHA_DEF": "date_new_deceased",
            "EDAD": "age",
            "RESULTADO": "_diagnosis",
            "UCI": "_intensive_care",
        },
        drop=True,
    )

    # Null dates are coded as 9999-99-99
    for col in cases.columns:
        if col.startswith("date_"):
            cases.loc[cases[col] == "9999-99-99", col] = None

    # Discard all cases with negative test result. Copy so the assignments
    # below do not target a filtered slice.
    cases = cases[cases["_diagnosis"] == 1].copy()

    # Type 1 is normal, type 2 is hospitalized
    cases["date_new_hospitalized"] = None
    hospitalized_mask = cases["_type"] == 2
    cases.loc[hospitalized_mask, "date_new_hospitalized"] = cases.loc[
        hospitalized_mask, "date_new_confirmed"]

    # Parse region codes as strings
    cases["subregion1_code"] = cases["subregion1_code"].apply(
        lambda x: numeric_code_as_string(x, 2))
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 3))

    # Convert case line data to our time series format
    data = convert_cases_to_time_series(
        cases, ["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].astype(str)

    # Unknown region codes are defined as "99+" instead of null
    data.loc[data["subregion1_code"] == "99", "subregion1_code"] = None
    data.loc[data["subregion2_code"] == "999", "subregion2_code"] = None

    # The subregion2 codes need to be composed from state + municipality. A
    # row is invalid when EITHER code is missing; such rows get a null
    # municipality code rather than a partial one.
    invalid_region_mask = (
        data["subregion1_code"].isna() | data["subregion2_code"].isna())
    data.loc[invalid_region_mask, "subregion2_code"] = None
    data.loc[~invalid_region_mask, "subregion2_code"] = (
        data.loc[~invalid_region_mask, "subregion1_code"] +
        data.loc[~invalid_region_mask, "subregion2_code"])

    # Use proper ISO codes for the subregion1 level
    data["subregion1_code"] = data["subregion1_code"].apply(
        _SUBREGION1_CODE_MAP.get)

    # Translate sex labels; only male, female and unknown are given. The
    # lookup table is built once, and non-string values map to None.
    sex_labels = {"hombre": "male", "mujer": "female"}
    data["sex"] = data["sex"].apply(
        lambda x: sex_labels.get(x.lower()) if isinstance(x, str) else None)

    # Aggregate state-level data by adding all municipalities
    state = data.drop(columns=["subregion2_code"]).groupby(
        ["date", "subregion1_code"]).sum()
    state.reset_index(inplace=True)
    state["key"] = "MX_" + state["subregion1_code"]

    # Extract cities from the municipalities
    city = _extract_cities(data)

    # Country level is called "TOTAL" as a subregion1_code
    country_mask = data["subregion1_code"] == "TOTAL"
    country = data[country_mask].copy()
    country["key"] = "MX"

    # We can build the key for the data directly from the subregion codes
    data["key"] = (
        "MX_" + data["subregion1_code"] + "_" + data["subregion2_code"])

    # Drop bogus records from the data
    state.dropna(subset=["subregion1_code"], inplace=True)
    data = data[~country_mask].dropna(
        subset=["subregion1_code", "subregion2_code"])

    return concat([country, state, data, city])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Build country, admin level 1 and town level records from town data.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables; the "metadata" table provides the key and
            admin level 1 code of each town.
        **parse_opts: Extra parsing options (not used here).

    Returns:
        Country, admin level 1 and town records concatenated in that order.
    """
    # Rename appropriate columns
    data = table_rename(
        dataframes[0],
        {
            "town_code": "subregion2_code",
            "date": "date",
            "accumulated_tested": "total_tested",
            "new_tested_on_date": "_new_tested_flag",
            "accumulated_cases": "total_confirmed",
            "new_cases_on_date": "_new_confirmed_flag",
            "accumulated_recoveries": "total_recovered",
            "new_recoveries_on_date": "_new_recovered_flag",
            "accumulated_hospitalized": "total_hospitalized",
            "new_hospitalized_on_date": "_new_hospitalized_flag",
            "accumulated_deaths": "total_deceased",
            "new_deaths_on_date": "_new_deceased_flag",
            "accumulated_vaccination_first_dose": "total_persons_vaccinated",
            "accumulated_vaccination_second_dose": "total_persons_fully_vaccinated",
            "town": "match_string",
        },
        drop=True,
    )

    # Convert date to ISO format and sort the data
    data["date"] = data["date"].astype(str).str.slice(0, 10)
    data.sort_values("date", inplace=True)

    # Because low counts are masked, we assume <15 = 1 as a rough estimate
    for statistic in (
        "confirmed",
        "deceased",
        "tested",
        "recovered",
        "hospitalized",
        "persons_vaccinated",
        "persons_fully_vaccinated",
    ):
        col = f"total_{statistic}"
        if col in data.columns:
            low_count_mask = data[col] == "<15"
            data.loc[low_count_mask, col] = 1
            # We can fill the data with zeroes since every case should be
            # recorded by source
            data[col] = data[col].apply(safe_int_cast).fillna(0)

    # Estimate total vaccine doses administered from first and second dose counts
    data["total_vaccine_doses_administered"] = (
        data["total_persons_vaccinated"] + data["total_persons_fully_vaccinated"]
    )

    # Properly format the region code and group by it
    data["subregion2_code"] = data["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 4)
    )
    data = data.groupby(["date", "subregion2_code", "match_string"]).sum().reset_index()

    # Aggregate to country level and drop unknown locations. The town
    # columns are dropped before summing, and the date stays in the grouping
    # so that the country output has one row per date, like the admin
    # level 1 aggregate below.
    data["country_code"] = "IL"
    intra_country_columns = ["subregion2_code", "match_string"]
    country = (
        data.drop(columns=intra_country_columns)
        .groupby(["date", "country_code"])
        .sum()
        .reset_index()
    )
    data.dropna(subset=intra_country_columns, inplace=True)

    # Drop country-level confirmed and deceased since we have better sources
    # of aggregated data
    country["key"] = country["country_code"]
    country.drop(columns=["total_confirmed", "total_deceased"], inplace=True)

    # Get the admin level 1 and key from metadata; copy so the code column
    # can be reformatted without writing into a filtered slice
    il = aux["metadata"][["key", "country_code", "subregion1_code", "subregion2_code"]]
    il = il[(il["country_code"] == "IL") & il["subregion2_code"].notna()].copy()
    il["subregion2_code"] = il["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 4)
    )
    data = data.merge(il, how="left")

    # Aggregate by admin level 1
    admin_l1 = data.groupby(["date", "country_code", "subregion1_code"]).sum().reset_index()
    admin_l1["key"] = admin_l1["country_code"] + "_" + admin_l1["subregion1_code"]

    return concat([country, admin_l1, data])
def _process_cache_file(file_map: Dict[str, str], date: str) -> DataFrame:
    """Load the cached file for one date and tag its records with that date.

    Args:
        file_map: Mapping from date to the file to read for that date.
        date: Date whose file is read; also written into the ``date`` column.

    Returns:
        The renamed table with ``subregion1_code`` looked up in
        ``_ISO_CODE_MAP`` and a constant ``date`` column.
    """

    def _to_iso(code):
        # Codes that cannot be formatted fall back to "00" for the lookup
        formatted = numeric_code_as_string(code, 2)
        return _ISO_CODE_MAP.get(formatted or "00")

    raw = read_file(file_map[date])
    data = table_rename(raw, _column_adapter, drop=True)
    data["subregion1_code"] = data["subregion1_code"].apply(_to_iso)
    data["date"] = date
    return data
def _subregion1_code_converter(code: int):
    """Look up a region code in ``_region_code_map``.

    Args:
        code: Region code, formatted with ``numeric_code_as_string`` (width 2)
            before the lookup.

    Returns:
        The mapped value, or ``None`` when the formatted code is not in the map.
    """
    lookup_key = numeric_code_as_string(code, 2)
    return _region_code_map.get(lookup_key)
def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Convert case records into department, province and country records.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        Department, province and country records concatenated in that order.
    """
    # Rename appropriate columns
    cases = table_rename(
        dataframes[0],
        {
            "residencia_provincia_id": "subregion1_code",
            "residencia_departamento_id": "subregion2_code",
            "fecha_fallecimiento": "date_new_deceased",
            "fecha_diagnostico": "_date_diagnosed",
            "fecha_internacion": "date_new_hospitalized",
            "fecha_cui_intensivo": "date_new_intensive_care",
            "clasificacion_resumen": "_classification",
            "edad": "age",
            "sexo": "sex",
        },
        drop=True,
    )

    # As long as a case is not labeled as "suspected", assume it has been
    # tested. Both sides use the same (non-suspect) rows: reading from the
    # suspect rows would align to nothing and leave the column empty.
    cases["date_new_tested"] = None
    suspect_mask = cases["_classification"].str.lower().str.match(
        ".*sospechoso.*")
    cases.loc[~suspect_mask, "date_new_tested"] = cases.loc[
        ~suspect_mask, "_date_diagnosed"]

    # Get rid of all the suspected cases, since we have nothing to tally for
    # them. Copy so the assignments below do not target a filtered slice.
    cases = cases[~suspect_mask].copy()

    # Confirmed cases use the label "confirmado". The confirmed column is
    # the one initialised here; the tested dates set above must be kept.
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_classification"].str.lower().str.match(
        ".*confirmado.*")
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[
        confirmed_mask, "_date_diagnosed"]

    # Clean up the subregion codes
    cases["subregion1_code"] = cases["subregion1_code"].apply(
        lambda x: None if x == 0 else numeric_code_as_string(x, 2))
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if x == 0 else numeric_code_as_string(x, 3))

    # Convert subregion1_code to the corresponding ISO code
    cases["subregion1_code"] = cases["subregion1_code"].apply(
        _ISO_CODE_MAP.get)

    # Remove unnecessary columns before converting to time series
    cases = cases.drop(
        columns=[col for col in cases.columns if col.startswith("_")])

    # Go from individual case records to key-grouped records in time series
    # format
    data = convert_cases_to_time_series(
        cases, ["subregion1_code", "subregion2_code"])

    # Parse dates to ISO format.
    data["date"] = data["date"].astype(str)

    # Aggregate by province and report that separately
    provinces = (data.drop(columns=["subregion2_code"]).groupby(
        ["subregion1_code", "date", "age", "sex"]).sum().reset_index())

    # Aggregate to the country level and report that separately. Both region
    # code columns are dropped so no string column ends up being summed.
    country = (data.drop(
        columns=["subregion1_code", "subregion2_code"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())

    # Compute the key from the subregion codes
    country["key"] = "AR"
    provinces["key"] = "AR_" + provinces["subregion1_code"]
    data["key"] = (
        "AR_" + data["subregion1_code"] + "_" + data["subregion2_code"])

    # Remove bogus values
    for df in (country, provinces, data):
        # A null key counts as "does not end with _" so the filter stays
        # boolean; rows with null codes are removed just below
        df.drop(df[df["key"].str.endswith("_", na=False)].index, inplace=True)
        for nn_col in ("date", "subregion1_code", "subregion2_code"):
            if nn_col in df.columns:
                df.drop(df[df[nn_col].isna() | (df[nn_col] == "")].index,
                        inplace=True)

    return concat([data, provinces, country])
def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Convert vaccination records into department, province and country records.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        Department, province and country records concatenated in that order.
    """
    # Rename appropriate columns. Source columns left unmapped:
    # jurisdiccion_residencia, depto_residencia, jurisdiccion_aplicacion,
    # jurisdiccion_aplicacion_id, depto_aplicacion, depto_aplicacion_id,
    # condicion_aplicacion, lote_vacuna
    data = table_rename(
        dataframes[0],
        {
            "sexo": "sex",
            "grupo_etario": "age",
            "jurisdiccion_residencia_id": "subregion1_code",
            "depto_residencia_id": "subregion2_code",
            "fecha_aplicacion": "date",
            "vacuna": "_manufacturer",
            "orden_dosis": "_dose_number",
        },
        drop=True,
    )

    # Parse dates to ISO format.
    data["date"] = data["date"].astype(str)

    # Parse sex label into proper name
    data["sex"] = data["sex"].apply({"M": "male", "F": "female"}.get)

    # Parse the dose number assuming all vaccines have a 2-dose schedule
    data["new_persons_vaccinated"] = data["_dose_number"].apply(
        lambda x: 1 if x == 1 else 0)
    data["new_persons_fully_vaccinated"] = data["_dose_number"].apply(
        lambda x: 1 if x == 2 else 0)
    data["new_vaccine_doses_administered"] = (
        data["new_persons_vaccinated"] + data["new_persons_fully_vaccinated"])

    # Add a column for each vaccine manufacturer. The list of count columns
    # does not depend on the manufacturer, so it is built once; records
    # without a manufacturer are skipped since they have no brand name.
    count_columns = [
        "new_persons_vaccinated",
        "new_persons_fully_vaccinated",
        "new_vaccine_doses_administered",
    ]
    for manufacturer in data["_manufacturer"].dropna().unique():
        mask = data["_manufacturer"] == manufacturer
        brand_name = manufacturer.lower()
        for col in count_columns:
            new_col = f"{col}_{brand_name}"
            data[new_col] = None
            data.loc[mask, new_col] = data.loc[mask, col]

    # Clean up the subregion codes
    data["subregion1_code"] = data["subregion1_code"].apply(
        lambda x: numeric_code_as_string(x, 2) or "00")
    data["subregion2_code"] = data["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 3) or "000")

    # Convert subregion1_code to the corresponding ISO code
    data["subregion1_code"] = data["subregion1_code"].apply(
        _ISO_CODE_MAP.get)

    # Group by indexable columns
    idx_cols = ["date", "subregion1_code", "subregion2_code", "sex", "age"]
    data = data.groupby(idx_cols).sum().reset_index()

    # Aggregate country level and admin level 1
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    subregion1 = aggregate_admin_level(data, ["date", "age", "sex"],
                                       "subregion1")

    # Drop regions without a code
    subregion2 = data[data["subregion2_code"] != "000"].dropna(
        subset=["subregion1_code", "subregion2_code"])
    subregion1.dropna(subset=["subregion1_code"], inplace=True)

    # Compute the key from the subregion codes, reading both codes from the
    # same filtered frame instead of relying on index alignment with `data`
    country["key"] = "AR"
    subregion1["key"] = "AR_" + subregion1["subregion1_code"]
    subregion2["key"] = ("AR_" + subregion2["subregion1_code"] + "_" +
                         subregion2["subregion2_code"])

    return concat([subregion2, subregion1, country])
def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Convert case records into department, province and country records.

    Args:
        dataframes: Input tables; only the table stored under key ``0`` is read.
        aux: Auxiliary tables (not used here).
        **parse_opts: Extra parsing options (not used here).

    Returns:
        Department, province and country records concatenated in that order.
    """
    column_adapter = {
        "residencia_provincia_id": "subregion1_code",
        "residencia_departamento_id": "subregion2_code",
        "fecha_fallecimiento": "date_new_deceased",
        "fecha_apertura": "_date_estimate",
        "fecha_diagnostico": "date_new_tested",
        "fecha_internacion": "date_new_hospitalized",
        "fecha_cui_intensivo": "date_new_intensive_care",
        "clasificacion_resumen": "_classification",
        "edad": "age",
        "sexo": "sex",
    }
    cases = table_rename(dataframes[0], column_adapter, drop=True)

    # Suspected cases have nothing to tally, so they are removed
    is_suspect = cases["_classification"].str.lower().str.match(
        ".*sospechoso.*")
    cases = cases[~is_suspect]

    # Confirmed cases carry the label "confirmado"; their confirmation date
    # is the test date, or the estimate when no test date is available
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_classification"].str.lower().str.match(
        ".*confirmado.*")
    tested_dates = cases.loc[confirmed_mask, "date_new_tested"]
    estimated_dates = cases.loc[confirmed_mask, "_date_estimate"]
    cases.loc[confirmed_mask, "date_new_confirmed"] = tested_dates.fillna(
        estimated_dates)

    # Deaths only count for confirmed cases
    cases.loc[~confirmed_mask, "date_new_deceased"] = None

    # Underscore-prefixed columns are not needed in the time series
    helper_columns = [name for name in cases.columns if name.startswith("_")]
    cases = cases.drop(columns=helper_columns)

    # Subregion codes become fixed-width strings with all-zero fallbacks
    cases["subregion1_code"] = cases["subregion1_code"].apply(
        lambda code: numeric_code_as_string(code, 2) or "00")
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda code: numeric_code_as_string(code, 3) or "000")

    # Individual case records become key-grouped time series records
    region_columns = ["subregion1_code", "subregion2_code"]
    data = convert_cases_to_time_series(cases, region_columns)

    # Dates are kept as plain strings
    data["date"] = data["date"].astype(str)

    # Country level totals are reported separately
    country = data.drop(columns=region_columns)
    country = country.groupby(["date", "age", "sex"]).sum().reset_index()

    # Province codes are replaced by the corresponding ISO code
    data["subregion1_code"] = data["subregion1_code"].apply(_ISO_CODE_MAP.get)

    # Province level totals are reported separately as well
    provinces = data.drop(columns=["subregion2_code"])
    provinces = provinces.groupby(
        ["subregion1_code", "date", "age", "sex"]).sum().reset_index()

    # Regions without a code are dropped
    data = data[data["subregion2_code"] != "000"]
    data.dropna(subset=region_columns, inplace=True)
    provinces.dropna(subset=["subregion1_code"], inplace=True)

    # Keys follow from the subregion codes
    country["key"] = "AR"
    provinces["key"] = "AR_" + provinces["subregion1_code"]
    data["key"] = (
        "AR_" + data["subregion1_code"] + "_" + data["subregion2_code"])

    return concat([data, provinces, country])