def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Extract information about whether doses were first (partial immunization) or second (full)
    cases["date_new_persons_vaccinated"] = None
    cases["date_new_persons_fully_vaccinated"] = None
    first_dose_mask = cases["_dose_information"].str.strip().str.slice(0, 1) == "1"
    second_dose_mask = cases["_dose_information"].str.strip().str.slice(0, 1) == "2"
    cases.loc[first_dose_mask, "date_new_persons_vaccinated"] = cases.loc[
        first_dose_mask, "date_new_vaccine_doses_administered"
    ]
    cases.loc[second_dose_mask, "date_new_persons_fully_vaccinated"] = cases.loc[
        second_dose_mask, "date_new_vaccine_doses_administered"
    ]

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast).astype(str)

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({"m": "male", "f": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(
        cases, index_columns=["subregion1_code", "subregion2_code"]
    )

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # Aggregate data by state
    state = (
        data.drop(columns=["subregion2_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna() & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
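
# Illustrative sketch (not part of the original pipeline): a minimal, self-contained
# demonstration of the dose-masking step above using plain pandas. The helper name and
# the toy records are hypothetical; only the column names mirror _process_partition.
def _example_dose_masking():
    from pandas import DataFrame

    toy = DataFrame(
        {
            "_dose_information": ["1ª Dose", "2ª Dose", " 1ª Dose "],
            "date_new_vaccine_doses_administered": ["2021-03-01", "2021-03-02", "2021-03-03"],
        }
    )
    # Only the first character of the stripped dose label is inspected
    first_dose_mask = toy["_dose_information"].str.strip().str.slice(0, 1) == "1"
    toy["date_new_persons_vaccinated"] = None
    toy.loc[first_dose_mask, "date_new_persons_vaccinated"] = toy.loc[
        first_dose_mask, "date_new_vaccine_doses_administered"
    ]
    # Rows 0 and 2 now carry a first-dose date; row 1 (a second dose) does not
    return toy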
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    data = table_rename(dataframes[0], _column_adapter, drop=True).sort_values("date")

    # Convert from the ISTAT codes to our region codes
    data["subregion1_code"] = data["subregion1_code"].apply(_subregion1_code_converter)

    # Aggregate here since some of the codes are null (04 indicates either BZ/TN)
    country = aggregate_admin_level(data, ["date"], "country")
    country["key"] = "IT"

    # Match data with IT subregions
    data = data[data["subregion1_code"].notna()]
    data["country_code"] = "IT"
    data["subregion2_code"] = None
    data["locality_code"] = None

    return concat([country, data])
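
# Illustrative sketch (not part of the original pipeline): why the country-level
# aggregation happens before rows with null subregion codes are dropped. A plain
# groupby is used here as a stand-in for aggregate_admin_level, whose exact behavior
# is defined elsewhere in the repository; the helper name and toy records are
# hypothetical.
def _example_aggregate_before_filtering():
    from pandas import DataFrame

    toy = DataFrame(
        {
            "date": ["2021-01-01"] * 3,
            "subregion1_code": ["21", None, "25"],
            "new_persons_vaccinated": [10, 5, 20],
        }
    )
    # The national total keeps the row whose region code could not be resolved (35 total)
    country_total = toy.groupby("date")["new_persons_vaccinated"].sum()
    # The region-level rows drop it (only codes 21 and 25 remain)
    region_rows = toy[toy["subregion1_code"].notna()]
    return country_total, region_rows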
def parse(
    self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    with open(sources[0], "r") as fd:
        cases = json.load(fd)["Data"]

    # {"ConfirmDate":"2021-01-09 00:00:00","No":"9876","Age":66,"Gender":"\u0e0a","GenderEn":"Male","Nation":"Thailand","NationEn":"Thailand","Province":"\u0e2d","ProvinceId":72,"District":"\u0e44","ProvinceEn":"Ang Thong","Detail":null,"StatQuarantine":1}
    cases = table_rename(
        DataFrame.from_records(cases),
        {
            "ConfirmDate": "date_new_confirmed",
            "Age": "age",
            "GenderEn": "sex",
            "ProvinceEn": "match_string",
        },
        drop=True,
    )

    # Convert dates to ISO format
    for col in cases.columns:
        if col.startswith("date_"):
            cases[col] = cases[col].str.slice(0, 10)

    # Parse age and sex fields
    cases["sex"] = cases["sex"].str.lower().apply({"male": "male", "female": "female"}.get)
    cases["age"] = cases["age"].fillna("age_unknown")
    cases["sex"] = cases["sex"].fillna("sex_unknown")

    # Convert to time series data
    data = convert_cases_to_time_series(cases, ["match_string"])

    # Aggregate by country level
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "TH"

    # Add country code and return data
    data["country_code"] = "TH"
    data = data[data["match_string"] != "Unknown"]
    return concat([country, data])
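
# Illustrative sketch (not part of the original pipeline): the dict-.get mapping used
# for the sex field returns None for any label outside the mapping, which fillna then
# converts to "sex_unknown". The helper name and toy values are hypothetical.
def _example_sex_mapping():
    from pandas import Series

    sex = Series(["Male", "Female", "Unknown"])
    sex = sex.str.lower().apply({"male": "male", "female": "female"}.get)
    # -> ["male", "female", None]
    sex = sex.fillna("sex_unknown")
    # -> ["male", "female", "sex_unknown"]
    return sex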
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(
        concat(dataframes.values()),
        {
            # "Patient Number": "",
            # "State Patient Number": "",
            "Date Announced": "date_new_confirmed",
            # "Estimated Onset Date": "",
            "Age Bracket": "age",
            "Gender": "sex",
            # "Detected City": "",
            "Detected District": "subregion2_name",
            "Detected State": "subregion1_name",
            # "State code": "subregion1_code",
            "Current Status": "_prognosis",
            # "Notes": "",
            # "Contracted from which Patient (Suspected)": "",
            # "Nationality": "",
            # "Type of transmission": "",
            "Status Change Date": "_change_date",
            # "Source_1": "",
            # "Source_2": "",
            # "Source_3": "",
            # "Backup Notes": "",
            "Num Cases": "new_confirmed",
            "Entry_ID": "",
        },
        drop=True,
    )

    # Convert dates to ISO format
    for col in [col for col in cases.columns if "date" in col]:
        cases[col] = cases[col].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Normalize ages: trim decimals, count ages given in days or months as 1, drop ranges
    cases["age"] = cases["age"].astype(str)
    cases["age"] = cases["age"].str.lower()
    cases["age"] = cases["age"].str.replace(r"\.0", "", regex=True)
    cases["age"] = cases["age"].str.replace(r"[\d\.]+ day(s)?", "1", regex=True)
    cases["age"] = cases["age"].str.replace(r"[\d\.]+ month(s)?", "1", regex=True)
    cases.loc[cases["age"].str.contains("-"), "age"] = None

    # Translate sex labels, defaulting to "sex_unknown"
    sex_adapter = lambda x: {"M": "male", "F": "female"}.get(x, "sex_unknown")
    cases["sex"] = cases["sex"].str.strip()
    cases["sex"] = cases["sex"].apply(sex_adapter)

    # Use the status change date as the event date for deceased and hospitalized outcomes
    cases["date_new_deceased"] = None
    deceased_mask = cases["_prognosis"] == "Deceased"
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_change_date"]

    cases["date_new_hospitalized"] = None
    hosp_mask = cases["_prognosis"] == "Hospitalized"
    cases.loc[hosp_mask, "date_new_hospitalized"] = cases.loc[hosp_mask, "_change_date"]

    data = convert_cases_to_time_series(cases, ["subregion1_name", "subregion2_name"])
    data["country_code"] = "IN"

    # Aggregate country level and admin level 1
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    subregion1 = aggregate_admin_level(data, ["date", "age", "sex"], "subregion1")
    subregion1 = subregion1[subregion1["subregion1_name"].str.lower() != "state unassigned"]

    # Data for admin level 2 is too noisy and there are many mismatches, so we only return
    # the aggregated country level and admin level 1 data
    return concat([country, subregion1])
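
# Illustrative sketch (not part of the original pipeline): how the age normalization
# above behaves on a few representative values. The helper name and toy values are
# hypothetical.
def _example_age_normalization():
    from pandas import Series

    age = Series(["25.0", "6 Months", "20-29", "40"]).astype(str).str.lower()
    age = age.str.replace(r"\.0", "", regex=True)
    age = age.str.replace(r"[\d\.]+ day(s)?", "1", regex=True)
    age = age.str.replace(r"[\d\.]+ month(s)?", "1", regex=True)
    age.loc[age.str.contains("-")] = None
    # -> ["25", "1", None, "40"]: decimals trimmed, month-old ages become 1, ranges dropped
    return age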
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename appropriate columns
    data = table_rename(
        dataframes[0],
        {
            "sexo": "sex",
            "grupo_etario": "age",
            # "jurisdiccion_residencia": "",
            "jurisdiccion_residencia_id": "subregion1_code",
            # "depto_residencia": "",
            "depto_residencia_id": "subregion2_code",
            # "jurisdiccion_aplicacion": "",
            # "jurisdiccion_aplicacion_id": "",
            # "depto_aplicacion": "",
            # "depto_aplicacion_id": "",
            "fecha_aplicacion": "date",
            "vacuna": "_manufacturer",
            # "condicion_aplicacion": "",
            "orden_dosis": "_dose_number",
            # "lote_vacuna": "",
        },
        drop=True,
    )

    # Parse dates to ISO format.
    data["date"] = data["date"].astype(str)

    # Parse sex label into proper name
    data["sex"] = data["sex"].apply({"M": "male", "F": "female"}.get)

    # Parse the dose number assuming all vaccines have a 2-dose schedule
    data["new_persons_vaccinated"] = data["_dose_number"].apply(lambda x: 1 if x == 1 else 0)
    data["new_persons_fully_vaccinated"] = data["_dose_number"].apply(lambda x: 1 if x == 2 else 0)
    data["new_vaccine_doses_administered"] = (
        data["new_persons_vaccinated"] + data["new_persons_fully_vaccinated"]
    )

    # Add a column for each vaccine manufacturer
    for manufacturer in data["_manufacturer"].unique():
        mask = data["_manufacturer"] == manufacturer
        brand_name = manufacturer.lower()
        cols = [f"new_persons_{mod}vaccinated" for mod in ["", "fully_"]]
        cols += ["new_vaccine_doses_administered"]
        for col in cols:
            new_col = f"{col}_{brand_name}"
            data[new_col] = None
            data.loc[mask, new_col] = data.loc[mask, col]

    # Clean up the subregion codes
    data["subregion1_code"] = data["subregion1_code"].apply(
        lambda x: numeric_code_as_string(x, 2) or "00"
    )
    data["subregion2_code"] = data["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 3) or "000"
    )

    # Convert subregion1_code to the corresponding ISO code
    data["subregion1_code"] = data["subregion1_code"].apply(_ISO_CODE_MAP.get)

    # Group by indexable columns
    idx_cols = ["date", "subregion1_code", "subregion2_code", "sex", "age"]
    data = data.groupby(idx_cols).sum().reset_index()

    # Aggregate country level and admin level 1
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    subregion1 = aggregate_admin_level(data, ["date", "age", "sex"], "subregion1")
    subregion2 = data.copy()

    # Drop regions without a code
    subregion2 = subregion2[subregion2["subregion2_code"] != "000"]
    subregion2.dropna(subset=["subregion1_code", "subregion2_code"], inplace=True)
    subregion1.dropna(subset=["subregion1_code"], inplace=True)

    # Compute the key from the subregion codes
    country["key"] = "AR"
    subregion1["key"] = "AR_" + subregion1["subregion1_code"]
    subregion2["key"] = (
        "AR_" + subregion2["subregion1_code"] + "_" + subregion2["subregion2_code"]
    )

    return concat([subregion2, subregion1, country])
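
# Illustrative sketch (not part of the original pipeline): the per-manufacturer column
# fan-out above, reduced to a single metric. A new column is created per brand and
# populated only on the rows for that brand. The helper name and toy records are
# hypothetical.
def _example_manufacturer_fanout():
    from pandas import DataFrame

    toy = DataFrame(
        {
            "_manufacturer": ["Sputnik", "Sinopharm", "Sputnik"],
            "new_vaccine_doses_administered": [1, 1, 1],
        }
    )
    for manufacturer in toy["_manufacturer"].unique():
        mask = toy["_manufacturer"] == manufacturer
        new_col = f"new_vaccine_doses_administered_{manufacturer.lower()}"
        toy[new_col] = None
        toy.loc[mask, new_col] = toy.loc[mask, "new_vaccine_doses_administered"]
    # toy now has ..._sputnik and ..._sinopharm columns with values only on matching rows
    return toy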