def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Flatten South Africa's wide per-age/per-sex deceased table into long-form records."""
    sex_columns = {"gender_male": "male", "gender_female": "female"}
    # Age columns look like "age_10-19" or "age_unknown"; strip the prefix to get the bucket.
    age_columns = {
        column: column.replace("age_", "")
        for column in dataframes[0].columns
        if column.startswith("age_")
    }

    rows = []
    for _, record in dataframes[0].iterrows():
        # Dates use day-first formatting; convert once per source row.
        iso_date = datetime_isoformat(record.date, "%d-%m-%Y")

        # One output record per age bucket; the "unknown" bucket maps to a null age.
        for column, bucket in age_columns.items():
            rows.append(
                {
                    "key": "ZA",
                    "date": iso_date,
                    "age": None if bucket == "unknown" else bucket,
                    "total_deceased": record[column],
                }
            )

        # One output record per sex bucket.
        for column, bucket in sex_columns.items():
            rows.append(
                {
                    "key": "ZA",
                    "date": iso_date,
                    "sex": bucket,
                    "total_deceased": record[column],
                }
            )

    return DataFrame.from_records(rows)
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the CDC's state-level case and death counts into our schema."""
    column_adapter = {
        "submission_date": "date",
        "state": "subregion1_code",
        "tot_cases": "total_confirmed",
        "new_case": "new_confirmed",
        "tot_death": "total_deceased",
        "new_death": "new_deceased",
    }
    data = table_rename(dataframes[0], column_adapter, drop=True)

    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))
    data["key"] = "US_" + data["subregion1_code"]

    # A few "states" are considered independent territories by our dataset, and NYC
    # is tracked separately from New York state, so their keys need correcting.
    key_overrides = {"PW": "PW", "FSM": "FM", "RMI": "MH", "NYC": "US_NY_NYC"}
    for state_code, key in key_overrides.items():
        data.loc[data["subregion1_code"] == state_code, "key"] = key

    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse Thailand's national JSON feed of daily and cumulative counts.

    The feed's records look like:
    {"Date": "01/01/2020", "NewConfirmed": 0, "NewRecovered": 0,
     "NewHospitalized": 0, "NewDeaths": 0, "Confirmed": 0, "Recovered": 0,
     "Hospitalized": 0, "Deaths": 0}
    """
    with open(sources[0], "r") as fd:
        data = json.load(fd)["Data"]

    data = table_rename(
        DataFrame.from_records(data),
        {
            "Date": "date",
            "NewConfirmed": "new_confirmed",
            "NewRecovered": "new_recovered",
            "NewHospitalized": "new_hospitalized",
            "NewDeaths": "new_deceased",
            # FIX: these targets previously had a doubled underscore
            # ("total__confirmed" etc.), producing column names the rest of
            # the pipeline does not recognize.
            "Confirmed": "total_confirmed",
            "Recovered": "total_recovered",
            "Hospitalized": "total_hospitalized",
            "Deaths": "total_deceased",
        },
        drop=True,
        remove_regex=r"[^0-9a-z\s]",
    )

    # Format date as ISO date
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

    # This source only provides country-level data
    data["key"] = "TH"
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Fetch and combine country, region and department level data for France."""
    url_tpl = sources[0]

    # Restrict the metadata to French regions and build the ISO -> region code map.
    fr_metadata = aux["metadata"]
    fr_metadata = fr_metadata[fr_metadata["country_code"] == "FR"]
    iso_table = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = dict(zip(iso_table["iso_code"], iso_table["region_code"]))

    fr_codes = fr_metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = (record for _, record in fr_codes.iterrows())

    if parse_opts.get("country"):
        # Country-level data only.
        data = _get_country(url_tpl)
    else:
        # Fetch region-level data concurrently.
        region_results = thread_map(partial(_get_region, url_tpl, fr_iso_map), regions_iter)
        regions = concat(list(region_results))

        # Fetch department-level data concurrently.
        department_results = thread_map(
            partial(_get_department, url_tpl), deps_iter, total=len(fr_codes)
        )
        departments = concat(list(department_results))

        data = concat([regions, departments])

    # Timestamps include a time component which is discarded by the conversion.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Spain's deceased-by-region table into new_deceased records."""
    deceased = table_rename(dataframes["deceased"], {"FECHA / CCAA": "date"})
    deceased = pivot_table(
        deceased.set_index("date"), value_name="new_deceased", pivot_name="match_string"
    )

    # Dates arrive as timestamps; keep only the YYYY-MM-DD prefix and re-validate it.
    deceased["date"] = deceased["date"].apply(
        lambda x: datetime_isoformat(str(x)[:10], "%Y-%m-%d")
    )

    # All records belong to Spain and match at the subregion1 level.
    deceased["country_code"] = "ES"
    deceased["subregion2_code"] = None
    deceased["locality_code"] = None

    # The country-level aggregate is reported under the label "espana".
    deceased["key"] = None
    deceased.loc[deceased["match_string"] == "espana", "key"] = "ES"

    # Drop records whose date failed to parse.
    return deceased.dropna(subset=["date"])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Luxembourg's national time series of ICU, deaths and tests."""
    # The real header row is the first data row, and each header embeds extra
    # information after "|" and "~" separators — rebuild the columns manually.
    data = dataframes[0]
    data.columns = [col.split("|")[0].split("~")[0] for col in data.iloc[0]]
    data = data.iloc[1:]

    column_adapter = {
        "Date": "date",
        "Nombre de personnes en soins intensifs": "current_intensive_care",
        "Nombre cumulé de décès": "total_deceased",
        "Nombre de personnes testées COVID+": "new_tested",
    }
    data = table_rename(data, column_adapter, drop=True)

    # Dates use day-first formatting.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Only country-level data is provided.
    data["key"] = "LU"
    return data
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Extract confirmed, deceased and recovered totals for Bangladesh regions."""
    source = dataframes[0]
    # Normalize header names so they can be matched by keyword below.
    source.columns = [
        fuzzy_text(col, remove_regex=r"[^0-9a-z\s_]", remove_spaces=False)
        for col in source.columns
    ]

    tables = []
    for keyword, value_column in (
        ("confirmed", "total_confirmed"),
        ("death", "total_deceased"),
        ("recover", "total_recovered"),
    ):
        subset = source[["_name"] + [col for col in source.columns if keyword in col]]
        # Each statistic column embeds its date after an "upto" marker.
        subset.columns = [col.split(" upto ", 2)[-1] for col in subset.columns]
        subset = subset.set_index("_name")[subset.columns[1:]]
        subset = pivot_table_date_columns(subset, pivot_name="date", value_name=value_column)
        subset.date = subset.date.apply(lambda x: datetime_isoformat(f"{x}-2020", "%d %B-%Y"))
        subset = subset.reset_index().rename(columns={"index": "match_string"})
        tables.append(subset)

    # Aggregate all three statistics together.
    data = concat(tables)

    # Make sure all records have the country code.
    data["country_code"] = "BD"
    return data
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse prefecture-level counts for Japan and add a country-level aggregate."""
    column_map = {
        "日付": "date",
        "都道府県名": "match_string",
        "患者数": "confirmed",
        "入院中": "hospitalized",
        "退院者": "recovered",
        "死亡者": "deceased",
    }
    data = dataframes[0].rename(columns=column_map)

    # Dates arrive as YYYYMMDD strings; convert them to ISO format.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y%m%d"))

    # All records match at the country level for Japan.
    data["country_code"] = "JP"

    # Keep only the columns that map into our schema.
    # NOTE(review): this selection drops the country_code column assigned above,
    # yet grouped_cumsum below lists it as a grouping key — verify grouped_cumsum
    # tolerates missing keys before relying on this.
    data = data[["date", "match_string", "confirmed", "hospitalized", "recovered", "deceased"]]

    # Region-level values are daily counts, so accumulate them.
    data = grouped_cumsum(data, ["country_code", "match_string", "date"])

    # Derive country-level totals by summing across regions per day.
    data_country = data.groupby("date").sum().reset_index()
    data_country["key"] = "JP"

    return concat([data_country, data])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Japan's per-prefecture confirmed and deceased tables."""
    # Melt each wide table into long form and join them on date + region label.
    data = table_merge(
        [
            melt(dataframes[name], id_vars=["Date"], var_name="match_string", value_name=value)
            for name, value in [("confirmed", "new_confirmed"), ("deceased", "total_deceased")]
        ]
    )
    data["country_code"] = "JP"

    # Get date in ISO format
    data = data.rename(columns={"Date": "date"})
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y/%m/%d"))

    # Country-level records use the label "ALL".
    # FIX: copy the boolean-mask slices before mutating them — assigning into
    # `data.loc[mask]` results writes through a view of `data`, which raises
    # SettingWithCopyWarning and can silently drop the assignment.
    country_mask = data["match_string"] == "ALL"
    country = data.loc[country_mask].copy()
    data = data.loc[~country_mask].copy()
    country["key"] = "JP"

    # Output the results
    return concat([country, data])
def parse(self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Fetch France's per-age/per-sex testing breakdown at country, region and
    department level, and flatten it into new_tested / new_confirmed records."""
    url_tpl = sources[0]
    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    # Map ISO region codes to the region codes used by our metadata.
    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = [record for _, record in fr_codes.iterrows()]

    column_adapter = {
        "key": "key",
        "date": "date",
        "testsRealisesDetails": "_breakdown_tested",
        "testsPositifsDetails": "_breakdown_confirmed",
    }

    # Get country level data
    country = _get_country(url_tpl, column_adapter)

    # Get region level data, fetched concurrently
    get_region_func = partial(_get_region, url_tpl, column_adapter, fr_iso_map)
    regions = concat(list(thread_map(get_region_func, regions_iter)))

    # Get department level data, fetched concurrently
    get_department_func = partial(_get_department, url_tpl, column_adapter)
    departments = concat(list(thread_map(get_department_func, deps_iter)))

    data = concat([country, regions, departments])
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

    # The breakdown columns hold lists of per-age/per-sex items; empty cells become
    # "" so the loop below can skip them.
    data["_breakdown_tested"].fillna("", inplace=True)
    data["_breakdown_confirmed"].fillna("", inplace=True)

    # Explode each breakdown list into one record per (key, date, age, sex).
    records: Dict[str, List] = {"confirmed": [], "tested": []}
    for key, row in data.set_index("key").iterrows():
        for statistic in records.keys():
            if row[f"_breakdown_{statistic}"] != "":
                for item in row[f"_breakdown_{statistic}"]:
                    records[statistic].append(
                        {
                            "key": key,
                            "date": row["date"],
                            "age": item["age"],
                            # "sexe" may be absent — item.get keeps it as None.
                            "sex": item.get("sexe"),
                            f"new_{statistic}": item["value"],
                        }
                    )

    # Merge the tested and confirmed record sets into a single table.
    df1 = DataFrame.from_records(records["tested"])
    df2 = DataFrame.from_records(records["confirmed"])
    data = df1.merge(df2, how="outer")

    # Drop non-numeric age labels, then bin the remaining ages into groups.
    # NOTE(review): "0" and "A".."E" presumably denote aggregate buckets from the
    # upstream API — confirm against the API's documentation.
    data = data[~data["age"].isin(["0", "A", "B", "C", "D", "E"])]
    data["age"] = data["age"].apply(lambda x: age_group(safe_int_cast(x)))

    # "h"/"f" map to male/female; any other value is reported as unknown.
    sex_adapter = lambda x: {"h": "male", "f": "female"}.get(x, "sex_unknown")
    data["sex"] = data["sex"].apply(sex_adapter)
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse province-level confirmed counts for the Democratic Republic of the Congo."""
    records = dataframes[0].rename(
        columns={
            "Date": "date",
            "Province": "match_string",
            "Confirmed Cases": "total_confirmed",
        }
    )
    # The first two rows do not hold data.
    records = records.drop([0, 1])

    # Spreadsheet has the typo "heatlh" in one of its column names.
    unused_columns = ["Number of heatlh structures", "Affected", "Source", "Probable cases"]
    records = records.drop(axis=1, columns=unused_columns)

    # Data source sometimes uses different hypenation from src/data/iso_3166_2_codes.csv
    records["match_string"].replace({"Haut Katanga": "Haut-Katanga"}, inplace=True)

    records.date = records.date.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))
    records["total_confirmed"] = (
        records["total_confirmed"].fillna(0).astype({"total_confirmed": "int64"})
    )

    # Make sure all records have the country code.
    records["country_code"] = "CD"
    return records
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Australia's vaccination counts at country and state level."""
    data = dataframes[0]
    data["date"] = data.REPORT_DATE.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Attach level-1 keys by joining on the state code.
    subregion1s = country_subregion1s(aux["metadata"], "AU")
    data = table_merge([data, subregion1s], left_on="CODE", right_on="subregion1_code", how="left")

    # The country-level record uses the code AUS.
    country_mask = data["CODE"] == "AUS"
    data.loc[country_mask, "key"] = "AU"

    # Only keep country and subregion1 rows.
    # FIX: `data.key != None` is an element-wise comparison that evaluates True for
    # every row (including NaN), so it never filtered anything; use notna() instead.
    data = data[data["key"].notna()]

    data = table_rename(
        data,
        {
            "date": "date",
            "key": "key",
            "VACC_DOSE_CNT": "total_vaccine_doses_administered",
            "VACC_PEOPLE_CNT": "total_persons_fully_vaccinated",
        },
        drop=True,
    )

    # Remove rows without any vaccination data.
    data.dropna(
        subset=["total_vaccine_doses_administered", "total_persons_fully_vaccinated"],
        how="all",
        inplace=True,
    )

    # Based on the assumption two doses = fully vaccinated (since Australia is
    # using Pfizer and AZ).
    data["total_persons_vaccinated"] = estimate_total_persons_vaccinated(data)
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Catalonia's case records broken down by age and sex."""
    data = table_rename(
        dataframes[0],
        {
            "TipusCasData": "date",
            "SexeCodi": "sex",
            "EdatRang": "age",
            "TipusCasDescripcio": "_case_type",
            "NumCasos": "new_confirmed",
        },
        drop=True,
    )

    # Remove "suspect" (unconfirmed) cases.
    data = data[data["_case_type"] != "Sospitós"].drop(columns=["_case_type"])

    # All records belong to the Catalonia region.
    data["key"] = "ES_CT"

    # Parse age, sex, date and numeric values.
    # FIX: the age pattern was previously the non-raw string "90\+" relying on
    # regex matching — an invalid escape sequence, and brittle against pandas'
    # changed str.replace regex default. A literal replacement matches the same
    # "90+" label.
    data["age"] = data["age"].str.replace("90+", "90-", regex=False)
    sex_adapter = {"0": "male", "1": "female"}
    data["sex"] = data["sex"].apply(lambda x: sex_adapter.get(x, "sex_unknown"))
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))
    data["new_confirmed"] = data["new_confirmed"].apply(safe_int_cast)
    return data
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Combine South Africa's cumulative confirmed/deceased/recovered/tested tables."""
    variable_names = ["total_confirmed", "total_deceased", "total_recovered", "total_tested"]
    parsed_tables = [
        Covid19ZaCumulativeDataSource._parse_variable(df, name)
        for df, name in zip(dataframes, variable_names)
    ]
    data = table_multimerge(parsed_tables, how="outer")

    # Dates use day-first formatting.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # The "total" pseudo-region holds the country-level aggregate; all other records
    # derive their key directly from the province code.
    country_mask = data["subregion1_code"] == "total"
    data.loc[country_mask, "key"] = "ZA"
    data.loc[~country_mask, "key"] = "ZA_" + data.subregion1_code

    return data
def _process_partition(cases: DataFrame) -> DataFrame:
    """Convert a partition of Brazil's individual vaccination records into
    country, state and municipality level time series."""
    cases = cases.copy()

    # Extract information about whether doses were first (partial immunization) or second (full)
    cases["date_new_persons_vaccinated"] = None
    cases["date_new_persons_fully_vaccinated"] = None
    first_dose_mask = cases["_dose_information"].str.strip().str.slice(0, 1) == "1"
    second_dose_mask = cases["_dose_information"].str.strip().str.slice(0, 1) == "2"
    cases.loc[first_dose_mask, "date_new_persons_vaccinated"] = cases.loc[
        first_dose_mask, "date_new_vaccine_doses_administered"
    ]
    cases.loc[second_dose_mask, "date_new_persons_fully_vaccinated"] = cases.loc[
        second_dose_mask, "date_new_vaccine_doses_administered"
    ]

    # Drop columns which we have no use for (all start with an underscore)
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast).astype(str)

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({"m": "male", "f": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(
        cases, index_columns=["subregion1_code", "subregion2_code"]
    )

    # Convert date to ISO format, keeping only the YYYY-MM-DD prefix
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records: unparseable dates, dates before the pandemic
    # started, and dates in the future
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # Aggregate data by state
    state = (
        data.drop(columns=["subregion2_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2, so municipality-level
    # records need a non-empty subregion2 code
    data = data[data["subregion2_code"].notna() & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Download UTLA-level case counts from the UK's official COVID-19 API."""
    request_structure = {
        "date": "date",
        "areaCode": "areaCode",
        "newCasesBySpecimenDate": "newCasesBySpecimenDate",
        "cumCasesBySpecimenDate": "cumCasesBySpecimenDate",
    }
    api = Cov19API(filters=["areaType=utla"], structure=request_structure)
    data = api.get_dataframe()

    # Normalize area codes, then aggregate areas that map to the same code.
    data.areaCode = data.areaCode.apply(_apply_area_code_map)
    data = data.groupby(["date", "areaCode"], as_index=False).sum()

    data = table_rename(
        data,
        {
            "areaCode": "subregion2_code",
            "newCasesBySpecimenDate": "new_confirmed",
            "cumCasesBySpecimenDate": "total_confirmed",
            "date": "date",
        },
        drop=True,
    )
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))
    return data
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the COVID Tracking Project's state-level cumulative counts as daily diffs."""
    column_map = {
        "date": "date",
        "state": "subregion1_code",
        "positive": "confirmed",
        "death": "deceased",
        "total": "tested",
        "recovered": "recovered",
    }
    data = dataframes[0].rename(columns=column_map)

    # Dates arrive as YYYYMMDD strings.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y%m%d"))

    # Build the region key and keep only the columns we can process.
    data["key"] = "US_" + data["subregion1_code"]
    data = data[["date", "key", "confirmed", "deceased", "tested", "recovered"]]

    # Values are cumulative, so convert them into daily differences.
    return grouped_diff(data, ["key", "date"])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Convert individual Czech case records into a key-grouped time series."""
    statistic = parse_opts["column_name"]

    # Rename the columns and file each case under its statistic-specific date.
    cases = table_rename(dataframes[0], _column_adapter)
    cases = cases.rename(columns={"date": f"date_{statistic}"})
    cases = _parse_region_codes(cases).dropna(subset=[f"date_{statistic}"])

    # Translate the sex labels used by the source.
    cases["sex"] = cases["sex"].apply({"M": "male", "Z": "female"}.get)

    # Go from individual case records to key-grouped records in a flat table.
    data = convert_cases_to_time_series(
        cases, index_columns=["subregion1_code", "subregion2_code"]
    )

    # Region codes must be strings before they can be parsed below.
    data["subregion1_code"] = data["subregion1_code"].astype(str)
    data["subregion2_code"] = data["subregion2_code"].astype(str)

    # Aggregate L2 + L3 data.
    data = _aggregate_regions(data, ["date", "subregion1_code", "age", "sex"])

    # Remove the bogus placeholder regions.
    data = data[data["key"] != "CZ_99"]
    data = data[data["key"] != "CZ_99_99Y"]

    # Dates appear in two formats; dotted dates are day-first.
    data["date"] = (
        data["date"]
        .astype(str)
        .apply(lambda x: datetime_isoformat(x, "%d.%m.%Y" if "." in x else "%Y-%m-%d"))
    )
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse county-level confirmed and deceased totals for Massachusetts."""
    data = table_rename(
        dataframes["counties"],
        {
            "Date": "date",
            "County": "match_string",
            "Count": "total_confirmed",
            "Deaths": "total_deceased",
        },
    )

    # Convert date to ISO format.
    data["date"] = data["date"].astype(str).apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

    # Drop the catch-all "Unknown" bucket, and drop "Dukes and Nantucket" which are
    # separate counties reported as one.
    for bogus_label in ("Unknown", "Dukes and Nantucket"):
        data = data[data["match_string"] != bogus_label]

    # All records match counties within Massachusetts.
    data["country_code"] = "US"
    data["subregion1_code"] = "MA"
    return data
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse a generic hospitalizations table; the column mapping, key, date column
    and date format may all be overridden through parse_opts."""
    default_adapter = {
        "discharged_cumulative": "total_discharged",
        "hospitalized_current": "current_hospitalized",
        "number hospitalised": "current_hospitalized",
        "hospitalized_cumulative": "total_hospitalized",
        "icu_current": "current_intensive_care",
        "number in icu": "current_intensive_care",
        "icu_cumulative": "cumulative_intensive_care",
        "ventilator_current": "current_ventilator",
        "ventilator_cumulative": "cumulative_ventilator",
        "new hospital admissions": "new_hospitalized",
        "new intensive care admissions": "new_intensive_care",
    }
    data = table_rename(dataframes[0], parse_opts.get("column_adapter", default_adapter))

    # The key is supplied by the caller; the date column and format may vary by source.
    data["key"] = parse_opts.get("key")
    data["date"] = data[parse_opts.get("date_column", "date")].astype(str)
    date_format = parse_opts.get("date_format", "%Y-%m-%d")
    data.date = data.date.apply(lambda x: datetime_isoformat(x, date_format))
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Canada's national table of province-level counts."""
    column_map = {
        "prname": "subregion1_name",
        "numconf": "total_confirmed",
        "numtoday": "new_confirmed",
        "numdeaths": "total_deceased",
        "numtested": "total_tested",
        "numrecover": "total_recovered",
    }
    # The French province names duplicate the English ones, so drop them.
    data = dataframes[0].rename(columns=column_map).drop(columns=["prnameFR"])

    # Dates use day-first formatting.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # All records match subregion1-level regions within Canada.
    data["country_code"] = "CA"
    data["subregion2_code"] = None

    # Country-level records should have a null region name.
    country_mask = data["subregion1_name"] == "Canada"
    data.loc[country_mask, "subregion1_name"] = None

    # Remove the "repatriated travellers" pseudo-region.
    traveller_mask = data["subregion1_name"].apply(lambda x: "traveller" in (x or "").lower())
    return data[~traveller_mask]
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Austria's state-level epidemic and testing tables."""
    cases = dataframes["confirmed_deceased_recovered"].rename(columns=COMMON_COLUMNS)
    tested = dataframes["tested"].rename(
        columns={"TestGesamt": "total_tested", "MeldeDatum": "Time"}
    )
    data = table_merge([cases, tested], how="outer")

    # Timestamps use day-first formatting with a time component.
    data["date"] = data["Time"].apply(lambda x: datetime_isoformat(x, "%d.%m.%Y %H:%M:%S"))

    # Build the key from the state ID; ID 10 is the country-level aggregate.
    data["key"] = data["BundeslandID"].apply(lambda x: f"AT_{x}")
    data.loc[data["key"] == "AT_10", "key"] = "AT"

    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse per-region cumulative counts for Libya."""
    column_map = {
        "Location": "match_string",
        "Confirmed": "total_confirmed",
        "Deaths": "total_deceased",
        "Recoveries": "total_recovered",
        "Date": "date",
    }
    data = dataframes[0].rename(columns=column_map).drop(columns=["Active"])

    # Convert date to ISO format.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

    # The first row is metadata info about column names - discard it.
    data = data[data.match_string != "#loc+name"]

    # Values carry thousands separators; strip them and parse integers.
    for column in ("total_confirmed", "total_deceased", "total_recovered"):
        data[column] = data[column].apply(lambda x: safe_int_cast(str(x).replace(",", "")))

    # Make sure all records have the country code.
    data["country_code"] = "LY"
    return data
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse per-region confirmed, deceased and recovered counts for Portugal."""
    data = dataframes[0]
    column_tokens = ["confirmed_", "deaths_", "recovered_"]

    # Keep only the per-region breakdown columns and drop the daily-change ones.
    data = data[[col for col in data.columns if any(token in col for token in column_tokens)]]
    data = data.drop(
        columns=["cases_confirmed_new", "cases_unconfirmed_new", "deaths_new", "recovered_new"]
    )

    # Dates use day-first formatting.
    data["date"] = dataframes[0].date.apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # Pivot each statistic's columns into (date, match_string, value) rows.
    subsets = []
    for token in column_tokens:
        subset = data[["date"] + [col for col in data.columns if token in col]]
        subset = pivot_table(subset.set_index("date"), pivot_name="match_string")
        # Column names look like "<statistic>_<region>"; keep the region part.
        subset.match_string = subset.match_string.apply(lambda x: x.split("_", 2)[1])
        subset = subset.rename(columns={"value": token.split("_")[0]})
        subsets.append(subset)

    # Merge the three statistics back into a single table.
    data = subsets[0]
    for subset in subsets[1:]:
        data = data.merge(subset, how="outer")

    data = data.rename(columns={"deaths": "deceased"})
    data = data[data.match_string != "unconfirmed"]

    # Values are cumulative, so convert them into daily differences.
    data = grouped_diff(data, ["match_string", "date"])
    data["country_code"] = "PT"
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse San Francisco's hospitalization counts, split by care category."""
    hospitalizations = dataframes[0]

    def _extract(category: str, value_column: str) -> DataFrame:
        # Each DPHCategory subset carries its own date/count pair.
        return table_rename(
            hospitalizations.loc[hospitalizations["DPHCategory"] == category],
            {"reportDate": "date", "PatientCount": value_column},
            drop=True,
        )

    icu = _extract("ICU", "current_intensive_care")
    hosp = _extract("Med/Surg", "current_hospitalized")

    data = icu.merge(hosp, on="date")
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y/%m/%d"))
    data["key"] = "US_CA_SFO"
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Canada's province + health-region confirmed and deceased tables."""
    data = table_multimerge(
        [
            table_rename(dataframes["confirmed"], _column_adapter, drop=True),
            table_rename(dataframes["deceased"], _column_adapter, drop=True),
        ],
        how="outer",
    )

    # Province names are sometimes codes (but not always compliant with ISO codes).
    data["subregion1_code"] = data["subregion1_name"].apply(_province_map.get)
    data.drop(columns=["subregion1_name"], inplace=True)

    # Dates use day-first formatting.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # Aggregate the health-region records up to the subregion1 level.
    l1_index = ["date", "subregion1_code"]
    l1 = data.drop(columns=["match_string"]).groupby(l1_index).sum().reset_index()

    # Make sure all records have the country code and subregion2_name.
    # NOTE(review): l1 uses None while the region-level records use "" for
    # subregion2_name — looks inconsistent but is preserved as-is; confirm whether
    # the matching stage treats null and empty differently before changing it.
    l1["country_code"] = "CA"
    l1["subregion2_name"] = None
    data["country_code"] = "CA"
    data["subregion2_name"] = ""

    # Remove bogus data.
    data = data[data["match_string"] != "Not Reported"]

    return concat([l1, data])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse India's state-wise daily confirmed, deceased and recovered counts."""
    data = dataframes[0]

    # Every column except Status and Date holds one state's values.
    states = list(data.columns.difference(["Status", "Date"]))

    # Flatten the wide table into (Date, Status, state, value) rows.
    data = melt(data, id_vars=["Date", "Status"], value_vars=states, var_name="subregion1_code")

    # Convert numeric fields to integers.
    data["value"] = data["value"].apply(safe_int_cast)

    # Pivot on Status to get one column per statistic.
    data = data.pivot_table("value", ["Date", "subregion1_code"], "Status")
    data.reset_index(drop=False, inplace=True)
    data = data.reindex(["Date", "subregion1_code", "Confirmed", "Deceased", "Recovered"], axis=1)
    data = data.rename(
        columns={
            "Confirmed": "new_confirmed",
            "Deceased": "new_deceased",
            "Recovered": "new_recovered",
            "Date": "date",
        }
    )

    # No data is recorded against IN_DD, it is now a district of IN_DN.
    data = data[data.subregion1_code != "DD"]

    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%d-%b-%y"))
    data["key"] = "IN_" + data["subregion1_code"]
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the COVID Tracking Project's full state-level history of totals."""
    column_map = {
        "date": "date",
        "state": "subregion1_code",
        "positive": "total_confirmed",
        "death": "total_deceased",
        "total": "total_tested",
        "recovered": "total_recovered",
        "hospitalizedCurrently": "current_hospitalized",
        "hospitalizedCumulative": "total_hospitalized",
        "inIcuCurrently": "current_intensive_care",
        "inIcuCumulative": "total_intensive_care",
        "onVentilatorCurrently": "current_ventilator",
        "onVentilatorCumulative": "total_ventilator",
    }
    data = dataframes[0].rename(columns=column_map)

    # Dates arrive as YYYYMMDD strings.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y%m%d"))

    # Build the key from the state code, then keep only the mapped columns.
    data["key"] = "US_" + data["subregion1_code"]
    data = data[["key"] + list(column_map.values())].drop(columns=["subregion1_code"])

    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse India's country-level daily confirmed, deceased and recovered counts."""
    data = dataframes[0]

    # Every column except Status and Date holds one state's values.
    states = list(data.columns.difference(["Status", "Date"]))

    # Flatten the wide table into (Date, Status, state, value) rows.
    data = melt(data, id_vars=["Date", "Status"], value_vars=states, var_name="subregion1_code")

    # Pivot on Status to get one column per statistic.
    data = data.pivot_table("value", ["Date", "subregion1_code"], "Status")
    data.reset_index(drop=False, inplace=True)
    data = data.reindex(["Date", "subregion1_code", "Confirmed", "Deceased", "Recovered"], axis=1)
    data = data.rename(
        columns={
            "Confirmed": "new_confirmed",
            # FIX: the pivoted column is named "Deceased" (see the reindex above and
            # the sibling state-level parser); the previous mapping used "Deaths",
            # which never matched and left the deceased counts under the wrong name.
            "Deceased": "new_deceased",
            "Recovered": "new_recovered",
            "Date": "date",
        }
    )

    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%d-%b-%y"))
    data["country_code"] = "IN"
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Brazil's SRAG individual case records into country, state and
    municipality level time series."""
    cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)

    # Keep only cases classified as COVID-19 (code 5) with a known prognosis
    # (code 9 denotes unknown).
    covid_mask = cases["_classification"] == 5
    # FIX: `&` binds tighter than `!=`, so the previous expression
    # `cases["_prognosis"].notna() & cases["_prognosis"] != 9` parsed as
    # `(notna() & _prognosis) != 9`; the comparison must be parenthesized.
    valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
    cases = cases[covid_mask & valid_mask]

    # Record the date of death (prognosis code 2).
    cases["date_new_deceased"] = None
    deceased_mask = cases["_prognosis"] == 2
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date_prognosis"]

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

    # Convert all dates to ISO format
    for col in filter(lambda x: x.startswith("date"), cases.columns):
        cases[col] = cases[col].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Parse subregion codes as fixed-width numeric strings
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 5)
    )

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion2_code"])
    data["country_code"] = "BR"

    # Get rid of bogus records: unparseable dates, dates before the pandemic
    # started, and dates in the future
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate by country level
    country = (
        data.drop(columns=["subregion2_code"]).groupby(["date", "age", "sex"]).sum().reset_index()
    )
    country["key"] = "BR"

    # Aggregate by state level; the first two digits of the IBGE code identify the state
    data["subregion1_code"] = data["subregion2_code"].apply(
        lambda x: _IBGE_STATES.get(safe_int_cast(x[:2]))
    )
    state = (
        data.drop(columns=["subregion2_code"])
        .dropna(subset=["subregion1_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # Derive the key from subregion codes
    data = data[data["subregion2_code"].notna()]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])