def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the UK county-level table into daily confirmed counts keyed by region."""
    # County data is already in the format we want
    data = (dataframes[0].rename(
        columns={
            "Date": "date",
            "Country": "subregion1_name",
            "AreaCode": "subregion2_code",
            "TotalCases": "confirmed",
        }).drop(columns=["Area"]).dropna(subset=["subregion2_code"]))

    # Add subregion1 code to the data; metadata rows with a null
    # subregion2_code are the country-level (subregion1) entries
    gb_meta = aux["metadata"]
    gb_meta = gb_meta[gb_meta["country_code"] == "GB"]
    gb_meta = gb_meta[gb_meta["subregion2_code"].isna()]
    country_map = {
        idx: code
        for idx, code in gb_meta.set_index("subregion1_name")
        ["subregion1_code"].iteritems()
    }
    data["subregion1_code"] = data["subregion1_name"].apply(
        lambda x: country_map[x])

    # Manually build the key rather than doing automated merge for performance reasons
    data["key"] = "GB_" + data["subregion1_code"] + "_" + data[
        "subregion2_code"]

    # Now that we have the key, we don't need any other non-value columns
    data = data[["date", "key", "confirmed"]]
    # Nullable Int64 keeps missing counts as NA instead of failing the cast
    data["confirmed"] = data["confirmed"].apply(safe_int_cast).astype(
        "Int64")

    # Reported totals are cumulative; convert to daily deltas
    data = grouped_diff(data, ["key", "date"])
    return data
def parse_dataframes(self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame: data = dataframes[0].transpose() # Transform the data from non-tabulated format to record format records = [] for idx, row in data.iterrows(): for code in data.columns: subset = row[code] record = { "date": idx.date().isoformat(), "country_code": "AU", "subregion1_code": code, "confirmed": subset[0], } if len(subset) > 1: record["deceased"] = subset[1] if len(subset) > 2: record["recovered"] = subset[2] if len(subset) > 3: record["tested"] = subset[3] records.append(record) data = DataFrame.from_records(records) return grouped_diff(data, ["country_code", "subregion1_code", "date"])
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Germany per-state wide table into daily counts."""
    # Normalize the date column name and convert values to ISO date strings
    df = dataframes[0].rename(columns={"time_iso8601": "date"})
    df["date"] = df["date"].apply(
        lambda x: datetime.datetime.fromisoformat(x).date().isoformat())

    # State codes are embedded in column names such as "DE-BY_cases"
    regions = unique(
        [col[3:5] for col in df.columns if col.startswith("DE-")])

    # Flatten the wide table into one record per (date, region)
    records = []
    for _, row in df.iterrows():
        for region_code in regions:
            records.append({
                "subregion1_code": region_code,
                "confirmed": row["DE-%s_cases" % region_code],
                "deceased": row["DE-%s_deaths" % region_code],
                "date": row["date"],
            })
    df = DataFrame.from_records(records)

    # Ensure we only take one record per (date, region) pair
    df = df.groupby(["date", "subregion1_code"]).last().reset_index()

    # Cumulative values -> daily deltas, then tag with the country code
    df = grouped_diff(df, ["subregion1_code", "date"])
    df["country_code"] = "DE"
    return df
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the covidtracking.com state table into daily counts."""
    column_map = {
        "date": "date",
        "state": "subregion1_code",
        "positive": "confirmed",
        "death": "deceased",
        "total": "tested",
        "recovered": "recovered",
    }
    df = dataframes[0].rename(columns=column_map)

    # Dates arrive as YYYYMMDD values; convert to ISO format
    df["date"] = df["date"].apply(lambda x: datetime_isoformat(x, "%Y%m%d"))

    # Build the location key directly from the state code
    df["key"] = "US_" + df["subregion1_code"]
    df = df[["date", "key", "confirmed", "deceased", "tested", "recovered"]]

    # Reported values are cumulative; convert to daily deltas
    return grouped_diff(df, ["key", "date"])
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the Portugal wide table into daily counts per region."""
    data = dataframes[0]

    # Keep only the per-statistic value columns
    column_tokens = ["confirmed_", "deaths_", "recovered_"]
    data = data[[col for col in data.columns if any(token in col for token in column_tokens)]]
    # Drop the "*_new" daily-delta columns; deltas are recomputed below
    data = data.drop(
        columns=["cases_confirmed_new", "cases_unconfirmed_new", "deaths_new", "recovered_new"]
    )
    # Re-attach the date column (removed by the filter above) in ISO format
    data["date"] = dataframes[0].date.apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # Pivot each statistic into long format, then merge them by (date, region)
    subsets = []
    for token in column_tokens:
        df = data[["date"] + [col for col in data.columns if token in col]]
        df = pivot_table(df.set_index("date"), pivot_name="match_string")
        # Take the middle token of the underscore-separated column name
        df.match_string = df.match_string.apply(lambda x: x.split("_", 2)[1])
        df = df.rename(columns={"value": token.split("_")[0]})
        subsets.append(df)
    data = subsets[0]
    for df in subsets[1:]:
        data = data.merge(df, how="outer")

    data = data.rename(columns={"deaths": "deceased"})
    # "unconfirmed" rows cannot be matched to a region; drop them
    data = data[data.match_string != "unconfirmed"]

    # Cumulative values -> daily deltas
    data = grouped_diff(data, ["match_string", "date"])
    data["country_code"] = "PT"
    return data
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the US state-level table into daily counts keyed by state."""
    # Rename the appropriate columns
    data = dataframes[0].rename(
        columns={
            "date": "date",
            "state": "subregion1_name",
            "cases": "confirmed",
            "deaths": "deceased",
        })

    # Add state code to the data; metadata rows with a null
    # subregion2_code are the state-level entries
    us_meta = aux["metadata"]
    us_meta = us_meta[us_meta["country_code"] == "US"]
    us_meta = us_meta[us_meta["subregion2_code"].isna()]
    state_map = {
        idx: code
        for idx, code in us_meta.set_index("subregion1_name")
        ["subregion1_code"].iteritems()
    }
    data["subregion1_code"] = data["subregion1_name"].apply(
        lambda x: state_map[x])

    # Manually build the key rather than doing automated merge for performance reasons
    data["key"] = "US_" + data["subregion1_code"]

    # Now that we have the key, we don't need any other non-value columns
    data = data[["date", "key", "confirmed", "deceased"]]

    # Cumulative values -> daily deltas
    data = grouped_diff(data, ["key", "date"])
    return data
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts):
    """Parse a single-region hospitalization table; the key comes from parse_opts."""
    column_map = {
        "discharged_cumulative": "total_discharged",
        "hospitalized_current": "current_hospitalized",
        "hospitalized_cumulative": "total_hospitalized",
        "icu_current": "current_intensive_care",
        "icu_cumulative": "cumulative_intensive_care",
        "ventilator_current": "current_ventilator",
        "ventilator_cumulative": "cumulative_ventilator",
    }
    df = dataframes[0].rename(columns=column_map)

    # The region key is supplied by the data source configuration
    df["key"] = parse_opts["key"]
    df.date = df.date.astype(str)

    # "cumulative" columns are converted into daily deltas; every other
    # column (current snapshots) is passed through unchanged
    cumulative = [col for col in df.columns if "cumulative" in col]
    if cumulative:
        passthrough = [col for col in df.columns if col not in cumulative]
        df = grouped_diff(df, ["key", "date"], skip=passthrough)

    return df
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the UK indicator time series into one record per (date, country)."""
    # Aggregate indicator time series data into relational format: each
    # (Date, Country) group contributes one record whose extra keys come
    # from the group's Indicator/Value pairs
    records = []
    for idx, rows in dataframes[0].groupby(["Date", "Country"]):
        records.append({
            "date": idx[0],
            "subregion1_name": idx[1],
            **{
                record.loc["Indicator"]: record.loc["Value"]
                for _, record in rows.iterrows()
            },
        })
    data = DataFrame.from_records(records).rename(columns={
        "ConfirmedCases": "confirmed",
        "Deaths": "deceased",
        "Tests": "tested"
    })

    # Nullable Int64 keeps missing values as NA instead of failing the cast
    for col in ("confirmed", "deceased", "tested"):
        data[col] = data[col].apply(safe_int_cast).astype("Int64")

    # Cumulative values -> daily deltas
    data = grouped_diff(data, ["subregion1_name", "date"])

    # "UK" rows are country-level records; null out their region name
    data.loc[data["subregion1_name"] == "UK", "subregion1_name"] = None
    data["subregion2_code"] = None
    data["country_code"] = "GB"
    return data
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Italy per-region time series into daily counts."""
    df = dataframes[0].rename(columns=_column_map)

    # Normalize timestamps into ISO-formatted date strings
    df["date"] = df["date"].apply(
        lambda value: datetime.fromisoformat(value).date().isoformat())

    # Discard any column not covered by the column mapping
    known_columns = set(_column_map.values())
    df = df[[col for col in df.columns if col in known_columns]]

    # Convert cumulative values into daily deltas; the skipped columns
    # are snapshots or already-daily values and pass through unchanged
    df = grouped_diff(
        df,
        ["match_string", "date"],
        skip=[
            "new_confirmed",
            "total_confirmed",
            "current_intensive_care",
            "current_hospitalized",
        ],
    )

    # All records belong to Italy
    df["country_code"] = "IT"
    return df
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the openZH canton-level table into daily counts."""
    column_map = {
        "ncumul_tested": "tested",
        "ncumul_conf": "confirmed",
        "ncumul_deceased": "deceased",
        "ncumul_hosp": "hospitalized",
        "ncumul_ICU": "intensive_care",
        "ncumul_vent": "ventilator",
        "ncumul_released": "recovered",
        "abbreviation_canton_and_fl": "subregion1_code",
    }
    df = dataframes[0].rename(columns=column_map)
    df = df.drop(columns=["time", "source"])

    # TODO: Match FL subdivision (not a canton?)
    df = df[df.subregion1_code != "FL"]

    # Cumulative values -> daily deltas, then tag with the country code
    df = grouped_diff(df, ["subregion1_code", "date"])
    df["country_code"] = "CH"
    return df
def parse_dataframes(self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame: data = dataframes[0] # Adjust 7 hour difference between China's GMT+8 and GMT+1 data["date"] = data["updateTime"].apply( lambda date: timezone_adjust(date, 7)) # Rename the appropriate columns data = data.rename( columns={ "countryEnglishName": "country_name", "provinceEnglishName": "match_string", "province_confirmedCount": "confirmed", "province_deadCount": "deceased", "province_curedCount": "recovered", }) # Filter specific country data only data = data[data["country_name"] == parse_opts["country_name"]] # This is time series data, get only the last snapshot of each day data = (data.sort_values("updateTime").groupby( ["date", "country_name", "match_string"]).last().reset_index()) keep_columns = [ "date", "country_name", "match_string", "confirmed", "deceased", "recovered", ] return grouped_diff(data[keep_columns], ["country_name", "match_string", "date"])
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Mexico wide table into daily counts per state."""
    # Rename the appropriate columns
    data = dataframes[0].rename(columns={
        "Fecha": "date"
    }).set_index("date")

    # Deceased counts live in columns suffixed "_D"; the matching
    # confirmed counts use the same column name without the suffix
    deceased_columns = [col for col in data.columns if col.endswith("_D")]
    confirmed_columns = [col[:-2] for col in deceased_columns]
    deceased = data[deceased_columns]
    confirmed = data[confirmed_columns]
    # Align column names so both frames pivot to the same region codes
    deceased.columns = confirmed.columns
    deceased = pivot_table(
        deceased,
        pivot_name="subregion1_code").rename(columns={"value": "deceased"})
    confirmed = pivot_table(confirmed, pivot_name="subregion1_code").rename(
        columns={"value": "confirmed"})
    data = confirmed.merge(deceased).sort_values(
        ["date", "subregion1_code"])

    # Cumulative values -> daily deltas, then tag with the country code
    data = grouped_diff(data, ["subregion1_code", "date"])
    data["country_code"] = "MX"
    return data
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Canada provincial table into daily counts."""
    # Rename the appropriate columns; the French province name is redundant
    data = (dataframes[0].rename(
        columns={
            "prname": "subregion1_name",
            "numconf": "confirmed",
            "numdeaths": "deceased",
            "numtested": "tested",
            "numrecover": "recovered",
        }).drop(columns=["prnameFR"]))

    # Convert date to ISO format
    data["date"] = data["date"].apply(
        lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # Compute the daily counts from the cumulative values
    data = grouped_diff(data, ["subregion1_name", "date"])

    # Make sure all records have the country code
    data["country_code"] = "CA"

    # Country-level records should have null region name
    country_mask = data["subregion1_name"] == "Canada"
    data.loc[country_mask, "subregion1_name"] = None

    # Remove bogus data: rows whose region name mentions "traveller";
    # the `or ""` guard keeps the null names set above from crashing
    data = data[~data["subregion1_name"].apply(lambda x: "traveller" in
                                               (x or "").lower())]

    # Output the results
    return data
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the three datadista Spain tables into daily counts per region.

    Expects three frames sharing (fecha, CCAA) keys: cumulative confirmed
    cases, cumulative deaths, and hospitalizations.
    """
    join_keys = ["fecha", "CCAA"]
    join_opts = {"on": join_keys, "how": "outer"}

    # Join the three tables; the overlapping "total" columns of the first
    # two frames are disambiguated via suffixes
    data = dataframes[0]
    data = merge(data, dataframes[1], suffixes=("confirmed", "deceased"), **join_opts)
    data = merge(data, dataframes[2], suffixes=("", ""), **join_opts)

    # NOTE: the original code also assigned country_code here, but that
    # value was dropped by the column subset below and re-added after the
    # diff, so the early assignment was dead code and has been removed.
    data = data.rename(
        columns={
            "fecha": "date",
            "CCAA": "match_string",
            "totalconfirmed": "confirmed",
            "totaldeceased": "deceased",
            "total": "hospitalized",
        }
    ).sort_values(["match_string", "date"])

    # Keep only the columns we can process
    data = data[["date", "match_string", "confirmed", "deceased", "hospitalized"]]

    # Reported values are cumulative; compute the diff for each day
    data = grouped_diff(data, keys=["match_string", "date"])

    # Add a country code column to all records
    data["country_code"] = "ES"
    return data
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame: data = read_file( sources[0], error_bad_lines=False, encoding="ISO-8859-1", sep=";" ).rename( columns={ "Date": "date", "Nombre de personnes en soins normaux": "current_hospitalized", "Nombre de personnes en soins intensifs (sans patients du Grand Est)": "current_intensive_care", "Nombre de décès - cumulé (sans patients du Grand Est)": "deceased", "Total patients COVID ayant quitté l'hôpital (hospitalisations stationnaires, données brutes)": "recovered", "Nombre de nouvelles personnes testées COVID+ par jour ": "tested", }) # Get date in ISO format data.date = data.date.apply( lambda x: datetime_isoformat(x, "%d/%m/%Y")) # Keep only columns we can provess data = data[[ "date", "current_hospitalized", "current_intensive_care", "deceased", "recovered", "tested", ]] # Convert recovered into a number data.recovered = data.recovered.apply( lambda x: int(x.replace("-", "0"))) # Compute the daily counts data["key"] = "LU" data_new = grouped_diff(data[["key", "date", "deceased"]], ["key", "date"]) data_cum = grouped_cumsum(data[["key", "date", "tested", "recovered"]], ["key", "date"]) data_cur = data[[ "key", "date", "current_hospitalized", "current_intensive_care" ]] data = data_new.merge(data_cum, how="outer").merge(data_cur, how="outer") # Output the results return data
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Netherlands municipality table and roll it up to province and country."""
    # Rename the appropriate columns
    data = dataframes[0].rename(
        columns={
            "Date_of_report": "date",
            "Municipality_code": "subregion2_code",
            "Municipality_name": "subregion2_name",
            "Province": "subregion1_name",
            "Total_reported": "confirmed",
            "Hospital_admission": "hospitalized",
            "Deceased": "deceased",
        })

    # Drop data without a clear demarcation (unknown municipality/province)
    data = data[~data.subregion1_name.isna()]
    data = data[~data.subregion2_code.isna()]
    data = data[~data.subregion2_name.isna()]

    # Get date in ISO format
    data.date = data.date.apply(
        lambda x: datetime.fromisoformat(x).date().isoformat())

    # Make sure the region code is zero-padded and without prefix
    data["subregion2_code"] = data["subregion2_code"].apply(
        lambda x: x[2:])

    # Look up the pipeline key from the metadata table by municipality code
    data = data.drop(columns=["subregion1_name", "subregion2_name"])
    data = data.merge(aux["metadata"], on="subregion2_code")

    # We only need to keep key-date pair for identification
    data = data[["date", "key", "confirmed", "deceased", "hospitalized"]]

    # Compute the daily counts
    data = grouped_diff(data, ["key", "date"])

    # Aggregate municipalities up to the province (subregion1) level by
    # truncating keys to their first 5 characters (e.g. "NL_ZH")
    l2 = data.copy()
    l2["key"] = l2.key.apply(lambda x: x[:5])
    l2 = l2.groupby(["key", "date"]).sum().reset_index()

    # Aggregate the provinces up to the country-level totals
    l1 = l2.copy().drop(columns=["key"])
    l1 = l1.groupby("date").sum().reset_index()
    l1["key"] = "NL"

    # Output the results
    return concat([l1, l2, data])
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Venezuela wide table of confirmed cases into daily counts."""
    # Pivot the wide (one column per region) table into long format
    df = dataframes[0].set_index("date")
    df = pivot_table(df, pivot_name="match_string")
    df = df.rename(columns={"value": "confirmed"})

    # La Guaira and Los Roques are cities, not regions; drop them
    city_mask = df.match_string.isin(["La Guaira", "Los Roques"])
    df = df[~city_mask]

    # Cumulative values -> daily deltas, then tag with the country code
    df = grouped_diff(df, ["match_string", "date"])
    df["country_code"] = "VE"
    return df
def parse(self, sources: List[str], aux: Dict[str, DataFrame],
          **parse_opts) -> DataFrame:
    """Parse the ISCIII Spain CSV into daily counts, including country totals."""
    # Retrieve the CSV files from https://covid19.isciii.es
    df = (read_file(sources[0],
                    error_bad_lines=False,
                    encoding="ISO-8859-1").rename(
                        columns={
                            "FECHA": "date",
                            "CCAA": "subregion1_code",
                            "Fallecidos": "deceased",
                            "Hospitalizados": "hospitalized",
                            "UCI": "ICU",
                            "Recuperados": "recovered",
                        }).dropna(subset=["date"]))
    # NOTE(review): "UCI" maps to "ICU" here but to "intensive_care" in
    # the sibling ISCIII parser -- confirm which name the schema expects.

    # Confirmed cases are split across 2 columns (CASOS and PCR+)
    confirmed_columns = ["CASOS", "PCR+"]
    for col in confirmed_columns:
        df[col] = df[col].fillna(0)
    df["confirmed"] = df.apply(
        lambda x: sum([x[col] for col in confirmed_columns]), axis=1)

    # Convert dates to ISO format
    df["date"] = df["date"].apply(
        lambda date: datetime_isoformat(date, "%d/%m/%Y"))

    # Reported cases are cumulative, compute the diff
    df = grouped_diff(df, ["subregion1_code", "date"])

    # Add the country code to all records
    df["country_code"] = "ES"

    # Country-wide is the sum of all regions
    country_level = (df.drop(columns=["subregion1_code"]).groupby(
        ["date", "country_code"]).sum().reset_index())
    country_level["subregion1_code"] = None
    df = concat([country_level, df])

    # Output the results
    return df
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Indonesia province table into daily confirmed counts."""
    df = dataframes[0]
    # The first row holds the real header; "Provinsi" is the date column
    df.columns = df.iloc[0]
    df = df.rename(columns={"Provinsi": "date"})
    df = df.iloc[1:].set_index("date")
    # Drop columns with no header
    df = df[df.columns.dropna()]
    # Pivot so each row becomes a (date, province) pair
    df = pivot_table(df.transpose(), pivot_name="match_string")
    df["date"] = df["date"].apply(_parse_date)
    df = df.dropna(subset=["date"])
    df = df.rename(columns={"value": "confirmed"})
    # Nullable Int64 keeps missing counts as NA instead of failing the cast
    df["confirmed"] = df["confirmed"].apply(safe_int_cast).astype("Int64")
    keep_columns = ["date", "match_string", "confirmed"]
    # Drop the total row and the "under investigation" bucket
    df = df[df["match_string"] != "Total"]
    df = df[df["match_string"] != "Dalam proses investigasi"]
    # Cumulative values -> daily deltas
    df = grouped_diff(df[keep_columns], ["match_string", "date"])
    df["country_code"] = "ID"
    return df
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse a covid19-eu-data style table into daily counts per region.

    Both NUTS-2 and NUTS-3 columns are mapped onto "match_string"; each
    source table is expected to contain only one of the two levels --
    TODO confirm no table ships both (the rename would then produce
    duplicate column labels).
    """
    data = dataframes[0].rename(
        columns={
            "datetime": "date",
            "country": "country_code",
            "nuts_2": "match_string",
            "nuts_3": "match_string",
            "cases": "confirmed",
            "deaths": "deceased",
            "tests": "tested",
            "recovered": "recovered",
            "hospitalized": "hospitalized",
            "intensive_care": "icu",
        })

    # Normalize timestamps into ISO-formatted date strings
    data["date"] = data["date"].apply(lambda x: datetime.fromisoformat(x))
    data["date"] = data["date"].apply(lambda x: x.date().isoformat())

    # Remove bogus data: null, placeholder, empty and URL-like region names
    blacklist = ("unknown", "unknown county", "nezjištěno",
                 "outside mainland norway")
    data = data.dropna(subset=["match_string"])
    data.match_string = data.match_string.str.lower()
    data = data[~data["match_string"].isin(blacklist)]
    data = data[~data["match_string"].
                apply(lambda x: len(x) == 0 or x.startswith("http"))]

    # Remove unnecessary columns: any column with "/" in its name.
    # (Fixed `not "/" in col` to the idiomatic `"/" not in col`, E713.)
    data = data[[col for col in data.columns if "/" not in col]]

    # Some tables have repeated data; keep the last record per group
    data = data.groupby(["country_code", "match_string",
                         "date"]).last().reset_index()

    # Cumulative values -> daily deltas; skipped columns pass through
    return grouped_diff(
        data,
        ["country_code", "match_string", "date"],
        skip=["tests", "recovered", "hospitalized", "icu"],
    )
def parse(self, sources: List[str], aux: Dict[str, DataFrame],
          **parse_opts) -> DataFrame:
    """Parse the ISCIII Spain CSV into daily counts per autonomous community."""
    # Retrieve the CSV files from https://covid19.isciii.es
    data = (read_file(sources[0],
                      error_bad_lines=False,
                      encoding="ISO-8859-1").rename(
                          columns={
                              "FECHA": "date",
                              "CCAA": "subregion1_code",
                              "Fallecidos": "deceased",
                              "Hospitalizados": "hospitalized",
                              "UCI": "intensive_care",
                          }).dropna(subset=["date"]))

    # Confirmed cases are split across 2 columns (CASOS and PCR+)
    confirmed_columns = ["CASOS", "PCR+"]
    for col in confirmed_columns:
        data[col] = data[col].fillna(0)
    data["confirmed"] = data.apply(
        lambda x: sum([x[col] for col in confirmed_columns]), axis=1)

    # Convert dates to ISO format
    data["date"] = data["date"].apply(
        lambda date: datetime_isoformat(date, "%d/%m/%Y"))

    # Keep only the columns we can process
    data = data[[
        "date", "subregion1_code", "confirmed", "deceased", "hospitalized",
        "intensive_care"
    ]]

    # Reported cases are cumulative, compute the diff
    data = grouped_diff(data, ["subregion1_code", "date"])

    # Add the country code to all records
    data["country_code"] = "ES"

    # Output the results
    return data
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the datadista Spain tables and append country-level totals."""
    join_keys = ["fecha", "CCAA"]
    join_opts = {"on": join_keys, "how": "outer"}

    # Join the three tables; the overlapping "total" columns of the
    # first two frames are disambiguated via suffixes
    data = dataframes[0]
    data = merge(data, dataframes[1], suffixes=("confirmed", "deceased"),
                 **join_opts)
    data = merge(data, dataframes[2], suffixes=("", ""), **join_opts)

    # country_code is assigned before the diff so the country-level
    # groupby below can use it
    data["country_code"] = "ES"
    data = data.rename(
        columns={
            "fecha": "date",
            "CCAA": "match_string",
            "totalconfirmed": "confirmed",
            "totaldeceased": "deceased",
            "total": "hospitalized",
        }).sort_values(["match_string", "date"])

    # Data is cumulative, compute the diff
    data = grouped_diff(data, ["match_string", "date"])

    # Compute the country-level stats by adding all subregions
    data_country = data.groupby(["date",
                                 "country_code"]).sum().reset_index()
    data_country["match_string"] = "total"
    data = concat([data, data_country])
    return data[[
        "date",
        "country_code",
        "match_string",
        "confirmed",
        "deceased",
        "hospitalized",
    ]]
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Italy country-level time series into daily counts."""
    df = dataframes[0].rename(columns=_column_map)

    # Normalize timestamps into ISO-formatted date strings
    df["date"] = df["date"].apply(
        lambda value: datetime.fromisoformat(value).date().isoformat())

    # Keep only the columns the pipeline can process
    df = df[[
        "date",
        "intensive_care",
        "hospitalized",
        "quarantined",
        "new_confirmed",
        "recovered",
        "deceased",
        "total_confirmed",
        "tested",
    ]]

    # Cumulative values -> daily deltas; the new/total confirmed columns
    # pass through unchanged
    df["country_code"] = "IT"
    df = grouped_diff(df, ["country_code", "date"],
                      skip=["new_confirmed", "total_confirmed"])

    # Country-level records carry a null region code
    df["subregion1_code"] = None
    return df
def parse_dataframes(self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame: column_map = { "date": "date", "state": "subregion1_code", "positive": "confirmed", "death": "deceased", "total": "tested", "recovered": "recovered", "hospitalizedCurrently": "currently_hospitalized", "hospitalizedCumulative": "hospitalized", "inIcuCurrently": "currently_intensive_care", "inIcuCumulative": "intensive_care", "onVentilatorCurrently": "currently_ventilator", "onVentilatorCumulative": "ventilator", } # Rename the appropriate columns data = dataframes[0].drop(columns=["hospitalized"]).rename( columns=column_map) # Convert date to ISO format data["date"] = data["date"].apply( lambda x: datetime_isoformat(x, "%Y%m%d")) # Keep only columns we can process data["key"] = "US_" + data["subregion1_code"] data = data[["key"] + list(column_map.values())].drop( columns=["subregion1_code"]) # Compute the daily counts curr_columns = [col for col in data.columns if "current" in col] data = grouped_diff(data, ["key", "date"], skip=curr_columns) # Output the results return data
def parse_dataframes(self, dataframes: List[DataFrame],
                     aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Portugal wide table into daily counts per region."""
    df = dataframes[0]
    df["date"] = df["data"].apply(
        lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # Extract regions from the data: columns look like "confirmados_<region>"
    regions = [
        col.split("_")[-1] for col in df.columns
        if col.startswith("confirmados_")
    ]
    # Keep only true region suffixes, dropping daily-new ("novos") and
    # abroad ("estrangeiro") columns
    regions = [
        region for region in regions
        if len(region) > 2 and region not in ("novos", "estrangeiro")
    ]

    # Aggregate regions into a single data frame
    subsets = []
    for region in regions:
        subset = df[["date"] + [
            "{}_{}".format(col, region)
            for col in ("confirmados", "obitos", "recuperados")
        ]].copy()
        # Remove the "ars" token to obtain the plain region name
        subset["match_string"] = region.replace("ars", "")
        subset = subset.rename(
            columns={
                "confirmados_%s" % region: "confirmed",
                "obitos_%s" % region: "deceased",
                "recuperados_%s" % region: "recovered",
            })
        subsets.append(subset)
    df = concat(subsets)

    # Cumulative values -> daily deltas, then tag with the country code
    df = grouped_diff(df, ["match_string", "date"])
    df["country_code"] = "PT"
    return df