def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Build confirmed/deceased records per subregion for Mexico.

    The first dataframe is indexed by its ``Fecha`` column (renamed to ``date``).
    Every column ending in ``_D`` is treated as a deceased series, and the column
    with the same name minus that suffix as the matching confirmed series.

    Returns the merged long-format table after ``grouped_diff``, with
    ``country_code`` set to ``"MX"``.
    """
    suffix = "_D"
    table = dataframes[0].rename(columns={"Fecha": "date"}).set_index("date")

    # Deceased columns carry the suffix; stripping it yields the confirmed column name
    deceased_names = [name for name in table.columns if name.endswith(suffix)]
    confirmed_names = [name[: -len(suffix)] for name in deceased_names]

    confirmed_wide = table[confirmed_names]
    deceased_wide = table[deceased_names]

    # Use the same labels on both tables so they unpivot to identical region codes
    deceased_wide.columns = confirmed_wide.columns

    deceased_long = pivot_table(deceased_wide, pivot_name="subregion1_code")
    deceased_long = deceased_long.rename(columns={"value": "deceased"})

    confirmed_long = pivot_table(confirmed_wide, pivot_name="subregion1_code")
    confirmed_long = confirmed_long.rename(columns={"value": "confirmed"})

    merged = confirmed_long.merge(deceased_long)
    merged = merged.sort_values(["date", "subregion1_code"])

    # Output the results
    result = grouped_diff(merged, ["subregion1_code", "date"])
    result["country_code"] = "MX"
    return result
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Reshape the wide Portuguese table into one row per date and region.

    Columns whose name contains ``confirmed_``, ``deaths_`` or ``recovered_`` are
    kept (minus the explicit ``*_new`` columns), unpivoted per token, and merged
    back together. The result is passed through ``grouped_diff`` and tagged
    with ``country_code`` ``"PT"``.
    """
    source = dataframes[0]
    column_tokens = ["confirmed_", "deaths_", "recovered_"]

    # Keep only the columns that mention one of the tokens, minus the "new" series
    wanted = [
        name for name in source.columns if any(token in name for token in column_tokens)
    ]
    data = source[wanted].drop(
        columns=["cases_confirmed_new", "cases_unconfirmed_new", "deaths_new", "recovered_new"]
    )
    data["date"] = dataframes[0].date.apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # Build one long-format table per token
    subsets = []
    for token in column_tokens:
        token_columns = [name for name in data.columns if token in name]
        long_df = pivot_table(
            data[["date"] + token_columns].set_index("date"), pivot_name="match_string"
        )
        # The region label is the second underscore-separated piece of the column name
        long_df["match_string"] = long_df["match_string"].apply(
            lambda label: label.split("_", 2)[1]
        )
        long_df = long_df.rename(columns={"value": token.split("_")[0]})
        subsets.append(long_df)

    # Outer-merge all per-token tables into a single one
    merged, *remaining = subsets
    for other in remaining:
        merged = merged.merge(other, how="outer")

    merged = merged.rename(columns={"deaths": "deceased"})
    merged = merged[merged.match_string != "unconfirmed"]

    result = grouped_diff(merged, ["match_string", "date"])
    result["country_code"] = "PT"
    return result
def _parse(file_path: str, sheet_name: str, value_name: str):
    """Read one sheet of the Scottish workbook and return it in long format.

    Args:
        file_path: Path of the spreadsheet handed to ``read_file``.
        sheet_name: Name of the sheet to load.
        value_name: Name given to the column holding the unpivoted values.

    Returns:
        A DataFrame with ``date``, ``match_string`` and ``value_name`` columns plus
        ``key``, ``country_code`` and ``subregion1_code`` metadata. ``key`` is only
        filled (``"GB_SCT"``) for rows whose ``match_string`` is ``"Scotland"``.
    """
    data = read_file(file_path, sheet_name=sheet_name)

    # The real header lives in the second row; strip the "NHS " / " total" decorations
    data.columns = [col.replace("NHS ", "").replace(" total", "") for col in data.iloc[1]]

    # Drop Golden Jubilee National Hospital - it has no hospitalizations and does not fit
    # any current matches in metadata.csv.
    data = data.drop(columns=["Golden Jubilee National Hospital"])

    data = data.iloc[2:].rename(columns={"Date": "date"})
    data = pivot_table(data.set_index("date"), pivot_name="match_string")
    data = data.rename(columns={"value": value_name})

    # "*" marks a suppressed value. Replace it with an explicit NaN: passing `None` as
    # the replacement makes older pandas versions forward-fill the previous value
    # instead of blanking the cell.
    data[value_name] = (
        data[value_name].replace("*", float("nan")).apply(safe_float_cast).astype(float)
    )

    # Get date in ISO format
    data["date"] = data["date"].apply(lambda x: x.date().isoformat())

    # Add metadata
    data["key"] = None
    data["country_code"] = "GB"
    data["subregion1_code"] = "SCT"
    l2_mask = data.match_string == "Scotland"
    data.loc[l2_mask, "key"] = "GB_SCT"

    return data
def _parse(file_path: str, sheet_name: str, value_name: str):
    """Read one sheet of the Scottish workbook and return its values via ``grouped_cumsum``.

    Args:
        file_path: Path of the spreadsheet handed to ``read_file``.
        sheet_name: Name of the sheet to load.
        value_name: Name given to the column holding the unpivoted values.

    Returns:
        The output of ``grouped_cumsum`` over ``match_string`` and ``date``, extended
        with ``key``, ``country_code`` and ``subregion1_code`` metadata. ``key`` is
        only filled (``"GB_SCT"``) for rows whose ``match_string`` is ``"Scotland"``.
    """
    data = read_file(file_path, sheet_name=sheet_name)

    # The real header lives in the second row; strip the "NHS " / " total" decorations
    data.columns = [col.replace("NHS ", "").replace(" total", "") for col in data.iloc[1]]

    data = data.iloc[2:].rename(columns={"Date": "date"})
    data = pivot_table(data.set_index("date"), pivot_name="match_string")
    data = data.rename(columns={"value": value_name})

    # "*" marks a suppressed value. Replace it with an explicit NaN: passing `None` as
    # the replacement makes older pandas versions forward-fill the previous value,
    # which would then be counted again by the cumulative sum below.
    data[value_name] = (
        data[value_name].replace("*", float("nan")).apply(safe_float_cast).astype(float)
    )

    # Get date in ISO format
    data["date"] = data["date"].apply(lambda x: x.date().isoformat())

    # Compute cumsum of values
    data = grouped_cumsum(data, ["match_string", "date"])

    # Add metadata
    data["key"] = None
    data["country_code"] = "GB"
    data["subregion1_code"] = "SCT"
    l2_mask = data.match_string == "Scotland"
    data.loc[l2_mask, "key"] = "GB_SCT"

    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Turn the Spanish ``deceased`` table into long-format ``new_deceased`` records.

    The ``"FECHA / CCAA"`` column becomes ``date`` and all other columns are
    unpivoted into ``match_string``. Rows without a date are dropped from the
    returned table.
    """
    renamed = table_rename(dataframes["deceased"], {"FECHA / CCAA": "date"})
    records = pivot_table(
        renamed.set_index("date"), value_name="new_deceased", pivot_name="match_string"
    )

    # Convert dates to ISO format: keep the first ten characters, then normalize
    truncated_dates = records["date"].apply(lambda raw: str(raw)[:10])
    records["date"] = truncated_dates
    records["date"] = records["date"].apply(
        lambda text: datetime_isoformat(text, "%Y-%m-%d")
    )

    # Add the country code to all records and declare matching as subregion1
    records["country_code"] = "ES"
    records["subregion2_code"] = None
    records["locality_code"] = None

    # Country level is declared as "espana"
    records["key"] = None
    is_country = records["match_string"] == "espana"
    records.loc[is_country, "key"] = "ES"

    # Output the results
    return records.dropna(subset=["date"])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Unpivot the Venezuelan table into ``total_confirmed`` per date and region.

    Rows labelled ``"La Guaira"`` or ``"Los Roques"`` are removed and
    ``country_code`` is set to ``"VE"``.
    """
    excluded_cities = ["La Guaira", "Los Roques"]

    long_table = pivot_table(dataframes[0].set_index("date"), pivot_name="match_string")
    long_table = long_table.rename(columns={"value": "total_confirmed"})

    # Remove cities from output
    is_city = long_table.match_string.isin(excluded_cities)
    data = long_table[~is_city]

    # Add country code and return
    data["country_code"] = "VE"
    return data
def _parse_pivot(data: DataFrame, name: str):
    """Unpivot a Japanese wide table into ``name`` values per date and ``match_string``.

    The last four columns of ``data`` are discarded, dates in ``%Y%m%d`` form are
    converted through ``datetime_isoformat``, and ``country_code`` is set to ``"JP"``.
    """
    # Remove bogus values
    trimmed = data.iloc[:, :-4]

    # Convert date to ISO format
    trimmed["date"] = trimmed["date"].apply(
        lambda raw: datetime_isoformat(str(raw), "%Y%m%d")
    )

    column_names = {"value": name, "pivot": "match_string"}
    records = pivot_table(trimmed.set_index("date")).rename(columns=column_names)

    # Add the country code to all records
    records["country_code"] = "JP"

    # Output the results
    return records
def parse(
    self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Read the Swedish per-region sheet and return ``new_confirmed`` per date and region.

    The ``"Antal per dag region"`` sheet of ``sources[0]`` is loaded, the national
    total column is dropped, and each remaining column becomes a ``match_string``.
    """
    sheet = read_file(sources[0], sheet_name="Antal per dag region")
    sheet = sheet.rename(columns={"Statistikdatum": "date"})

    # Get date in ISO format
    sheet["date"] = sheet["date"].astype(str)

    # Unpivot the regions which are columns
    sheet.columns = [name.replace("_", " ") for name in sheet.columns]
    regions = sheet.drop(columns=["Totalt antal fall"]).set_index("date")
    records = pivot_table(regions, pivot_name="match_string")

    records["country_code"] = "SE"
    return records.rename(columns={"value": "new_confirmed"})
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Turn the Spanish ``deceased`` table into long-format ``new_deceased`` records.

    The ``"FECHA / CCAA"`` column becomes ``date``, the other columns are unpivoted
    into ``match_string``, and ``country_code`` is set to ``"ES"``.
    """
    renamed = table_rename(dataframes["deceased"], {"FECHA / CCAA": "date"})
    records = pivot_table(
        renamed.set_index("date"), value_name="new_deceased", pivot_name="match_string"
    )

    # Convert dates to ISO format by keeping only the first ten characters
    records["date"] = records["date"].apply(lambda raw: str(raw)[:10])

    # Add the country code to all records
    records["country_code"] = "ES"

    # Output the results
    return records
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Combine several Mexican wide tables into one long table.

    Each entry of ``dataframes`` is unpivoted and its values are stored under the
    entry's key; subsequent tables are left-merged onto the first one. Rows
    labelled ``"Nacional"`` receive the country-level key ``"MX"``.
    """
    merged = None
    for column_name, wide in dataframes.items():
        indexed = wide.rename(columns={"Fecha": "date"}).set_index("date")
        long_df = pivot_table(indexed, pivot_name="match_string")
        long_df = long_df.rename(columns={"value": column_name})

        # The first table seeds the result, the rest are left-merged onto it
        merged = long_df if merged is None else merged.merge(long_df, how="left")

    merged["country_code"] = "MX"
    merged["subregion2_code"] = None

    # Country-level have a specific label
    is_national = merged.match_string == "Nacional"
    merged.loc[is_national, "key"] = "MX"

    return merged
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Reshape the Indonesian table into ``total_confirmed`` per date and province.

    The first row of the table is used as its header and the table is transposed
    before being unpivoted. Rows whose date cannot be parsed by ``_parse_date`` and
    the ``"Total"`` / ``"Dalam proses investigasi"`` labels are removed.

    Note: the column labels of ``dataframes[0]`` are overwritten in place.
    """
    table = dataframes[0]

    # Promote the first row to the header, then discard it from the body
    table.columns = table.iloc[0]
    table = table.rename(columns={"Provinsi": "date"})
    table = table.iloc[1:].set_index("date")

    # Ignore columns without a label
    labelled = table[table.columns.dropna()]

    records = pivot_table(labelled.transpose(), pivot_name="match_string")
    records["date"] = records["date"].apply(_parse_date)
    records = records.dropna(subset=["date"]).rename(columns={"value": "total_confirmed"})

    records["total_confirmed"] = (
        records["total_confirmed"].apply(safe_int_cast).astype("Int64")
    )
    records = records[["date", "match_string", "total_confirmed"]]

    # Drop the aggregate row and the "under investigation" bucket
    for excluded_label in ("Total", "Dalam proses investigasi"):
        records = records[records["match_string"] != excluded_label]

    records["country_code"] = "ID"
    return records
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Combine the Mexican wide tables, given in a fixed order, into one long table.

    The dataframes are paired positionally with ``confirmed``, ``deceased``,
    ``tested``, ``hospitalized`` and ``intensive_care``. Each is unpivoted and
    left-merged onto the first; the merged table then goes through
    ``grouped_cumsum``. Rows labelled ``"Nacional"`` receive the key ``"MX"``.
    """
    ordered_columns = ["confirmed", "deceased", "tested", "hospitalized", "intensive_care"]

    merged = None
    for column_name, wide in zip(ordered_columns, dataframes):
        indexed = wide.rename(columns={"Fecha": "date"}).set_index("date")
        long_df = pivot_table(indexed, pivot_name="match_string")
        long_df = long_df.rename(columns={"value": column_name})

        # The first table seeds the result, the rest are left-merged onto it
        merged = long_df if merged is None else merged.merge(long_df, how="left")

    # Compute the cumsum of data
    result = grouped_cumsum(merged, ["match_string", "date"])
    result["country_code"] = "MX"

    # Country-level have a specific label
    is_national = result.match_string == "Nacional"
    result.loc[is_national, "key"] = "MX"

    return result
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Extract confirmed/deceased counts per subregion from a wiki HTML page.

    Every ``table.wikitable`` in the page is tried in order until one yields more
    than 10 dated rows covering more than 3 distinct subregions. If none does, the
    last table tried is used.

    Options read from ``parse_opts``:
        debug: When truthy, print intermediate tables.
        locale: Locale name used for date parsing (default ``"en_US"``).
        skiprows: Forwarded to ``read_html`` (default ``0``).
        droprows: Comma-separated index labels to drop, best effort.
        date_format: Required. ``strptime``-style format; if it lacks ``%Y`` the
            current year is appended to every date.
        cumsum: When truthy the table holds cumulative values, otherwise daily ones.
        country: Required. Value stored in ``country_code``.
        drop_column: Optional name of an output column to drop.
        aggregation_level: ``1`` (default) nulls ``subregion2_code``, ``2`` nulls
            ``subregion1_code``.

    Returns:
        One row per ``match_string`` and ``date`` with ``confirmed``, ``deceased``
        and their ``new_`` / ``total_`` variants.

    Raises:
        AssertionError: If no table produced any data.
        ValueError: If ``aggregation_level`` is neither 1 nor 2.

    Note: this changes the process-wide ``LC_TIME`` locale and does not restore it.
    """
    debug = parse_opts.get("debug")
    if debug:
        print("File name:", sources[0])

    # Get the file contents from source, making sure the handle is closed
    with open(sources[0]) as source_file:
        html_content = source_file.read()

    # We need to set locale in order to parse dates properly
    locale.setlocale(locale.LC_TIME, parse_opts.get("locale", "en_US") + ".UTF-8")

    # Tables keep changing order, so iterate through all until one looks good
    table_count = count_html_tables(html_content, selector="table.wikitable")

    data: DataFrame = None
    for table_index in range(table_count):
        data = read_html(
            html_content,
            header=True,
            selector="table.wikitable",
            parser=wiki_html_cell_parser,
            table_index=table_index,
            skiprows=parse_opts.get("skiprows", 0),
        )

        if debug:
            print("\n[%d] Data:" % (table_index + 1))
            print(data.columns)
            print(data.head(50))

        # Some of the tables are in Spanish
        data = data.rename(columns={"Fecha": "Date"})

        # Set first date column as index, drop other date columns
        columns_lowercase = [(col or "").lower() for col in data.columns]
        date_index = columns_lowercase.index("date") if "date" in columns_lowercase else 0
        del_index = [i for i, col in enumerate(columns_lowercase) if col == "date"][1:]
        data = data.iloc[:, [i for i, _ in enumerate(data.columns) if i not in del_index]]
        data = data.set_index(data.columns[date_index])

        if parse_opts.get("droprows") is not None:
            try:
                data = data.drop(parse_opts["droprows"].split(","))
            except KeyError:
                # Best effort: this table does not contain (all of) the requested rows
                pass

        # Pivot the table to fit our preferred format
        data = pivot_table(data, pivot_name="subregion")
        data = data[~data["subregion"].isna()]

        if debug:
            print("\n[%d] Pivoted:" % (table_index + 1))
            print(data.head(50))

        # Make sure all dates include year
        date_format = parse_opts["date_format"]
        if "%Y" not in date_format:
            date_format = date_format + "-%Y"
            data["date"] = data["date"].astype(str) + "-%d" % datetime.datetime.now().year

        # Parse into datetime object, drop if not possible
        data["date"] = data["date"].apply(lambda date: safe_datetime_parse(date, date_format))
        data = data[~data["date"].isna()]

        # If the dataframe is not empty, then we found a good one
        if len(data) > 10 and len(data["subregion"].unique()) > 3:
            break

    # Make sure we have *some* data
    assert data is not None and len(data) > 0, "No usable table found in source"

    # Convert all dates to ISO format
    data["date"] = data["date"].apply(lambda date: date.date().isoformat())

    # Get the confirmed and deaths data from the table
    data["confirmed"] = data["value"].apply(lambda x: safe_int_cast(self._parenthesis(x)[0]))
    data["deceased"] = data["value"].apply(lambda x: safe_int_cast(self._parenthesis(x)[1]))

    # Add up all the rows with same Date and subregion
    data = data.sort_values(["date", "subregion"])
    data = (
        data.drop(columns=["value"])
        .groupby(["subregion", "date"])
        .agg(self._aggregate_region_values)
    )
    data = data.reset_index().sort_values(["date", "subregion"])

    # Add the appropriate columns to the data
    value_columns = ["confirmed", "deceased"]
    for col in value_columns:
        data["new_" + col] = None
        data["total_" + col] = None

    # Iterate over the individual subregions to process the values per group
    is_cumsum = parse_opts.get("cumsum")
    for region in data["subregion"].unique():
        mask = data["subregion"] == region

        for column in value_columns:

            # We can forward-fill values if data is cumsum
            if is_cumsum:
                data.loc[mask, column] = data.loc[mask, column].ffill()

            # Fill NA with zero to allow for column-wide operations
            zero_filled = data.loc[mask, column].fillna(0)

            # Only perform operation if the column is not all NaN
            if sum(zero_filled) > 0:

                # Compute diff of the values region by region if required
                if is_cumsum:
                    data.loc[mask, "new_" + column] = zero_filled.diff()
                    data.loc[mask, "total_" + column] = zero_filled

                # If values are already new daily counts, then empty most likely means zero
                else:
                    data.loc[mask, "new_" + column] = zero_filled
                    data.loc[mask, "total_" + column] = zero_filled.cumsum()

    # Get rid of rows which have all null values
    data = data.dropna(how="all", subset=value_columns)

    # Add the country code to all records
    data["country_code"] = parse_opts["country"]

    # Labels can be any arbitrary column name
    data = data.rename(columns={"subregion": "match_string"})

    # Drop columns if requested
    if "drop_column" in parse_opts:
        data = data.drop(columns=[parse_opts["drop_column"]])

    # Filter out the appropriate levels
    aggregation_level = parse_opts.get("aggregation_level", 1)
    if aggregation_level == 1:
        null_column = "subregion2_code"
    elif aggregation_level == 2:
        null_column = "subregion1_code"
    else:
        # Fail with a clear message instead of an unbound local variable error
        raise ValueError("Unsupported aggregation_level: %r" % (aggregation_level,))
    data[null_column] = None

    # Remove known values that are just noise
    noise_labels = [
        "cml",
        "new",
        "total",
        "tests",
        "deaths",
        "abroad",
        "airport",
        "current",
        "newcases",
        "acumulado",
        "repatriated",
        "totaltested",
        "confirmed cases",
        "unassigned\ncases",
        "airport screening",
    ]
    data["_match_string"] = data["match_string"].apply(lambda x: x.lower())
    data = data[~data["_match_string"].isin(noise_labels)]
    data = data[~data["_match_string"].apply(lambda x: "princess" in x or "total" in x)]

    # Output the results
    if debug:
        print("\nOutput:")
        print(data.head(50))
    return data
def _parse_variable(data: DataFrame, var_name: str) -> DataFrame:
    """Unpivot ``data`` into ``var_name`` values per date and ``subregion1_code``.

    The ``YYYYMMDD``, ``UNKNOWN`` and ``source`` columns are discarded first, so
    only the ``date`` index and the per-region columns reach ``pivot_table``.
    """
    unused_columns = ["YYYYMMDD", "UNKNOWN", "source"]
    regions_by_date = data.drop(columns=unused_columns).set_index("date")
    return pivot_table(regions_by_date, pivot_name="subregion1_code", value_name=var_name)