def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    with open(sources[0], "r") as fd:
        features = json.load(fd)["features"]

    records = {"hospitalized": [], "intensive_care": [], "ventilator": []}
    for record in features:
        if record["SERIE"] == "HPT":
            statistic = "hospitalized"
        elif record["SERIE"] == "CSR":
            statistic = "intensive_care"
        elif record["SERIE"] == "CCR":
            statistic = "ventilator"
        else:
            # Report the unrecognized series label itself; `statistic` is unbound here
            self.errlog(f"Unknown statistic type: {record['SERIE']}")
            continue
        records[statistic].append(
            {
                "date": datetime.fromtimestamp(record["FECHA"] / 1000).date().isoformat(),
                f"current_{statistic}": record["CV19"],
            }
        )

    dataframes = []
    for df in records.values():
        dataframes.append(DataFrame.from_records(df).groupby("date").sum().reset_index())

    data = table_multimerge(dataframes, how="outer")
    data["key"] = "ES_CN"
    return data
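# `table_multimerge` is a shared helper used throughout these parsers. The
# sketch below is a hypothetical stand-in illustrating its assumed semantics
# (fold a list of tables into one via pandas merge on their shared columns);
# it is not the project's actual implementation.
from functools import reduce
from typing import List

from pandas import DataFrame


def table_multimerge_sketch(dataframes: List[DataFrame], **merge_opts) -> DataFrame:
    # Merge pairwise, left to right; pandas joins on the intersection of column
    # names when `on` is not given, so consecutive tables must share at least
    # one column (e.g. "date"). `merge_opts` is forwarded, e.g. how="outer",
    # under which a date present in any input survives the merge with NaN for
    # the statistics it lacks.
    return reduce(lambda left, right: left.merge(right, **merge_opts), dataframes)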
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    data = table_multimerge(
        [
            table_rename(dataframes["confirmed"], _column_adapter, drop=True),
            table_rename(dataframes["deceased"], _column_adapter, drop=True),
        ],
        how="outer",
    )

    # Province names are sometimes codes (but not always compliant with ISO codes)
    data["subregion1_code"] = data["subregion1_name"].apply(_province_map.get)
    data.drop(columns=["subregion1_name"], inplace=True)

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # Aggregate to the subregion1 level
    l1_index = ["date", "subregion1_code"]
    l1 = data.drop(columns=["match_string"]).groupby(l1_index).sum().reset_index()

    # Make sure all records have the country code and subregion2_name
    l1["country_code"] = "CA"
    l1["subregion2_name"] = None
    data["country_code"] = "CA"
    data["subregion2_name"] = ""

    # Remove bogus data
    data = data[data["match_string"] != "Not Reported"]

    # Output the results
    return concat([l1, data])
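# `_column_adapter` is defined elsewhere in this module; below is a
# hypothetical example of its shape, plus a sketch of the assumed
# `table_rename` semantics where drop=True discards any column the adapter
# does not map. Names suffixed _example/_sketch are illustrative only.
_column_adapter_example = {
    "Date": "date",  # hypothetical source headers, for illustration
    "Province": "subregion1_name",
    "Confirmed": "total_confirmed",
}


def table_rename_sketch(data: DataFrame, adapter: dict, drop: bool = False) -> DataFrame:
    data = data.rename(columns=adapter)
    if drop:
        # Keep only the columns the adapter maps to
        data = data[[col for col in adapter.values() if col in data.columns]]
    return data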
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    data = table_multimerge(
        [
            Covid19ZaCumulativeDataSource._parse_variable(df, name)
            for df, name in zip(
                dataframes,
                ["total_confirmed", "total_deceased", "total_recovered", "total_tested"],
            )
        ],
        how="outer",
    )

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

    # Country-level records should have "total" as the region name
    country_mask = data["subregion1_code"] == "total"
    data.loc[country_mask, "key"] = "ZA"

    # All other records can provide their own key directly
    data.loc[~country_mask, "key"] = "ZA_" + data.subregion1_code

    # Output the results
    return data
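# `_parse_variable` is a static method on Covid19ZaCumulativeDataSource not
# shown here. Judging by its use above, it likely unpivots one wide table per
# statistic; this is a hypothetical sketch under that assumption.
def _parse_variable_sketch(data: DataFrame, name: str) -> DataFrame:
    # Assumed input: a "date" column plus one column per province (including a
    # "total" column for the whole country); output is long form with the
    # value column named after the statistic.
    return data.melt(id_vars=["date"], var_name="subregion1_code", value_name=name)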
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    with open(sources[0], "r") as fd:
        features = json.load(fd)["features"]

    records = {"confirmed": [], "deceased": [], "recovered": []}
    for record in features:
        if record["TIPO"] == "Casos":
            statistic = "confirmed"
        elif record["TIPO"] == "Fallecidos":
            statistic = "deceased"
        elif record["TIPO"] == "Recuperados":
            statistic = "recovered"
        else:
            # Report the unrecognized type itself; `statistic` is unbound here
            self.errlog(f"Unknown statistic type: {record['TIPO']}")
            continue
        records[statistic].append(
            {
                "date": datetime.fromtimestamp(record["FECHA"] / 1000).date().isoformat(),
                "subregion2_code": record["CODMUN"],
                "subregion2_name": record["MUNICIPIO"],
                f"new_{statistic}": record["CV19_DIA"],
                f"total_{statistic}": record["CV19_AC"],
                "_island": record["ISLA"],
            }
        )

    dataframes = [DataFrame.from_records(df) for df in records.values()]
    data = table_multimerge(dataframes, how="outer")
    data["key"] = "ES_CN_" + data["subregion2_code"].astype(str)

    # Add the country and region code to all records
    data["country_code"] = "ES"
    data["subregion1_code"] = "CN"

    # Aggregate by island and map to a known key
    islands = (
        data.drop(columns=["key", "subregion2_code", "subregion2_name"])
        .groupby(["date", "_island"])
        .sum()
        .reset_index()
    )
    islands["key"] = "ES_CN_" + islands["_island"].apply(_island_map.get)

    # Aggregate the entire autonomous community
    l1 = islands.drop(columns=["key", "_island"]).groupby("date").sum().reset_index()
    l1["key"] = "ES_CN"

    # Drop bogus values
    data = data[data["subregion2_code"] != 0]
    islands = islands[~islands["key"].isna()]

    return concat([data, islands, l1])
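# `_island_map` translates island names as they appear in the feed into the
# subregion2 codes used in the output keys; misses map to None and are
# filtered out above. A purely hypothetical excerpt, for illustration only:
_island_map_example = {
    "TENERIFE": "TF",
    "GRAN CANARIA": "GC",
}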
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    data = table_multimerge(
        [_parse_pivot(df, name) for name, df in dataframes.items()], how="outer"
    )

    # Keep only columns we can process
    data = data[["date", "country_code", "match_string", "new_confirmed", "new_deceased"]]
    return data.fillna(0)
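# `_parse_pivot` is defined elsewhere; given the columns selected above, it
# presumably unpivots a per-region table into long form. A hypothetical
# sketch (the real helper evidently also supplies `country_code`, which this
# sketch omits):
def _parse_pivot_sketch(data: DataFrame, name: str) -> DataFrame:
    # Assumed input: a "date" column plus one column per region; `name` is the
    # statistic (e.g. "confirmed"), which becomes the new_<statistic> column.
    return data.melt(id_vars=["date"], var_name="match_string", value_name=f"new_{name}")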
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    data = table_multimerge(
        [
            table_rename(
                dataframes["confirmed"],
                {
                    "Fecha": "date",
                    "Casos confirmados": "new_confirmed",
                    "Codigo region": "subregion1_code",
                    "Codigo comuna": "subregion2_code",
                },
                drop=True,
            ),
            table_rename(
                dataframes["deceased"],
                {
                    "Fecha": "date",
                    "Casos fallecidos": "total_deceased",
                    "Codigo region": "subregion1_code",
                    "Codigo comuna": "subregion2_code",
                },
                drop=True,
            ),
        ],
        how="outer",
    )

    # Convert date to ISO format
    data["date"] = data["date"].astype(str)

    # Parse region codes as strings
    data["subregion1_code"] = data["subregion1_code"].apply(
        lambda x: numeric_code_as_string(x, 2)
    )
    data["subregion2_code"] = data["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 5)
    )

    # Use proper ISO codes for the subregion1 level
    data["subregion1_code"] = data["subregion1_code"].apply(_SUBREGION1_CODE_MAP.get)

    # Extract cities from the municipalities
    city = _extract_cities(data)

    # We can build the key for the data directly from the subregion codes
    data["key"] = "CL_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    # Drop bogus records from the data
    data.dropna(subset=["subregion1_code", "subregion2_code"], inplace=True)

    return concat([data, city])
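# `numeric_code_as_string` is a shared helper; its assumed behaviour is to
# render a numeric region code as a fixed-width, zero-padded string. A
# hypothetical sketch:
def numeric_code_as_string_sketch(value, digits: int):
    # Codes often arrive as floats (e.g. 13.0); pad to width so "1" -> "01".
    # Returning None for unparseable values lets the later dropna() discard them.
    try:
        return str(int(value)).zfill(digits)
    except (TypeError, ValueError):
        return None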
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data_list = []
    for statistic, source_file in sources.items():
        with open(source_file, "r") as fd:
            df = DataFrame.from_records(json.load(fd)["values"])
        data_list.append(table_rename(df, {"value": statistic}))

    data = table_multimerge(data_list, how="outer")
    data["key"] = "RO"
    return data
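# The source files are assumed to look roughly like the following, with one
# file per statistic and a date field alongside each value so the outer merge
# above can align the series (illustrative example, not real data):
_example_payload = {
    "values": [
        {"date": "2020-06-01", "value": 123},
        {"date": "2020-06-02", "value": 145},
    ]
}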
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts):
    # Read all files in the eurostat folder and merge them together
    eurostat_directory = SRC / "data" / "eurostat"
    dataframes = [read_file(file_name) for file_name in eurostat_directory.glob("*.csv")]
    data = table_multimerge(dataframes, how="outer").dropna(subset=["key"])

    # Use only keys available in the metadata
    return data.merge(aux["metadata"][["key"]], how="inner")
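# `read_file` is a shared helper; a hypothetical sketch of its assumed
# behaviour, dispatching on file extension. Only the CSV branch is exercised
# here; the Texas parser further down also passes sheet_name for Excel files.
from pathlib import Path

from pandas import read_csv, read_excel


def read_file_sketch(path: Path, **read_opts) -> DataFrame:
    if str(path).endswith(".csv"):
        return read_csv(path, **read_opts)
    if str(path).endswith((".xls", ".xlsx")):
        return read_excel(path, **read_opts)
    raise ValueError(f"Unsupported file type: {path}")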
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data_list = []
    for idx, var in enumerate(["total_recovered", "current_intensive_care"]):
        # Use a context manager so the file handle is not leaked
        with open(sources[idx], "r") as fd:
            df = DataFrame.from_records(json.load(fd)["values"])
        data_list.append(table_rename(df, {"value": var}))

    data = table_multimerge(data_list)
    data["key"] = "RO"
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    data = table_multimerge(
        [
            table_rename(
                dataframes["confirmed"],
                {"Fecha": "date", "Total": "new_confirmed", "Region": "match_string"},
                drop=True,
            ),
            # The file name indicates the counts are cumulative, but they are not
            table_rename(
                dataframes["deceased"],
                {"Fecha": "date", "Total": "total_deceased", "Region": "match_string"},
                drop=True,
            ),
            table_rename(
                dataframes["tested"],
                {"Fecha": "date", "numero": "new_tested", "Region": "match_string"},
                drop=True,
            ),
        ],
        how="outer",
    )

    # Convert date to ISO format
    data["date"] = data["date"].astype(str)

    # Extract cities from the regions
    city = _extract_cities(data)

    # Make sure all records have a country code and no subregion code
    data["country_code"] = "CL"
    data["subregion2_code"] = None

    # Drop bogus records from the data
    data.dropna(subset=["date", "match_string"], inplace=True)

    return concat([data, city])
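# `_extract_cities` is defined elsewhere in this module. Based on how its
# output is concatenated with the region records above, a hypothetical sketch:
# copy records whose region matches a known city and give them a locality key.
# The real helper may match on municipality codes instead of names.
_city_map_example = {"Santiago": "CL_RM_13101"}  # hypothetical entry


def _extract_cities_sketch(data: DataFrame) -> DataFrame:
    city = data.copy()
    city["key"] = city["match_string"].apply(_city_map_example.get)
    return city.dropna(subset=["key"])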
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    sheets = []
    sheet_processors = {
        "Trends": TexasDataSource._parse_trends,
        "Tests by day": TexasDataSource._parse_tests,
        "Hospitalization by Day": TexasDataSource._parse_hospitalized,
    }

    for sheet_name, sheet_processor in sheet_processors.items():
        df = sheet_processor(read_file(sources[0], sheet_name=sheet_name))
        df = df.dropna(subset=["date"])
        df["date"] = df["date"].astype(str)
        df["date"] = df["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        sheets.append(df)

    data = table_multimerge(sheets, how="outer")

    for col in data.columns:
        if col != "date":
            data[col] = data[col].apply(safe_float_cast).astype(float)

    data["key"] = "US_TX"
    return data
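# `safe_float_cast` is a shared helper; its assumed behaviour is a best-effort
# float conversion that returns None instead of raising, so the .astype(float)
# above turns bad cells into NaN. A hypothetical sketch:
def safe_float_cast_sketch(value):
    try:
        return float(str(value).replace(",", ""))  # tolerate "1,234"-style strings
    except (TypeError, ValueError):
        return None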
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases_confirmed = table_rename(dataframes["confirmed"], _column_adapter, drop=True).rename(
        columns={"date": "date_new_confirmed"}
    )
    cases_deceased = table_rename(dataframes["deceased"], _column_adapter, drop=True).rename(
        columns={"date": "date_new_deceased"}
    )

    # Translate sex labels
    for df in (cases_confirmed, cases_deceased):
        df["sex"] = df["sex"].apply({"MASCULINO": "male", "FEMENINO": "female"}.get)

    # Convert to time series
    index_columns = ["subregion1_name", "province_name", "subregion2_name"]
    data_confirmed = convert_cases_to_time_series(cases_confirmed, index_columns)
    data_deceased = convert_cases_to_time_series(cases_deceased, index_columns)

    # Join into a single dataset
    data = table_multimerge([data_confirmed, data_deceased], how="outer")

    # Remove bogus records
    data.dropna(subset=["date"], inplace=True)

    # Set the country code and get the date in ISO format
    data["country_code"] = "PE"
    data["date"] = data["date"].apply(safe_int_cast)
    data["date"] = data["date"].apply(safe_str_cast)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y%m%d"))

    # Properly capitalize department to allow for exact matching
    data["subregion1_name"] = data["subregion1_name"].apply(
        lambda x: _department_map.get(x, x.title())
    )

    # Aggregate by admin level 1
    subregion1 = (
        data.drop(columns=["subregion2_name", "province_name"])
        .groupby(["date", "country_code", "subregion1_name", "age", "sex"])
        .sum()
        .reset_index()
    )
    subregion1["subregion2_name"] = None

    # Try to match based on subregion2_name using fuzzy matching, and set subregion2_name to
    # an empty string to turn off exact matching
    data = data.rename(columns={"subregion2_name": "match_string"})
    data["subregion2_name"] = ""

    # Convert other text fields to lowercase for consistent processing
    data["match_string"] = data["match_string"].apply(fuzzy_text)
    data["province_name"] = data["province_name"].apply(fuzzy_text)

    # Drop bogus records
    data = data[~data["match_string"].isna()]
    data = data[~data["match_string"].isin(["", "eninvestigacion", "extranjero"])]

    # Because we are skipping provinces and going directly from region to district, there are
    # some name collisions which we have to disambiguate manually
    for province1, province2, district in [
        ("lima", "canete", "sanluis"),
        ("lima", "yauyos", "miraflores"),
        ("ica", "chincha", "pueblonuevo"),
        ("canete", "huarochiri", "sanantonio"),
        ("bolognesi", "huaylas", "huallanca"),
        ("lucanas", "huancasancos", "sancos"),
        ("santacruz", "cutervo", "santacruz"),
        ("yauli", "jauja", "yauli"),
        ("yauli", "jauja", "paccha"),
        ("huarochiri", "yauyos", "laraos"),
        ("elcollao", "melgar", "santarosa"),
    ]:
        for province in (province1, province2):
            mask = (data["province_name"] == province) & (data["match_string"] == district)
            data.loc[mask, "match_string"] = f"{district}, {province}"

    # Output the results
    return concat([subregion1, data])
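# `convert_cases_to_time_series` is a shared helper; its assumed behaviour is
# to collapse a line list (one row per case, with one date_<statistic> column
# per statistic) into daily counts. A hypothetical sketch, reusing the
# table_multimerge sketch from above; it assumes demographic columns "age" and
# "sex" are present and kept.
def convert_cases_to_time_series_sketch(cases: DataFrame, index_columns) -> DataFrame:
    keys = list(index_columns) + ["age", "sex"]
    dataframes = []
    for col in [c for c in cases.columns if c.startswith("date_")]:
        statistic = col[len("date_"):]  # e.g. "new_confirmed"
        # One row per case means the group size is the daily count
        counts = cases.groupby(keys + [col]).size().reset_index(name=statistic)
        dataframes.append(counts.rename(columns={col: "date"}))
    return table_multimerge_sketch(dataframes, how="outer")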