def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # Data is published as snapshots, so we guess the URL based on the date
    opts = dict(fetch_opts[0])
    url_tpl = opts.pop("url")

    # Keep trying URLs in reverse chronological order starting today until one works
    url = None
    date_start = "2021-08-31"
    date_end = date_today(offset=1)
    for date in reversed(list(date_range(date_start, date_end))):
        url = url_tpl.format(date=date.replace("-", ""))
        res = requests.head(url)
        if res.status_code == 200 and int(res.headers.get("Content-Length", "0")) > 0:
            # Pass the actual URL down to fetch it
            url_opts = dict(url=url, **opts)
            return super().fetch(output_folder, cache, [url_opts], skip_existing=skip_existing)
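# Illustration only (not part of the data source): the configuration supplies a URL
# template with a `{date}` placeholder, which the loop above fills with the date
# stripped of dashes before probing it with a HEAD request. With a hypothetical
# template, a probe for 2021-09-14 would look like:
#
#     url_tpl = "https://example.com/snapshots/data_{date}.csv"  # hypothetical
#     url_tpl.format(date="2021-09-14".replace("-", ""))
#     # -> "https://example.com/snapshots/data_20210914.csv"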
def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Extract information about whether doses were first (partial immunization) or second (full)
    cases["date_new_persons_vaccinated"] = None
    cases["date_new_persons_fully_vaccinated"] = None
    first_dose_mask = cases["_dose_information"].str.strip().str.slice(0, 1) == "1"
    second_dose_mask = cases["_dose_information"].str.strip().str.slice(0, 1) == "2"
    cases.loc[first_dose_mask, "date_new_persons_vaccinated"] = cases.loc[
        first_dose_mask, "date_new_vaccine_doses_administered"
    ]
    cases.loc[second_dose_mask, "date_new_persons_fully_vaccinated"] = cases.loc[
        second_dose_mask, "date_new_vaccine_doses_administered"
    ]

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast).astype(str)

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({"m": "male", "f": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # Aggregate data by state
    state = (
        data.drop(columns=["subregion2_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna() & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
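# Illustrative sketch only: `convert_cases_to_time_series` is a shared helper of this
# pipeline whose implementation is not shown here. Conceptually, it turns a line list
# (one row per case, with `date_new_*` columns) into a daily time series (one row per
# index group and date, with `new_*` counts); the real helper also carries demographic
# columns such as age and sex. A rough pandas-only approximation, using the module's
# existing imports and hypothetical names:
def _cases_to_time_series_sketch(cases: DataFrame, index_columns: List[str]) -> DataFrame:
    counts = []
    for col in (c for c in cases.columns if c.startswith("date_")):
        counts.append(
            cases.dropna(subset=[col])
            .groupby(index_columns + [col])
            .size()
            .rename(col.replace("date_", ""))  # e.g. "date_new_confirmed" -> "new_confirmed"
            .rename_axis(index_columns + ["date"])
        )
    # Align all the per-statistic counts on (index columns, date) and fill gaps with zero
    return concat(counts, axis=1).fillna(0).reset_index()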
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Data can only be retrieved one day at a time, and it starts on 2020-01-22
    first = "2020-01-22"
    map_iter = list(date_range(first, date_today()))
    records = sum(thread_map(_get_daily_records, map_iter), [])
    return DataFrame.from_records(records)
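# Note on the flattening above (illustration only): `thread_map` returns one list of
# records per day, and `sum(..., [])` concatenates those lists into a single flat list.
# The same result can be written with itertools, which avoids repeated list copies:
#
#     from itertools import chain
#     records = list(chain.from_iterable(thread_map(_get_daily_records, map_iter)))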
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Data is nested into multiple sheets
    tables = []
    for df in list(dataframes[0].values()):
        # Header has two rows, but we ignore them and use our own columns anyway
        df.columns = _columns
        df = df.iloc[2:].copy()

        # Keep only rows with indexable columns not null
        df.dropna(subset=["date", "subregion2_name"], inplace=True)

        # Add to the tables including all subregions
        tables.append(df.iloc[1:])

    # Put all sheets together into a single DataFrame
    data = concat(tables)

    # Ensure date is in ISO format
    data["date"] = data["date"].apply(lambda x: str(x)[:10])

    # Make sure that all data is numeric
    for col in data.columns:
        if col not in ("date", "subregion2_name"):
            data[col] = data[col].apply(safe_int_cast)

    # Filter out dates beyond today
    data = data[data["date"] < date_today(offset=1)]

    # Output the results
    data["country_code"] = "SL"
    return data
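# Illustration only: `dataframes[0]` is expected to be a mapping of sheet name to
# DataFrame, which is what pandas produces when a workbook is read with
# `sheet_name=None`. A hypothetical standalone equivalent of the input handled above:
#
#     from pandas import read_excel
#     sheets = read_excel("report.xlsx", sheet_name=None, header=None)  # hypothetical file
#     # sheets is a dict like {"Sheet1": DataFrame, "Sheet2": DataFrame, ...}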
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # Data is published as GitHub Releases, so we guess the URL based on the date
    opts = dict(fetch_opts[0])
    url_tpl = opts.pop("url")

    urls = []
    date_start = "2021-01-11"
    date_end = date_today(offset=1)
    for date in date_range(date_start, date_end):
        urls.append(dict(name=date, url=url_tpl.format(date=date.replace("-", "")), **opts))

    # Pass the actual URLs down to fetch them
    return super().fetch(output_folder, cache, urls, skip_existing=skip_existing)
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)
    covid_mask = cases["_classification"] == 5
    # Keep only cases with a known prognosis (parentheses required: `&` binds tighter than `!=`)
    valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
    cases = cases[covid_mask & valid_mask]

    # Record the date of death
    cases["date_new_deceased"] = None
    deceased_mask = cases["_prognosis"] == 2
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date_prognosis"]

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

    # Convert all dates to ISO format
    for col in filter(lambda x: x.startswith("date"), cases.columns):
        cases[col] = cases[col].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Parse subregion codes
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 5)
    )

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion2_code"])
    data["country_code"] = "BR"

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate by country level
    country = (
        data.drop(columns=["subregion2_code"]).groupby(["date", "age", "sex"]).sum().reset_index()
    )
    country["key"] = "BR"

    # Aggregate by state level
    data["subregion1_code"] = data["subregion2_code"].apply(
        lambda x: _IBGE_STATES.get(safe_int_cast(x[:2]))
    )
    state = (
        data.drop(columns=["subregion2_code"])
        .dropna(subset=["subregion1_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # Derive the key from subregion codes
    data = data[data["subregion2_code"].notna()]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
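# Illustrative sketch only: `_IBGE_STATES` and `numeric_code_as_string` are defined
# elsewhere in this pipeline. The state aggregation above relies on the fact that the
# first two digits of an IBGE municipality code identify the state, e.g. codes starting
# with 35 belong to São Paulo ("SP") and codes starting with 33 to Rio de Janeiro ("RJ").
# A minimal, hypothetical stand-in for the lookup:
_IBGE_STATES_SKETCH = {33: "RJ", 35: "SP"}  # truncated; the real mapping covers all 27 units

def _state_from_municipality_code_sketch(code: str):
    # e.g. a code starting with "35" -> 35 -> "SP"; returns None when the prefix is unknown
    return _IBGE_STATES_SKETCH.get(safe_int_cast(code[:2]))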
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # Data is published as GitHub Releases, so we guess the URL based on the date
    opts = fetch_opts[0]
    url_tpl = opts["url"]

    # Go backwards from <today + 1> until the last known date for which data is reported
    # NOTE: at the time of writing, the last known date is October 20
    working_url = None
    last_known_date = "2020-10-20"
    latest_date = date_today(offset=1)
    for date in reversed(list(date_range(last_known_date, latest_date))):
        try:
            url_test = url_tpl.format(date=date.replace("-", "."))
            self.log_debug(f"Trying {url_test}")
            res = requests.get(url_test, timeout=60)
            if res.ok:
                working_url = url_test
                break
        except requests.RequestException:
            continue

    # Make sure that we found a working URL
    assert working_url is not None, "No working URL found for DXY data source"

    # Pass the actual URL down to fetch it
    return super().fetch(
        output_folder, cache, [{**opts, "url": working_url}], skip_existing=skip_existing
    )
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # Data is published as daily snapshots, so we guess the URL based on the date
    opts = dict(fetch_opts[0])
    url_tpl = opts.pop("url")

    urls = []
    date_start = "2020-05-06"
    date_end = date_today(offset=1)
    for date in date_range(date_start, date_end):
        # Snapshot URLs embed the date in DDMMYYYY order, e.g. "2020-05-06" -> "06052020"
        datestr = "".join(reversed(date.split("-")))
        urls.append(dict(name=date, url=url_tpl.format(date=datestr), **opts))

    # Pass the actual URLs down to fetch them
    return super().fetch(output_folder, cache, urls, skip_existing=skip_existing)
def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Confirmed cases are only those with a confirmed positive test result
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_test_result"] == "Positivo"
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[confirmed_mask, "date_new_tested"]

    # Do not process deceased counts, since they are considered highly inaccurate
    # # Deceased cases have a specific label and the date is the "closing" date
    # cases["date_new_deceased"] = None
    # deceased_mask = cases["_prognosis"] == "Óbito"
    # cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date_update"]
    # # Only count deceased cases from confirmed subjects
    # cases.loc[~confirmed_mask, "date_new_deceased"] = None

    # Recovered cases have a specific label and the date is the "closing" date
    cases["date_new_recovered"] = None
    recovered_mask = cases["_prognosis"] == "Cured"
    cases.loc[recovered_mask, "date_new_recovered"] = cases.loc[recovered_mask, "_date_update"]

    # Only count recovered cases from confirmed subjects
    cases.loc[~confirmed_mask, "date_new_recovered"] = None

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast)

    # The last digit of the region code is actually not necessary
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if isna(x) else str(int(x))[:-1]
    )

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({"masculino": "male", "feminino": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = (
        data.drop(columns=["subregion1_code", "subregion2_code"])
        .groupby(["date", "age", "sex"])
        .sum()
        .reset_index()
    )
    country["key"] = "BR"

    # Aggregate data by state
    state = (
        data.drop(columns=["subregion2_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna() & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
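# Illustration only: Brazilian IBGE municipality codes have seven digits, the last of
# which is a verification digit; dropping it above leaves the six-digit code used to
# build the output key. For example, assuming São Paulo city's code 3550308:
#
#     str(int(3550308))[:-1]
#     # -> "355030", so the resulting key would be "BR_SP_355030"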