def connect_parse_data(source: str) -> pd.Series:
    """Scrape Jamaica's six dashboard counters and return headline metrics.

    Args:
        source: URL of the dashboard page.

    Returns:
        pd.Series with total_vaccinations, people_vaccinated,
        people_fully_vaccinated, total_boosters and date.

    Raises:
        ValueError: if the number of counters on the page changes.
    """
    soup = get_soup(source)
    counters = soup.find_all(class_="elementor-counter-number")
    # Explicit raise instead of `assert` so the check survives `python -O`,
    # matching the error style used by the other scrapers in this file.
    if len(counters) != 6:
        raise ValueError("New counter in dashboard?")
    total_vaccinations = clean_count(counters[0]["data-to-value"])
    first_doses = clean_count(counters[1]["data-to-value"])
    second_doses = clean_count(counters[2]["data-to-value"])
    unique_doses = clean_count(counters[3]["data-to-value"])
    booster_shots = clean_count(counters[4]["data-to-value"])
    immunocompromised_doses = clean_count(counters[5]["data-to-value"])
    # Single-dose ("unique") vaccines count towards both initiated and completed.
    people_vaccinated = first_doses + unique_doses
    people_fully_vaccinated = second_doses + unique_doses
    total_boosters = booster_shots + immunocompromised_doses
    date = localdate("America/Jamaica")
    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            "date": date,
        }
    )
def read(source: str):
    """Locate the metrics PDF linked from the landing page and parse it."""
    soup = get_soup(source)
    url = parse_pdf_link(soup, source)
    # Guard against the site swapping the PDF for some other document type.
    if not url.endswith(".pdf"):
        raise ValueError(f"File reporting metrics is not a PDF: {url}!")
    return pd.Series(parse_data(url))
def read(source: str) -> pd.Series:
    """Parse cumulative vaccination metrics from the Greek-language dashboard."""
    soup = get_soup(source)
    for block in soup.find(class_="main").find_all(class_="w3-center"):
        paragraphs = block.find_all("p")
        title = paragraphs[0].text
        if title == "ΣΥΝΟΛΟ ΕΜΒΟΛΙΑΣΜΩΝ":
            total_vaccinations = clean_count(paragraphs[1].text)
            date_match = re.search(r"[\d/]{8,10}", paragraphs[2].text)
            date = clean_date(date_match.group(0), "%d/%m/%Y")
        elif title == "ΣΥΝΟΛΟ 1ης ΔΟΣΗΣ":
            people_vaccinated = clean_count(paragraphs[1].text)
        elif title == "ΣΥΝΟΛΟ 2ης ΔΟΣΗΣ":
            people_fully_vaccinated = clean_count(paragraphs[1].text)
        elif title == "ΣΥΝΟΛΟ 3ης ΔΟΣΗΣ":
            total_boosters = clean_count(paragraphs[1].text)
    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            "date": date,
            "source_url": source,
        }
    )
def read(self, last_update: str) -> pd.DataFrame:
    """Fetch the newest monthly report page and parse entries newer than `last_update`."""
    # Build and fetch the URL for the current month's report.
    month_url = self.base_url_template.format(self._current_month)
    month_soup = get_soup(month_url)
    # Extract the relevant links/records from the page.
    return self._parse_data(month_soup, last_update)
def _parse_single_doses(self):
    """Return the count of completed single-shot (Johnson & Johnson) vaccinations."""
    url = "http://103.247.238.92/webportal/pages/covid19-vaccination-johnson.php"
    raw = self._parse_metrics_raw(get_soup(url, timeout=30), raise_err=False)
    # Sanity check: a one-dose vaccine must not report a separate first dose.
    if raw["people_vaccinated"] != 0:
        raise ValueError("First dose for one dose vaccines should be 0!")
    return raw["people_fully_vaccinated"]
def pipe_total_vax_bfill(self, df: pd.DataFrame, n_days: int) -> pd.DataFrame:
    """Backfill total_vaccinations for the most recent `n_days` from daily zip reports."""
    page = get_soup(self.source_url_ref)
    recent_links = self._get_zip_links(page)[:n_days]
    return self._backfill_total_vaccinations(df, recent_links)
def parse_vaccinations(elem) -> dict:
    """Extract vaccination metrics from the news article linked by `elem`.

    Args:
        elem: soup element inside a news card; the parent card holds the article link.

    Returns:
        dict with whichever of total_vaccinations / people_vaccinated /
        people_fully_vaccinated appear in the article text.
    """
    # Get news text
    url = elem.find_parent(class_="card").find("a").get("href")
    soup = get_soup(url, verify=False)
    text = "\n".join([p.text for p in soup.find("article").find_all("p")])
    # Find metrics. The articles' wording changes over time, so each pattern may
    # legitimately fail to match; missing metrics are simply omitted.
    # (Dead commented-out regex variants from earlier wordings removed.)
    metrics = dict()
    total_vaccinations = re.search(r"疫苗劑數為(?P<count>[\d,]*)劑", text)
    people_vaccinated = re.search(r"已接種人數共有(?P<count>[\d,]*)人", text)
    people_fully_vaccinated = re.search(r"已接種第2劑的?有([\d,]{6,})", text)
    if total_vaccinations:
        metrics["total_vaccinations"] = clean_count(total_vaccinations.group(1))
    if people_vaccinated:
        metrics["people_vaccinated"] = clean_count(people_vaccinated.group(1))
    if people_fully_vaccinated:
        metrics["people_fully_vaccinated"] = clean_count(people_fully_vaccinated.group(1))
    return metrics
def read(self) -> pd.Series:
    """Scrape the source page and return its metrics together with the report date."""
    page = get_soup(self.source_url)
    record = {**self.parse_metrics(page)}
    record["date"] = self.parse_date(page)
    return pd.Series(record)
def read(self):
    """Walk paginated news pages until the parser signals there is nothing further."""
    page_nr = 1
    while page_nr <= self._num_max_pages:
        soup = get_soup(f"{self.source_url}/{page_nr}/", verify=False)
        data, proceed = self.parse_data(soup)
        if not proceed:
            # Parser found what it needed (or an older entry): stop paging.
            break
        page_nr += 1
    return pd.Series(data)
def _parse_data(self) -> pd.Series:
    """Fetch the dashboard and assemble metrics plus report date into a series."""
    page = get_soup(self.source_url)
    record = {**self._parse_metrics(page)}
    record["date"] = self._parse_date(page)
    return pd.Series(data=record)
def read(self) -> pd.Series:
    """Follow the embedded Infogram dashboard and extract all headline metrics."""
    landing = get_soup(self.source_url)
    infogram_url = self.parse_infogram_link(landing)
    infogram_soup = get_soup(infogram_url)
    data = self.parse_infogram_data(infogram_soup)
    record = {
        "date": self.parse_infogram_date(data),
        "source_url": self.source_url,
        "total_vaccinations": self._parse_total_vaccinations(data),
        "people_vaccinated": self._parse_people_vaccinated(data),
        "people_fully_vaccinated": self._parse_people_fully_vaccinated(data),
        "total_boosters": self._parse_boosters(data),
    }
    return pd.Series(record)
def read(self, last_updated: str) -> pd.DataFrame:
    """Collect metrics from news entries published after `last_updated`."""
    soup = get_soup(self.source_url)
    records = []
    for news_info in self._parse_news_info(soup):
        # Stop at the first entry dated before the cutoff.
        if news_info["date"] < last_updated:
            break
        records.append(self._parse_metrics(news_info))
    return pd.DataFrame(records)
def _parse_pdf_link(self, soup) -> str:
    """Locate the vaccination-statistics PDF and resolve its viewer link.

    Args:
        soup: parsed landing page containing the download list.

    Returns:
        Absolute URL of the PDF viewer resource.

    Raises:
        ValueError: if the expected download link or the viewer button is missing.
    """
    pdf_anchor = None
    for a in soup.find(class_="download").find_all("a"):
        if "疫苗接種統計資料" in a["title"]:
            pdf_anchor = a
            break
    if pdf_anchor is None:
        # Previously, when no title matched, the loop variable silently leaked and
        # the *last* link on the page was used. Fail loudly instead.
        raise ValueError("Vaccination statistics PDF link not found on page")
    url_pdf = f"{self.source_url}{pdf_anchor['href']}"
    # The viewer page renders asynchronously; retry a bounded number of times.
    for _ in range(10):
        soup = get_soup(url_pdf)
        button = soup.find(class_="viewer-button")
        if button is not None:
            return f"{self.source_url}{button['href']}"
    # Previously a missing button produced an opaque TypeError on `None['href']`.
    raise ValueError("Viewer button not found after 10 attempts")
def _parse_metrics(self, news_info: dict):
    """Extract the three headline dose counts from a single news article."""
    soup = get_soup(news_info["link"])
    text = clean_string(soup.text)
    total, first, second = re.search(self.regex["metrics"], text).group(1, 2, 3)
    return {
        "total_vaccinations": clean_count(total),
        "people_vaccinated": clean_count(first),
        "people_fully_vaccinated": clean_count(second),
        "source_url": news_info["link"],
        "date": news_info["date"],
    }
def vaccines_approved(self, location: str = None, original_names: bool = False) -> list:
    """Get list of approved vaccines in a country (or all if None specified).

    Args:
        location (str, optional): Country name. If None, retrieves all approved
            vaccines. Defaults to None.
        original_names (bool, optional): Set to True to keep vaccine names as
            published on the web. Defaults to False.

    Returns:
        list: Approved vaccines, or None if the location lookup/parse fails.
    """
    if location:
        try:
            country_soup = get_soup(self.get_country_url(location))
            return self._parse_vaccines_location(country_soup, original_names)
        except ValueError:
            # Unknown location (or unparsable page): signal "no data".
            return None
    all_soup = get_soup(self.all_vaccines_url)
    return self._parse_vaccines_all(all_soup, original_names)
def parse_data(self, soup: BeautifulSoup) -> tuple:
    """Parse the newest news element; return (record, stop_flag)."""
    elem = self.get_last_element(soup)
    if elem is None:
        # Nothing left to parse: tell the caller to stop paging.
        return None, True
    news_soup = get_soup(elem["link"], verify=False)
    record = {
        "source_url": elem["link"],
        "date": elem["date"],
        **self.parse_data_news_page(news_soup),
    }
    return record, False
def read(self, last_update: str) -> pd.DataFrame:
    """Iterate news pages, collecting records until `last_update` is reached."""
    records = []
    for cnt in range(self._num_max_pages):
        url = f"{self.source_url}/hirek?page={cnt}/"
        page = get_soup(url)
        page_records, proceed = self.parse_data(page, last_update)
        records.extend(page_records)
        if not proceed:
            break
    return pd.DataFrame(records)
def read(self) -> pd.Series:
    """Scrape metrics and vaccine names; stamp with today's date in Dhaka."""
    soup = get_soup(self.source_url, timeout=30)
    metrics = self._parse_metrics(soup)
    vaccines = self._parse_vaccines(soup)
    record = {**metrics, "date": localdate("Asia/Dhaka"), "vaccine": vaccines}
    return pd.Series(data=record)
def connect_parse_data(source: str) -> pd.Series:
    """Scrape the headline counter; total doses are set equal to people vaccinated."""
    soup = get_soup(source)
    people_vaccinated = clean_count(soup.find(class_="count-up").text)
    return pd.Series(
        data={
            "total_vaccinations": people_vaccinated,
            "people_vaccinated": people_vaccinated,
        }
    )
def read(self) -> pd.Series:
    """Collect the four headline metrics plus date and source URL."""
    soup = get_soup(self.source_url)
    total, first, second, boosters = self._parse_metrics(soup)
    return pd.Series({
        "total_vaccinations": total,
        "people_vaccinated": first,
        "people_fully_vaccinated": second,
        "total_boosters": boosters,
        "source_url": self.source_url,
        "date": self._parse_date(soup),
    })
def parse_data(self, soup: BeautifulSoup, last_update: str) -> tuple:
    """Parse news elements newer than `last_update`.

    Returns:
        (records, proceed): `proceed` is False once an entry at or before the
        cutoff is reached, so the caller can stop paginating.
    """
    records = []
    for elem in self.get_elements(soup):
        # Guard clause: reached already-processed news, stop here.
        if elem["date"] <= last_update:
            return records, False
        news_soup = get_soup(elem["link"])
        records.append({
            "source_url": elem["link"],
            **self.parse_data_news_page(news_soup),
        })
    return records, True
def read(self) -> pd.DataFrame:
    """Read the historical xlsx series; cache the latest HTML-table figures on self.latest."""
    soup = get_soup(self.source_url)
    # Get latest figures from HTML table
    table = pd.read_html(str(soup.find("table")))[0]
    date_raw = re.search(
        r"Data in this section is as at 11:59pm ([\d]+ [A-Za-z]+ 202\d)", soup.text
    ).group(1)

    def _cumulative(label):
        # "Cumulative total" cell for the row whose first column equals `label`.
        return table.loc[table["Unnamed: 0"] == label, "Cumulative total"].item()

    self.latest = pd.DataFrame(
        {
            "total_vaccinations": _cumulative("Total doses"),
            "people_vaccinated": _cumulative("First dose"),
            "people_fully_vaccinated": _cumulative("Second dose"),
            "date": [clean_date(date_raw, fmt="%d %B %Y", lang="en")],
        }
    )
    # The full time series comes from the downloadable workbook.
    link = self._parse_file_link(soup)
    return read_xlsx_from_url(link, sheet_name="Date")
def main(paths):
    """Build the Netherlands vaccination time series from the coronadashboard payload."""
    soup = get_soup(URL)
    # The dashboard is a Next.js app: all data sits in the __NEXT_DATA__ JSON blob.
    script = soup.find("script", id="__NEXT_DATA__")
    data = json.loads(script.string)
    # Daily administered doses ("estimated" series), keyed by unix timestamp.
    doses = (pd.DataFrame.from_records(
        data["props"]["pageProps"]["selectedNlData"]
        ["vaccine_administered_total"]["values"]).rename(
            columns={
                "date_unix": "date",
                "estimated": "total_vaccinations"
            }).drop(columns=["reported", "date_of_insertion_unix"]))
    doses["date"] = pd.to_datetime(doses.date, unit="s").dt.date.astype(str)
    # Coverage series: people partially-or-fully and fully vaccinated.
    coverage = (pd.DataFrame.from_records(
        data["props"]["pageProps"]["selectedNlData"]["vaccine_coverage"]
        ["values"]).rename(
            columns={
                "date_end_unix": "date",
                "fully_vaccinated": "people_fully_vaccinated",
                "partially_or_fully_vaccinated": "people_vaccinated",
            }).drop(columns=[
                "date_of_insertion_unix", "partially_vaccinated", "date_start_unix"
            ]))
    coverage["date"] = pd.to_datetime(coverage.date, unit="s").dt.date.astype(str)
    # Outer-join the two series on date so gaps in either remain visible.
    df = (pd.merge(
        doses, coverage, on="date", how="outer",
        validate="one_to_one").sort_values("date").assign(
            location="Netherlands",
            source_url=
            "https://coronadashboard.government.nl/landelijk/vaccinaties",
        ).pipe(enrich_vaccine_name))
    # Drop rows where doses would be fewer than people vaccinated (inconsistent
    # data); rows with either side missing are kept.
    df = df[(df.total_vaccinations >= df.people_vaccinated) |
            (df.total_vaccinations.isna()) | (df.people_vaccinated.isna())]
    df.to_csv(paths.tmp_vax_out("Netherlands"), index=False)
def read(source: str) -> pd.Series:
    """Scrape dose counts from the info boxes; total is the sum of both doses."""
    soup = get_soup(source)
    boxes = soup.find_all(class_="info-box-number")
    # The leading number in each box text is the cumulative count.
    dose1 = clean_count(re.search(r"^[\d,]+", boxes[2].text).group(0))
    dose2 = clean_count(re.search(r"^[\d,]+", boxes[3].text).group(0))
    return pd.Series(
        data={
            "total_vaccinations": dose1 + dose2,
            "people_vaccinated": dose1,
            "people_fully_vaccinated": dose2,
            "date": localdate("Asia/Dhaka"),
        }
    )
def main() -> tuple:
    """Build the Spain hospital/ICU admissions dataset.

    Returns:
        (df, METADATA): long-format DataFrame with date/indicator/value/entity
        columns plus the module METADATA dict. (The previous annotation said
        pd.DataFrame, but a 2-tuple has always been returned.)
    """
    soup = get_soup(METADATA["source_url_ref"])
    url = soup.find(class_="informacion").find("a")["href"]
    url = "https://www.mscbs.gob.es/profesionales/saludPublica/ccayes/alertasActual/nCov/" + url
    df = pd.read_csv(
        url,
        usecols=[
            "Fecha", "Unidad", "OCUPADAS_COVID19", "INGRESOS_COVID19", "Provincia", "CCAA"
        ],
        encoding="Latin-1",
        sep=";",
    )
    df["Fecha"] = clean_date_series(df.Fecha, "%d/%m/%Y")
    # One row per (date, unit, province, region); drop duplicates and rows without a unit.
    df = df.drop_duplicates(subset=["Fecha", "Unidad", "Provincia", "CCAA"],
                            keep="first").dropna(subset=["Unidad"])
    df.loc[df.Unidad.str.contains("U. Críticas"), "Unidad"] = "ICU"
    # Aggregate provinces/regions, then pivot units into columns.
    df = (df.drop(columns=["Provincia", "CCAA"]).groupby(
        ["Fecha", "Unidad"], as_index=False).sum().sort_values("Unidad").pivot(
            index="Fecha", columns="Unidad").reset_index().sort_values("Fecha"))
    df.columns = ["date", "hosp_stock", "icu_stock", "hosp_flow", "icu_flow"]
    # Admissions (flows) are daily; convert to rolling weekly sums.
    df["hosp_flow"] = df.hosp_flow.rolling(7).sum()
    df["icu_flow"] = df.icu_flow.rolling(7).sum()
    df = df.melt("date", var_name="indicator").dropna(subset=["value"])
    df["indicator"] = df.indicator.replace({
        "hosp_flow": "Weekly new hospital admissions",
        "icu_flow": "Weekly new ICU admissions",
        "hosp_stock": "Daily hospital occupancy",
        "icu_stock": "Daily ICU occupancy",
    })
    df["entity"] = METADATA["entity"]
    return df, METADATA
def read(source: str, last_update: str, num_pages_limit: int = 10):
    """Scan paginated news listings and post-process records newer than `last_update`."""
    records = []
    for page_nr in range(1, num_pages_limit):
        # Fetch one listing page.
        url = f"{source}/{page_nr}/"
        soup = get_soup(url, verify=False)
        page_records = parse_data(soup)
        if page_records:
            records.extend(page_records)
            # Stop once any record at or before the cutoff shows up.
            if any(record["date"] <= last_update for record in page_records):
                break
    if pd.Series([r.get("total_vaccinations") for r in records]).notnull().any():
        records = [record for record in records if record["date"] >= last_update]
        if records:
            return postprocess(pd.DataFrame(records))
    return None
def read() -> tuple:
    """Download the Danish regional zip and load hospitalisation flow/stock CSVs.

    Returns:
        (flow, stock): daily new admissions and daily occupancy DataFrames.
        (The previous annotation said pd.DataFrame, but two frames have always
        been returned.)
    """
    soup = get_soup(METADATA["source_url_hosp"])
    zip_url = soup.find("accordions").find("a").get("href")
    with tempfile.TemporaryDirectory() as tf:
        r = requests.get(zip_url)
        # Fail fast on HTTP errors instead of surfacing a confusing BadZipFile later.
        r.raise_for_status()
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(tf)
        flow = pd.read_csv(
            os.path.join(tf, "Regionalt_DB", "06_nye_indlaeggelser_pr_region_pr_dag.csv"),
            encoding="ISO 8859-1",
            sep=";",
            usecols=["Dato", "Indlæggelser"],
        )
        stock = pd.read_csv(
            os.path.join(tf, "Regionalt_DB", "15_indlagte_pr_region_pr_dag.csv"),
            encoding="ISO 8859-1",
            sep=";",
            usecols=["Dato", "Indlagte"],
        )
    return flow, stock
def connect_parse_data(source: str) -> pd.Series:
    """Scrape per-vaccine dose counters (AstraZeneca and Pfizer) and derive totals.

    Raises:
        ValueError: if second-dose counts exceed first-dose counts (scrape error).
    """
    soup = get_soup(source)
    counters = soup.find_all(class_="yellow")
    az_dose1 = clean_count(counters[0].text)
    az_dose2 = clean_count(counters[1].text)
    pfizer_dose1 = clean_count(counters[2].text)
    pfizer_dose2 = clean_count(counters[3].text)
    # Explicit raises instead of `assert` so the sanity checks survive `python -O`.
    if az_dose1 < az_dose2:
        raise ValueError("AstraZeneca second doses exceed first doses")
    if pfizer_dose1 < pfizer_dose2:
        raise ValueError("Pfizer second doses exceed first doses")
    people_vaccinated = az_dose1 + pfizer_dose1
    people_fully_vaccinated = az_dose2 + pfizer_dose2
    total_vaccinations = people_vaccinated + people_fully_vaccinated
    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": localdate("America/St_Lucia"),
    }
    return pd.Series(data=data)
def connect_parse_data(source: str) -> pd.Series:
    """Parse the vaccination-count table from the Japanese dashboard.

    Raises:
        ValueError: if the table layout changes or totals do not add up.
    """
    soup = get_soup(source)
    df = pd.read_html(str(soup.find(class_="vaccination-count")))[0]
    # Explicit raises instead of `assert` so the checks survive `python -O`.
    if df.shape != (3, 7):
        raise ValueError(f"Unexpected vaccination-count table shape: {df.shape}")
    values = df.iloc[:, 2].values
    total_vaccinations = values[0]
    people_vaccinated = values[1]
    people_fully_vaccinated = values[2]
    if total_vaccinations != people_vaccinated + people_fully_vaccinated:
        raise ValueError("Total doses do not equal dose1 + dose2")
    # NOTE: the original code read the page's date text (class "aly_tx_center")
    # and then immediately overwrote it — a dead store, removed here. The local
    # date in Tokyo is what is actually reported.
    date = localdate("Asia/Tokyo")
    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
    return pd.Series(data=data)
def read(source: str) -> pd.Series:
    """Fetch the source page and delegate parsing of its metrics."""
    return parse_data(get_soup(source))