import re

import pandas as pd

import vaxutils


def read(source: str) -> pd.Series:
    soup = vaxutils.get_soup(source)
    # The landing page only embeds an Infogram widget; follow the embed link
    # and parse the metrics out of the widget's data
    link = parse_infogram_link(soup)
    soup = vaxutils.get_soup(link)
    infogram_data = parse_infogram_data(soup)
    return pd.Series({
        "total_vaccinations": parse_infogram_doses(infogram_data),
        "date": parse_infogram_date(infogram_data),
        "source_url": source,
    })
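# Hypothetical sketch of the parse_infogram_link contract assumed above: it
# pulls the embedded Infogram widget's URL out of the landing page. The CSS
# class, "data-id" attribute, and URL scheme below follow Infogram's standard
# embed snippet, but they are assumptions, not the verified page structure.
def parse_infogram_link_sketch(soup) -> str:
    widget_id = soup.find(class_="infogram-embed").get("data-id")  # assumed embed markup
    return f"https://e.infogram.com/{widget_id}"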
def read(source: str, num_pages_limit: int = 10):
    # Load page
    for page_nr in range(1, num_pages_limit):
        # Get soup
        url = f"{source}/{page_nr}/"
        soup = vaxutils.get_soup(url)
        # Get data
        ds = parse_data(soup)
        if ds is not None:
            return ds
    raise Exception("No news page with vaccination data was found. Check URLs.")
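# Hypothetical sketch of the parse_data contract the pager above relies on:
# inspect one news page and return a pd.Series of metrics, or None when the
# page carries no vaccination figures. The pattern below is an illustrative
# placeholder, not the real page structure.
def parse_data_sketch(soup):
    match = re.search(r"([\d,]+) doses", soup.get_text())  # placeholder pattern
    if match is None:
        return None
    return pd.Series({"total_vaccinations": vaxutils.clean_count(match.group(1))})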
def connect_parse_data(source: str) -> pd.Series:
    soup = vaxutils.get_soup(source)
    people_vaccinated = soup.find(class_="count-up").text
    people_vaccinated = vaxutils.clean_count(people_vaccinated)
    # Only one figure is published, so doses are assumed equal to people vaccinated
    total_vaccinations = people_vaccinated
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
    })
def read(source: str, last_update: str, num_pages_limit: int = 10):
    records = []
    for page_nr in range(1, num_pages_limit):
        # Get soup
        url = f"{source}/{page_nr}/"
        soup = vaxutils.get_soup(url)
        # Get data (if any)
        records_sub = parse_data(soup)
        if records_sub:
            records.extend(records_sub)
            # Stop paging once records from on or before the last update appear
            if any(record["date"] <= last_update for record in records_sub):
                break
    # Keep only records at least as recent as the last update
    records = [record for record in records if record["date"] >= last_update]
    if records:
        return pd.DataFrame(records)
    return None
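# The early exit and the filter above compare dates as plain strings, which is
# only correct when parse_data emits ISO-8601 dates ("YYYY-MM-DD"): those sort
# lexicographically in calendar order. A quick self-contained check:
assert "2021-03-31" <= "2021-04-01" <= "2021-04-10"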
def read(source: str) -> pd.Series:
    soup = vaxutils.get_soup(source)
    total_vaccinations = vaxutils.clean_count(soup.find(class_="stats-decoration-title").text)
    # Only a single figure is published; treat it as first doses only
    people_vaccinated = total_vaccinations
    people_fully_vaccinated = 0
    # Extract a date such as "14 April 2021" from the accompanying text
    date = re.search(r"\d+ \w+ 202\d", soup.find(class_="stats-decoration-text").text).group(0)
    date = vaxutils.clean_date(date, "%d %B %Y")
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    })
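# Minimal check of the date extraction above on an illustrative string: the
# pattern captures day, English month name, and a 202x year, which clean_date
# then normalizes using the "%d %B %Y" format.
assert re.search(r"\d+ \w+ 202\d", "Updated 14 April 2021").group(0) == "14 April 2021"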
def read(source: str) -> pd.Series:
    soup = vaxutils.get_soup(source)
    blocks = soup.find_all(class_="aly_tx_center")
    for block in blocks:
        if "医療従事者等:" in block.text:  # "healthcare workers etc."
            healthcare_workers = vaxutils.clean_count(block.find("font").text)
        elif "高齢者:" in block.text:  # "elderly people"
            elderly = vaxutils.clean_count(block.find("font").text)
    total_vaccinations = healthcare_workers + elderly
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
    })
def parse_vaccinations(elem) -> dict:
    # Get news text
    url = elem.find_parent(class_="card").find("a").get("href")
    soup = vaxutils.get_soup(url)
    text = "\n".join([p.text for p in soup.find("article").find_all("p")])
    # Find metrics; the patterns read "... doses were given to N person-times",
    # with "1劑" / "2劑" marking first and second doses
    metrics = dict()
    total_vaccinations = re.search(r"疫苗共有(?P<count>[\d,]*)人次", text)
    people_vaccinated = re.search(r"1劑疫苗共有(?P<count>[\d,]*)人次", text)
    people_fully_vaccinated = re.search(r"2劑疫苗共有(?P<count>[\d,]*)人次", text)
    if total_vaccinations:
        metrics["total_vaccinations"] = vaxutils.clean_count(total_vaccinations.group("count"))
    if people_vaccinated:
        metrics["people_vaccinated"] = vaxutils.clean_count(people_vaccinated.group("count"))
    if people_fully_vaccinated:
        metrics["people_fully_vaccinated"] = vaxutils.clean_count(people_fully_vaccinated.group("count"))
    return metrics
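# Illustrative check of the first-dose pattern above; the sample sentence is
# made up in the style of the source ("the first dose was given to N
# person-times") and is not a real excerpt.
sample = "接種第1劑疫苗共有123,456人次"
assert re.search(r"1劑疫苗共有(?P<count>[\d,]*)人次", sample).group("count") == "123,456"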
def connect_parse_data(source: str) -> pd.Series:
    soup = vaxutils.get_soup(source)
    tables = pd.read_html(str(soup))
    for table in tables:
        if table.iloc[0, 0] == "عدد متلقي اللقاح":  # "number of vaccine recipients"
            people_vaccinated = vaxutils.clean_count(table.iloc[1, 0])
        elif table.iloc[0, 0] == "عدد المطعمين بشكل كامل":  # "number of fully vaccinated people"
            people_fully_vaccinated = vaxutils.clean_count(table.iloc[1, 0])
        elif table.iloc[0, 0] == "عدد الجرعات":  # "number of doses"
            total_vaccinations = vaxutils.clean_count(table.iloc[1, 0])
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
    })
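# pd.read_html parses every <table> on the page; the loop above assumes each
# relevant table carries its Arabic label in the first parsed row and the
# value in the second. An illustrative one-column equivalent (recent pandas
# may require wrapping the literal HTML in io.StringIO):
sample_table = pd.read_html("<table><tr><td>عدد الجرعات</td></tr><tr><td>1000</td></tr></table>")[0]
assert sample_table.iloc[0, 0] == "عدد الجرعات"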
def read(source: str):
    soup = vaxutils.get_soup(source)
    # The figures live in a linked PDF report rather than in the page itself
    url = parse_pdf_link(soup, source)
    ds = pd.Series(parse_data(url))
    return ds