import re
import tempfile
import time

import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup
from pdfreader import SimplePDFViewer
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# NOTE: clean_count, clean_date, get_soup, open_google_sheet, _get_vaccine_names
# and increment are shared helpers imported from the repository's utils modules.


def parse_date(soup: BeautifulSoup) -> str:
    # Find the heading that carries the "as of ..." date.
    for h3 in soup.find_all("h3"):
        if "Vaccination Data" in h3.text:
            break
    date = re.search(r"as of (\d+ \w+ \d+)", h3.text).group(1)
    date = clean_date(date, "%d %b %Y")
    return date

def connect_parse_data(source: str) -> pd.Series:
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    soup = BeautifulSoup(requests.get(source, headers=headers).content, "html.parser")

    total_vaccinations = soup.find(class_="repart-stlucia").text
    total_vaccinations = clean_count(total_vaccinations)

    date = soup.find(class_="h2-blue").text
    date = re.search(r"\w+ +\d+, +202\d", date).group(0)
    date = clean_date(date, "%B %d, %Y")

    data = {
        "total_vaccinations": total_vaccinations,
        "date": date,
    }
    return pd.Series(data=data)

def main():
    data = {
        "location": "Guatemala",
        "source_url": "https://gtmvigilanciacovid.shinyapps.io/3869aac0fb95d6baf2c80f19f2da5f98",
        "vaccine": "Moderna, Oxford/AstraZeneca",
    }
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.maximize_window()
        driver.implicitly_wait(20)  # wait up to 20 seconds for elements to appear
        driver.get(data["source_url"])
        driver.find_element_by_class_name("fa-syringe").click()
        date = driver.find_element_by_class_name("logo").text
        dose1 = driver.find_element_by_id("dosisaplicadas1").find_element_by_tag_name("h3").text
        dose2 = driver.find_element_by_id("dosisaplicadas2").find_element_by_tag_name("h3").text

    data["people_vaccinated"] = clean_count(dose1)
    data["people_fully_vaccinated"] = clean_count(dose2)
    data["total_vaccinations"] = data["people_vaccinated"] + data["people_fully_vaccinated"]

    date = re.search(r"\d+/\d+/202\d", date).group(0)
    data["date"] = clean_date(date, "%d/%m/%Y")

    increment(
        location=data["location"],
        total_vaccinations=data["total_vaccinations"],
        people_vaccinated=data["people_vaccinated"],
        people_fully_vaccinated=data["people_fully_vaccinated"],
        date=data["date"],
        source_url=data["source_url"],
        vaccine=data["vaccine"],
    )

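# The find_element_by_* helpers used above belong to the legacy Selenium 3 API
# and were removed in Selenium 4. A minimal sketch of the same element lookups
# with Selenium 4 "By" locators — a hypothetical helper, not part of the
# original script:
def _read_doses_selenium4(source_url: str) -> tuple:
    from selenium.webdriver.common.by import By

    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.implicitly_wait(20)
        driver.get(source_url)
        driver.find_element(By.CLASS_NAME, "fa-syringe").click()
        date = driver.find_element(By.CLASS_NAME, "logo").text
        dose1 = driver.find_element(By.ID, "dosisaplicadas1").find_element(By.TAG_NAME, "h3").text
        dose2 = driver.find_element(By.ID, "dosisaplicadas2").find_element(By.TAG_NAME, "h3").text
    return date, dose1, dose2
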
def connect_parse_data(source: str) -> pd.Series:
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        time.sleep(10)

        date = driver.find_element_by_class_name("as_of").find_element_by_tag_name("span").text
        date = clean_date(date, "%d.%m.%Y")

        for elem in driver.find_elements_by_class_name("counter_block"):
            if "1 ДОЗУ" in elem.text:  # "1 DOSE": people who received a first dose
                people_vaccinated = elem.find_element_by_tag_name("h2").text
            if "2 ДОЗИ" in elem.text:  # "2 DOSES": people who received both doses
                people_fully_vaccinated = elem.find_element_by_tag_name("h2").text

    data = {
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": date,
    }
    return pd.Series(data=data)

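# The fixed time.sleep(10) above waits even when the dashboard renders sooner.
# A sketch of an explicit wait with Selenium's WebDriverWait, reusing the
# "as_of" class name from the function above; the helper name is hypothetical:
def _wait_for_dashboard(driver, timeout: int = 20):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # Blocks until the element carrying the report date is present in the DOM,
    # then returns it.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, "as_of"))
    )
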
def parse_data(soup: BeautifulSoup) -> pd.Series:
    # Get path to newest pdf
    links = soup.find(class_="rt-article").find_all("a")
    for link in links:
        if "sitrep-sl-en" in link["href"]:
            pdf_path = "https://www.epid.gov.lk" + link["href"]
            break

    tf = tempfile.NamedTemporaryFile()
    with open(tf.name, mode="wb") as f:
        f.write(requests.get(pdf_path).content)
    with open(tf.name, mode="rb") as f:
        reader = PyPDF2.PdfFileReader(f)
        page = reader.getPage(0)
        text = page.extractText().replace("\n", "")

    regex = r"COVID-19\s+Total\s+Vaccinated\s+(\d+)"
    total_vaccinations = re.search(regex, text).group(1)
    total_vaccinations = clean_count(total_vaccinations)
    people_vaccinated = total_vaccinations

    regex = r"Situation Report\s+([\d\.]{10})"
    date = re.search(regex, text).group(1)
    date = clean_date(date, "%d.%m.%Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "date": date,
        "source_url": pdf_path,
    })

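# PdfFileReader / getPage / extractText are the legacy PyPDF2 1.x interface
# used by the PDF parsers here. On current installs the same first-page
# extraction can be written against the maintained pypdf successor, and
# without a temporary file. A sketch, assuming pypdf is installed; the helper
# name is hypothetical:
def _extract_first_page_text(pdf_url: str) -> str:
    import io
    from pypdf import PdfReader

    # Stream the downloaded PDF straight into the reader.
    reader = PdfReader(io.BytesIO(requests.get(pdf_url).content))
    return reader.pages[0].extract_text().replace("\n", "")
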
def connect_parse_data(source: str) -> pd.Series:
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        time.sleep(10)

        # "Fecha de corte" = cut-off date of the report
        date = re.search(r"Fecha de corte : ([\d/]{10})", driver.page_source).group(1)

        for block in driver.find_elements_by_class_name("unselectable"):
            # "Dosis aplicadas" = doses administered
            if block.get_attribute("aria-label") == "Dosis aplicadas Card":
                total_vaccinations = clean_count(block.find_element_by_class_name("value").text)
            # "Segundas dosis aplicadas" = second doses administered
            elif block.get_attribute("aria-label") == "Segundas dosis aplicadas Card":
                people_fully_vaccinated = clean_count(block.find_element_by_class_name("value").text)

    people_vaccinated = total_vaccinations - people_fully_vaccinated

    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": clean_date(date, "%d/%m/%Y"),
    })

def connect_parse_data(source: str, source_old: str) -> pd.Series:
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        time.sleep(5)
        total_vaccinations = driver.find_element_by_id("counter1").text
        people_vaccinated = driver.find_element_by_id("counter2").text
        people_fully_vaccinated = driver.find_element_by_id("counter3").text

        driver.get(source_old)
        time.sleep(5)
        # Sanity check: both dashboards should report the same total
        total_vaccinations_old = driver.find_element_by_id("counter1").text
        if total_vaccinations != total_vaccinations_old:
            raise ValueError(
                "Both dashboards may not be synced and hence may refer to different timestamps. "
                "Consider introducing the timestamp manually."
            )
        date = driver.find_element_by_id("pupdateddate").text
        date = clean_date(date.replace("Updated ", ""), "%d %b, %Y")

    data = {
        "total_vaccinations": clean_count(total_vaccinations),
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": date,
    }
    return pd.Series(data=data)

def parse_data(data: dict) -> pd.Series:
    date = clean_date(data["updated"], "%Y/%m/%d")
    total_vaccinations = data["progress"]
    return pd.Series(data={
        "date": date,
        "total_vaccinations": total_vaccinations,
    })

def parse_data(soup: BeautifulSoup) -> pd.Series:
    numbers = soup.find_all(class_="odometer")
    date = re.search(r"[\d\.]{10}", soup.find(class_="counter").text).group(0)
    date = clean_date(date, "%d.%m.%Y")
    return pd.Series(data={
        "total_vaccinations": int(numbers[0]["data-count"]),
        "people_vaccinated": int(numbers[1]["data-count"]),
        "people_fully_vaccinated": int(numbers[2]["data-count"]),
        "date": date,
    })

def parse_data(data: dict) -> pd.Series:
    date = clean_date(data["updated"], "%Y/%m/%d")
    people_vaccinated = data["progress"]
    people_fully_vaccinated = data["completed"]
    return pd.Series(data={
        "date": date,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "vaccine": ", ".join(_get_vaccine_names(data, translate=True)),
    })

def connect_parse_data(source: str) -> pd.Series:
    sheet = open_google_sheet(source)
    # Values live in fixed cells of the sheet: C44 holds the date, K16 the
    # total doses administered and K27 the number of people fully vaccinated.
    date = sheet.get("C44").first().strip()
    total_vaccinations = int(sheet.get("K16").first().strip().replace(",", ""))
    people_fully_vaccinated = int(sheet.get("K27").first().strip().replace(",", ""))
    people_vaccinated = total_vaccinations - people_fully_vaccinated
    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": clean_date(date, "%d/%m/%Y"),
    })

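# open_google_sheet is a shared helper whose definition is not shown here.
# A minimal sketch of what such a helper might wrap, assuming gspread with
# service-account credentials (the .get(...).first() calls above appear to
# match gspread's ValueRange API); not the repository's actual implementation:
def _open_google_sheet_sketch(source: str):
    import gspread

    client = gspread.service_account()  # reads credentials from the default path
    return client.open_by_url(source).sheet1  # first worksheet of the workbook
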
def connect_parse_data(source: str) -> pd.Series:
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        time.sleep(5)
        total_vaccinations = driver.find_element_by_id("counter1").text
        date = driver.find_element_by_id("pupdateddate").text
        date = clean_date(date.replace("Updated ", ""), "%d %b, %Y")

    data = {
        "total_vaccinations": clean_count(total_vaccinations),
        "date": date,
    }
    return pd.Series(data=data)

def parse_data(soup: BeautifulSoup) -> pd.Series:
    # Get path to newest pdf
    links = soup.find(class_="rt-article").find_all("a")
    for link in links:
        if "sitrep-sl-en" in link["href"]:
            pdf_path = "https://www.epid.gov.lk" + link["href"]
            break

    tf = tempfile.NamedTemporaryFile()
    with open(tf.name, mode="wb") as f:
        f.write(requests.get(pdf_path).content)
    with open(tf.name, mode="rb") as f:
        reader = PyPDF2.PdfFileReader(f)
        page = reader.getPage(0)
        text = page.extractText().replace("\n", "")

    covishield_data = re.search(r"Covishield Vaccine (\d+) (\d+)", text)
    covishield_dose1 = clean_count(covishield_data.group(1))
    covishield_dose2 = clean_count(covishield_data.group(2))

    sinopharm_data = re.search(r"Sinopharm Vaccine \(Chinese Nationals\) (\d+) (\d+)", text)
    sinopharm_dose1 = clean_count(sinopharm_data.group(1))
    sinopharm_dose2 = clean_count(sinopharm_data.group(2))

    total_vaccinations = covishield_dose1 + covishield_dose2 + sinopharm_dose1 + sinopharm_dose2
    people_vaccinated = covishield_dose1 + sinopharm_dose1
    people_fully_vaccinated = covishield_dose2 + sinopharm_dose2

    regex = r"Situation Report\s+([\d\.]{10})"
    date = re.search(regex, text).group(1)
    date = clean_date(date, "%d.%m.%Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
        "source_url": pdf_path,
    })

def parse_data(soup: BeautifulSoup) -> pd.Series:
    counters = soup.find_all(class_="counter")
    total_vaccinations = int(counters[0].text)
    people_vaccinated = int(counters[1].text)
    people_fully_vaccinated = int(counters[2].text)
    assert total_vaccinations >= people_vaccinated
    assert people_vaccinated >= people_fully_vaccinated

    date = soup.find(class_="fuente").text
    date = re.search(r"\d{2}-\d{2}-\d{4}", date).group(0)
    date = clean_date(date, "%d-%m-%Y")

    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
    return pd.Series(data=data)

def parse_data(soup: BeautifulSoup) -> pd.Series:
    counts = soup.find_all(class_="count")
    people_vaccinated = int(counts[0]["data-count"])
    people_fully_vaccinated = int(counts[1]["data-count"])
    assert people_vaccinated >= people_fully_vaccinated
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    date = soup.find(class_="reportdate").text
    date = re.search(r"\d+ \w+ 202\d", date).group(0)
    date = clean_date(date, "%d %b %Y")

    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
    return pd.Series(data=data)

def read(source: str) -> pd.Series:
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    soup = BeautifulSoup(requests.get(source, headers=headers).content, "html.parser")
    text = soup.find("div", id="data").find("p").text

    # "На сегодня (<dd.mm.yy>)" = "As of today (<date>)"
    date = re.search(r"На сегодня \(([\d\.]{8})\)", text).group(1)
    date = clean_date(date, "%d.%m.%y")

    # "... people (x% of the population) vaccinated with at least one vaccine component"
    people_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения\) - привито хотя бы одним компонентом вакцины",
        text).group(1)
    people_vaccinated = clean_count(people_vaccinated)

    # "... people (x% of the population) fully vaccinated"
    people_fully_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения\) - полностью привито",
        text).group(1)
    people_fully_vaccinated = clean_count(people_fully_vaccinated)

    # "... total vaccinations administered"
    total_vaccinations = re.search(r"([\d\s]+) шт\. - всего прививок сделано", text).group(1)
    total_vaccinations = clean_count(total_vaccinations)

    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    })

def read(source: str) -> pd.Series:
    soup = get_soup(source)
    counters = soup.find_all(class_="counter")
    people_partially_vaccinated = clean_count(counters[0].text)
    people_fully_vaccinated = clean_count(counters[1].text)
    total_vaccinations = clean_count(counters[2].text)
    # The first counter covers only partially vaccinated people, so the two
    # counters add up to everyone who received at least one dose.
    people_vaccinated = people_partially_vaccinated + people_fully_vaccinated

    date = soup.find("span", id="last-update").text
    date = re.search(r"\d+.*202\d", date).group(0)
    date = clean_date(date, "%d %B, %Y")

    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
        "source_url": source,
    }
    return pd.Series(data=data)

def read(source: str) -> pd.Series:
    soup = get_soup(source)
    for block in soup.find(class_="main").find_all(class_="w3-center"):
        # "ΣΥΝΟΛΟ ΕΜΒΟΛΙΑΣΜΩΝ" = "TOTAL VACCINATIONS"
        if block.find("p").text == "ΣΥΝΟΛΟ ΕΜΒΟΛΙΑΣΜΩΝ":
            total_vaccinations = clean_count(block.find_all("p")[1].text)
            date = re.search(r"[\d/]{8,10}", block.find_all("p")[2].text)
            date = clean_date(date.group(0), "%d/%m/%Y")
        # "ΣΥΝΟΛΟ 1ης ΔΟΣΗΣ" = "TOTAL 1st DOSES"
        if block.find("p").text == "ΣΥΝΟΛΟ 1ης ΔΟΣΗΣ":
            people_vaccinated = clean_count(block.find_all("p")[1].text)
        # "ΣΥΝΟΛΟ 2ης ΔΟΣΗΣ" = "TOTAL 2nd DOSES"
        if block.find("p").text == "ΣΥΝΟΛΟ 2ης ΔΟΣΗΣ":
            people_fully_vaccinated = clean_count(block.find_all("p")[1].text)

    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
        "source_url": source,
    }
    return pd.Series(data=data)

def parse_data(soup: BeautifulSoup) -> pd.Series:
    # Get newest PDF report link
    latest_report_link = soup.find("div", class_="col-lg-12", id="content-detail").find("a")["href"]
    tf = tempfile.NamedTemporaryFile()
    with open(tf.name, mode="wb") as f:
        f.write(requests.get(latest_report_link).content)
    with open(tf.name, mode="rb") as f:
        viewer = SimplePDFViewer(f)
        viewer.render()
        raw_text = "".join(viewer.canvas.strings)

    # Correct Thai special characters that the PDF extraction maps to
    # private-use code points
    special_char_replace = {
        "\uf701": "\u0e34",
        "\uf702": "\u0e35",
        "\uf703": "\u0e36",
        "\uf704": "\u0e37",
        "\uf705": "\u0e48",
        "\uf706": "\u0e49",
        "\uf70a": "\u0e48",
        "\uf70b": "\u0e49",
        "\uf70e": "\u0e4c",
        "\uf710": "\u0e31",
        "\uf712": "\u0e47",
        "\uf713": "\u0e48",
        "\uf714": "\u0e49",
    }
    special_char_replace = dict((re.escape(k), v) for k, v in special_char_replace.items())
    pattern = re.compile("|".join(special_char_replace.keys()))
    text = pattern.sub(lambda m: special_char_replace[re.escape(m.group(0))], raw_text)

    # "cumulative doses administered ... doses"
    total_vaccinations_regex = r"ผู้ที่ได้รับวัคซีนสะสม .{1,100} ทั้งหมด[^\d]+([\d,]+) โดส"
    total_vaccinations = re.search(total_vaccinations_regex, text).group(1)
    total_vaccinations = clean_count(total_vaccinations)

    # "people who received a first dose"
    people_vaccinated_regex = r"ผู้ได้รับวัคซีนเข็มที่ 1 .{1,3}นวน[^\d]+([\d,]+) ร.{1,3}ย"
    people_vaccinated = re.search(people_vaccinated_regex, text).group(1)
    people_vaccinated = clean_count(people_vaccinated)

    # "people fully vaccinated (received 2 doses)"
    people_fully_vaccinated_regex = (
        r"นวนผู้ได้รับวัคซีนครบต.{1,2}มเกณฑ์ \(ได้รับวัคซีน 2 เข็ม\) .{1,3}นวน[^\d]+([\d,]+)"
    )
    people_fully_vaccinated = re.search(people_fully_vaccinated_regex, text).group(1)
    people_fully_vaccinated = clean_count(people_fully_vaccinated)

    # "(data as of <date> at <time>)"; also repair a vowel the extraction garbles
    thai_date_regex = r"\( ข้อมูล ณ วันที่ (.{1,30}) เวล(.{1,3}) (.{1,10}) น. \)"
    thai_date = re.search(thai_date_regex, text).group(1).replace("ำ", "า")

    # Map Thai month names and Buddhist-era years to Gregorian equivalents
    thai_date_replace = {
        "มกราคม": "January",
        "กุมภาพันธ์": "February",
        "มีนาคม": "March",
        "เมษายน": "April",
        "พฤษภาคม": "May",
        "มิถุนายน": "June",
        "กรกฎาคม": "July",
        "สิงหาคม": "August",
        "กันยายน": "September",
        "ตุลาคม": "October",
        "พฤศจิกายน": "November",
        "ธันวาคม": "December",
        "2563": "2020",
        "2564": "2021",
        "2565": "2022",
        "2566": "2023",
        "2567": "2024",
    }
    thai_date_replace = dict((re.escape(k), v) for k, v in thai_date_replace.items())
    pattern = re.compile("|".join(thai_date_replace.keys()))
    date = pattern.sub(lambda m: thai_date_replace[re.escape(m.group(0))], thai_date)
    date = clean_date(date, "%d %B %Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
        "source_url": latest_report_link,
    })

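# The escape/compile/substitute dance above appears twice (once for the
# special characters, once for the date tokens). A small helper, as a sketch,
# would factor it out; the name is hypothetical:
def _multi_replace(text: str, mapping: dict) -> str:
    escaped = {re.escape(k): v for k, v in mapping.items()}
    pattern = re.compile("|".join(escaped.keys()))
    # One pass over the text, replacing every key with its mapped value.
    return pattern.sub(lambda m: escaped[re.escape(m.group(0))], text)
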
def parse_date(soup) -> str:
    date_raw = soup.find(class_="download").text
    # "COVID-19疫苗日報表" = "COVID-19 vaccine daily report"; the captured four
    # digits are the month and day, and the year (2021) is hardcoded below.
    regex = r"(\d{4})\sCOVID-19疫苗日報表"
    date_str = re.search(regex, date_raw).group(1)
    date_str = clean_date("2021" + date_str, "%Y%m%d")
    return date_str

def parse_date(text: str) -> str:
    # re.search needs a string, so the argument is the extracted text, not a
    # DataFrame. "Dati aggiornati al" = "Data updated as of".
    date = re.search(r"Dati aggiornati al (\d{2}/\d{2}/\d{4})", text).group(1)
    return clean_date(date, "%d/%m/%Y")

def parse_date(df: pd.DataFrame) -> str:
    # The .str accessor and .values attribute are pandas API, so the argument
    # is a DataFrame. "Journée du" = "Day of"; strip the French prefix.
    date = df["Unnamed: 1"].str.replace("Journée du ", "").values[0]
    date = clean_date(date, "%d.%m.%Y")
    return date

def parse_date(soup: BeautifulSoup) -> str:
    date = re.search(r"Data applies to: Week ending (\d[\w\s]+\d{4})", soup.text).group(1)
    return clean_date(date, "%d %B %Y")

def parse_date(soup: BeautifulSoup) -> str:
    date = soup.find(class_="field-name-post-date").text
    date = clean_date(date, "%d.%m.%Y")
    return date

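# For reference, each read()/parse_*() result is typically fed to the shared
# increment() helper, as in the Guatemala main() above. A minimal wiring
# sketch; the location, URL and vaccine names below are placeholders:
def main():
    source = "https://example.org/vaccination-dashboard"  # placeholder URL
    data = read(source)
    increment(
        location="Example",  # placeholder location
        total_vaccinations=data["total_vaccinations"],
        people_vaccinated=data["people_vaccinated"],
        people_fully_vaccinated=data["people_fully_vaccinated"],
        date=data["date"],
        source_url=source,
        vaccine="Oxford/AstraZeneca",  # placeholder vaccine list
    )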