def parse_text(self, soup: BeautifulSoup) -> pd.Series:
    """Parse cumulative vaccination figures out of the page text in *soup*.

    The page reports two separate tallies that are combined here:
    doses given under the national vaccination programme, and doses of
    other WHO-EUL-recognised vaccines. Returns a Series with ``date``,
    ``total_vaccinations``, ``people_vaccinated`` and
    ``people_fully_vaccinated``.
    """
    # Sentence 1: national programme totals (date, doses, people covered).
    national_program = r"As of ([\d]+ [A-Za-z]+ 20\d{2}), we have administered a total of ([\d,]+) doses of COVID-19 vaccines under the national vaccination programme \(Pfizer-BioNTech Comirnaty and Moderna\), covering ([\d,]+) individuals"
    data = re.search(national_program, soup.text).groups()
    national_date = clean_date(data[0], fmt="%d %B %Y", lang="en_US", loc="en_US")
    national_doses = clean_count(data[1])
    national_people_vaccinated = clean_count(data[2])
    # Sentence 2: WHO EUL vaccines plus two population-share percentages.
    who_eul = r"In addition, ([\d,]+) doses of other vaccines recognised in the World Health Organization.s Emergency Use Listing \(WHO EUL\) have been administered as of ([\d]+ [A-Za-z]+ 20\d{2}), covering ([\d,]+) individuals\. In total, (\d+)% of our population has completed their full regimen/ received two doses of COVID-19 vaccines, and (\d+)% has received at least one dose"
    data = re.search(who_eul, soup.text).groups()
    who_doses = clean_count(data[0])
    who_date = clean_date(data[1], fmt="%d %B %Y", lang="en_US", loc="en_US")
    who_people_vaccinated = clean_count(data[2])
    share_fully_vaccinated = int(data[3])
    share_vaccinated = int(data[4])
    # Use the more recent of the two reporting dates.
    date = max([national_date, who_date])
    total_vaccinations = national_doses + who_doses
    people_vaccinated = national_people_vaccinated + who_people_vaccinated
    # No absolute fully-vaccinated count is published; estimate it from the
    # ratio of the two published population shares.
    people_fully_vaccinated = round(
        people_vaccinated * (share_fully_vaccinated / share_vaccinated))
    data = pd.Series({
        "date": date,
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
    })
    return data
def _propose_df(self):
    """Build a DataFrame of candidate rows from matching update tweets.

    Each matching tweet contributes its people-vaccinated count; the row
    date is the tweet's creation date. Scanning stops at the first tweet
    older than the configured search window (``self.stop_search``).
    """
    regex = r"COVID-19 update: As at (\d{1,2} [a-zA-Z]+ 202\d), .* a total of ([\d ]+) people have been vaccinated"
    data = []
    for tweet in self.tweets:
        match = re.search(regex, tweet.full_text)
        if match:
            total_vaccinations = clean_count(match.group(2))
            # Fix: the original parsed a date from the tweet text and then
            # immediately overwrote it with the creation date (dead store).
            # The effective behaviour — using created_at — is kept.
            dt = clean_date(tweet.created_at)
            if self.stop_search(dt):
                break
            data.append({
                "date": dt,
                "people_vaccinated": total_vaccinations,
                "text": tweet.full_text,
                "source_url": self.build_post_url(tweet.id),
                "media_url": tweet.entities["media"][0]["media_url_https"]
                if "media" in tweet.entities else None,
            })
    return pd.DataFrame(data)
def _propose_df(self):
    """Build candidate rows from vaccination-update tweets.

    Three patterns are tried in order of decreasing specificity:
    regex_1 yields full figures (first and second doses), regex_2 yields
    only a date, and regex_3 merely flags an update tweet with no parsed
    fields. Only date-bearing matches participate in the stop-search cutoff.
    """
    # Full update: date plus first- and second-dose counts.
    regex_1 = (
        r"COVID-19 Vaccination Update:\n\n1st and second dose — (([a-zA-Z]+) (\d{1,2})(?:th|nd|rd|st) (202\d)), in 36 States \+ the FCT\. \n\n([0-9,]+) eligible "
        r"Nigerians have been vaccinated with first dose while ([0-9,]+) of Nigerians vaccinated with 1st dose have collected their 2nd dose\."
    )
    # Date-only update.
    regex_2 = r"COVID-19 Vaccination Update for (([a-zA-Z]+) (\d{1,2})(?:th|nd|rd|st),? (202\d)), in 36 States \+ the FCT\. "
    # Bare update marker (no structured data).
    regex_3 = r"COVID-19 Vaccination Update"
    data = []
    for tweet in self.tweets:
        match_1 = re.search(regex_1, tweet.full_text)
        match_2 = re.search(regex_2, tweet.full_text)
        match_3 = re.search(regex_3, tweet.full_text)
        if match_1:
            people_vaccinated = clean_count(match_1.group(5))
            people_fully_vaccinated = clean_count(match_1.group(6))
            # Groups 2-4 are month name, day (ordinal suffix stripped), year.
            dt = clean_date(" ".join(match_1.group(2, 3, 4)), "%B %d %Y")
            if self.stop_search(dt):
                break
            data.append(
                {
                    "date": dt,
                    # Doses = first doses + second doses.
                    "total_vaccinations": people_vaccinated + people_fully_vaccinated,
                    "people_vaccinated": people_vaccinated,
                    "people_fully_vaccinated": people_fully_vaccinated,
                    "text": tweet.full_text,
                    "source_url": self.build_post_url(tweet.id),
                    "media_url": tweet.extended_entities["media"][0][
                        "media_url_https"
                    ],
                }
            )
        elif match_2:
            dt = clean_date(" ".join(match_2.group(2, 3, 4)), "%B %d %Y")
            if self.stop_search(dt):
                break
            data.append(
                {
                    "date": dt,
                    "text": tweet.full_text,
                    "source_url": self.build_post_url(tweet.id),
                    "media_url": tweet.extended_entities["media"][0][
                        "media_url_https"
                    ],
                }
            )
        elif match_3:
            # No date available here, so no stop_search cutoff applies.
            data.append(
                {
                    "text": tweet.full_text,
                    "source_url": self.build_post_url(tweet.id),
                    "media_url": tweet.extended_entities["media"][0][
                        "media_url_https"
                    ],
                }
            )
    df = pd.DataFrame(data)
    return df
def _propose_df(self):
    """Flag tweets whose attached image looks like the vaccination report.

    Heuristic: download each tweet's first image and compare its most
    frequent pixel value against a reference colour; images within
    ``dist_th`` Euclidean RGB distance are taken to be report images.
    """
    max_iter = 30  # only scan this many of the most recent tweets
    dist_th = 8.7  # colour-distance threshold — presumably tuned empirically
    col_dominant = [160, 194, 195]  # reference RGB of report images — TODO confirm
    records = []
    for tweet in self.tweets[:max_iter]:
        cond = "media" in tweet.entities  # and len(tweet.full_text) < 30
        if cond:
            url = tweet.extended_entities["media"][0]["media_url_https"]
            im = Image.open(requests.get(url, stream=True).raw, formats=["jpeg"])
            # Sample at most the first 100k pixels to bound work per image.
            pixel_values = [x for i, x in enumerate(im.getdata()) if i < 100000]
            # Most frequent sampled pixel value = dominant colour.
            h = pd.value_counts(pixel_values, normalize=True).index[0]
            dist = np.linalg.norm(np.array(h) - np.array(col_dominant))
            if dist < dist_th:
                dt = clean_date(tweet.created_at)
                if self.stop_search(dt):
                    break
                records.append(
                    {
                        "date": dt,
                        "text": tweet.full_text,
                        "source_url": self.build_post_url(tweet.id),
                        "media_url": url,
                    }
                )
    df = pd.DataFrame(records)
    return df
def _parse_data(self, worksheet):
    """Scan *worksheet* rows for known metric labels and collect values.

    Each metric's value sits in the last cell of the row carrying its
    label; values may already be ints or need cleaning from strings.

    Raises:
        ValueError: if any expected label was not found in the worksheet.
    """
    # Fix: the original never initialised these, so the `is None` guard
    # below could never fire — a missing label raised NameError instead.
    total_vaccinations = people_fully_vaccinated = unique_doses = date_str = None
    for row in worksheet.values():
        for value in row:
            if "Total dosis aplicadas al " in str(value):
                total_vaccinations = row[-1]
                if type(total_vaccinations) != int:
                    total_vaccinations = clean_count(total_vaccinations)
                # The label itself ends with the report date (dd-mm-yyyy).
                date_raw = re.search(r"[\d-]{10}$", value).group(0)
                date_str = clean_date(date_raw, "%d-%m-%Y")
            elif value == "Esquemas completos segundas + únicas dosis":
                people_fully_vaccinated = row[-1]
                if type(people_fully_vaccinated) != int:
                    people_fully_vaccinated = clean_count(people_fully_vaccinated)
            elif value == "Total únicas dosis acumuladas":
                unique_doses = row[-1]
                if type(unique_doses) != int:
                    unique_doses = clean_count(unique_doses)
    # Also check unique_doses/date_str: both are used below and would
    # otherwise fail with a confusing TypeError.
    if total_vaccinations is None or people_fully_vaccinated is None or unique_doses is None or date_str is None:
        raise ValueError("Date is not where it is expected be! Check worksheet")
    return pd.Series(
        {
            "date": date_str,
            "total_vaccinations": total_vaccinations,
            "people_fully_vaccinated": people_fully_vaccinated,
            # Derived: doses minus completions plus single-dose regimens —
            # presumably "únicas dosis" are one-shot schemes; TODO confirm.
            "people_vaccinated": total_vaccinations - people_fully_vaccinated + unique_doses,
        }
    )
def read(source: str) -> pd.Series:
    """Scrape vaccination counters from *source* using headless Chrome.

    Each metric's value sits in a <div> immediately preceding the <h5>
    element whose text names it; the date is embedded in the
    "Acumulados al ..." heading itself.
    """
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        time.sleep(3)  # allow dynamic content to render before scraping
        for h5 in driver.find_elements_by_tag_name("h5"):
            if "Primera dosis" in h5.text:
                people_vaccinated = clean_count(
                    h5.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Total dosis aplicadas" in h5.text:
                total_vaccinations = clean_count(
                    h5.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Población completamente vacunada" in h5.text:
                people_fully_vaccinated = clean_count(
                    h5.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Acumulados al" in h5.text:
                date = clean_date(h5.text, "Acumulados al %d de %B de %Y", "es")
    # NOTE(review): if any heading is absent these names are unbound and a
    # NameError follows — confirm the page always shows all four.
    data = {
        "date": date,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_vaccinations": total_vaccinations,
    }
    return pd.Series(data=data)
def _propose_df(self):
    """Collect candidate rows from tweets matching either report pattern."""
    pattern_report = r"Results of COVID-19 tests .*"
    pattern_doses = r"against COVID-19: ([\d,]+)"
    rows = []
    for tweet in self.tweets:
        tweet_date = clean_date(tweet.created_at)
        if self.stop_search(tweet_date):
            break
        text = tweet.full_text
        if re.search(pattern_report, text):
            # Report tweets are only useful when they carry an image.
            if "media" in tweet.entities:
                rows.append({
                    "date": tweet_date,
                    "text": text,
                    "source_url": self.build_post_url(tweet.id),
                    "media_url": tweet.entities["media"][0]["media_url_https"],
                })
        else:
            doses_match = re.search(pattern_doses, text)
            if doses_match:
                rows.append({
                    "date": tweet_date,
                    "total_vaccinations": clean_count(doses_match.group(1)),
                    "text": text,
                    "source_url": self.build_post_url(tweet.id),
                })
    return pd.DataFrame(rows)
def parse_data(last_update: str, max_iter: int = 10):
    """Collect daily vaccination records newer than *last_update*.

    Walks backwards from today for up to *max_iter* days, downloading the
    daily Excel report for each date until one at or before *last_update*
    is reached.

    Returns:
        A DataFrame of parsed records, or None if nothing new was found.
    """
    records = []
    # Fix: the original iterated range(10) unconditionally, silently
    # ignoring the max_iter parameter.
    for days in range(max_iter):
        date_it = clean_date(datetime.now() - timedelta(days=days))
        if date_it > last_update:
            source = _get_source_url(date_it.replace("-", ""))
            try:
                df_ = pd.read_excel(source, index_col=0)
            except HTTPError:
                # Best-effort: the report for this day is not published yet.
                print("No available!")
            else:
                _check_vaccine_names(df_)
                ds = _parse_ds_data(df_, source)
                records.append(ds)
        else:
            # Dates are visited newest-first: the first stale one ends the scan.
            break
    if len(records) > 0:
        return pd.DataFrame(records)
    return None
def connect_parse_data(source: str) -> pd.Series:
    """Scrape first/second dose counters and the report date from *source*."""
    options = Options()
    options.add_argument("--headless")
    with webdriver.Chrome(options=options) as driver:
        driver.get(source)
        time.sleep(10)  # wait for the dashboard to finish rendering
        date_text = (
            driver.find_element_by_class_name("as_of")
            .find_element_by_tag_name("span")
            .text
        )
        report_date = clean_date(date_text, "%d.%m.%Y")
        for block in driver.find_elements_by_class_name("counter_block"):
            if "1 ДОЗУ" in block.text:
                people_vaccinated = block.find_element_by_tag_name("h2").text
            if "2 ДОЗИ" in block.text:
                people_fully_vaccinated = block.find_element_by_tag_name("h2").text
    return pd.Series(data={
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": report_date,
    })
def _parse_date(self, text: str):
    """Extract a Thai-language date from *text* and return it cleaned.

    Month names include common variants (ำ vs า spellings); the year is
    given in the Thai Buddhist calendar and converted to Gregorian.
    """
    thai_months = {
        "มกราคม": 1,
        "กุมภาพันธ์": 2,
        "มีนาคม": 3,
        "เมษายน": 4,
        "พฤษภาคม": 5,
        "พฤษภำคม": 5,
        "มิถุนายน": 6,
        "มิถุนำยน": 6,
        "กรกฎาคม": 7,
        "กรกฎำคม": 7,
        "สิงหาคม": 8,
        "สิงหำคม": 8,
        "กันยายน": 9,
        "ตุลาคม": 10,
        "พฤศจิกายน": 11,
        "ธันวาคม": 12,
    }
    found = re.search(self.regex_date, text)
    day = clean_count(found.group(1))
    month = thai_months[found.group(2)]
    # Buddhist-era year minus 543 gives the Gregorian year.
    year = clean_count(found.group(3)) - 543
    return clean_date(datetime(year, month, day))
def parse_data(self, soup: BeautifulSoup) -> pd.Series:
    """Parse the newest situation-report PDF linked from *soup*.

    NOTE(review): the annotation says pd.Series but a plain dict is
    returned — confirm what callers expect before changing either side.
    """
    # Get path to newest pdf
    pdf_path = self._parse_last_pdf_link(soup)
    # Get text from pdf
    text = self._extract_text_from_pdf(pdf_path)
    # Get vaccine table from text
    df_vax = self._parse_vaccines_table_as_df(text)
    people_vaccinated = df_vax.doses_1.sum()
    people_fully_vaccinated = df_vax.doses_2.sum()
    # Doses = first doses + second doses.
    total_vaccinations = people_vaccinated + people_fully_vaccinated
    # Map raw vaccine names onto the canonical naming.
    vaccine = ", ".join(df_vax.vaccine.map(vaccines_mapping))
    # Get date; the pattern expects dd.mm.yyyy after "Situation Report".
    regex = r"Situation Report\s+([\d\.]{10})"
    date = re.search(regex, text).group(1)
    date = clean_date(date, "%d.%m.%Y")
    # Build data series
    return {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
        "source_url": pdf_path,
        "vaccine": vaccine,
        "location": self.location,
    }
def read(self):
    """Fetch the latest attributes record from the feature-service endpoint."""
    data = requests.get(self.source_url).json()["features"][0]["attributes"]
    return pd.Series(
        {
            "total_vaccinations": data["Vaccine_total"],
            # NOTE(review): the field name suggests a last-24h count, yet it
            # is mapped to people_fully_vaccinated — verify against the API.
            "people_fully_vaccinated": data["Vaccine_total_last24"],
            # "Date" is a millisecond epoch timestamp.
            "date": clean_date(datetime.fromtimestamp(data["Date"] / 1000)),
        }
    )
def parse_date(filename):
    """Return the report date (dd.mm.yyyy) printed on page 1 of the PDF."""
    # Read only the first page; the date sits on its own line there.
    with open(filename, mode="rb") as fh:
        reader = PyPDF2.PdfFileReader(fh)
        page_text = reader.getPage(0).extractText()
    raw_date = re.search(r"\n(?P<count>\d{1,2}.\d{1,2}.\d{4})\n", page_text).group(1)
    return clean_date(raw_date, "%d.%m.%Y")
def _parse_date(self, soup: BeautifulSoup) -> str:
    """Locate the single <p> carrying the update date and return it cleaned.

    Raises:
        ValueError: if more than one paragraph matches, i.e. the page
        layout has changed.
    """
    date_pattern = re.compile(self.regex["date"])
    matches = [p for p in soup.find_all("p") if p.find(text=date_pattern)]
    if len(matches) > 1:
        raise ValueError("Format of source has changed")
    return clean_date(matches[0].text, "ажурирано %d.%m.%Y")
def read(self) -> pd.Series:
    """Fetch the latest figures and assemble the standard metrics Series."""
    raw = self._parse_data()
    fields = {
        "total_vaccinations": raw["Doses_Administered"],
        "people_vaccinated": raw["Administered_Dose1_Recip"],
        "people_fully_vaccinated": raw["Series_Complete_Yes"],
        "date": clean_date(raw["Date"], "%Y-%m-%d"),
        "vaccine": self._parse_vaccines(raw),
    }
    return pd.Series(fields)
def read(source: str) -> pd.Series:
    """Scrape dose counters and the update date from the page at *source*."""
    soup = get_soup(source)
    counters = soup.find_all(class_="text-brand-blue")
    first_doses = clean_count(counters[1].text)
    second_doses = clean_count(counters[2].text)
    assert first_doses >= second_doses
    # The page omits the year, so append the current one before parsing.
    raw_date = soup.find(class_="text-gray-500").text.replace("Updated ", "")
    raw_date = raw_date + str(datetime.date.today().year)
    parsed_date = clean_date(raw_date, fmt="%d. %B%Y", lang="en")
    return pd.Series({
        "people_vaccinated": first_doses,
        "people_fully_vaccinated": second_doses,
        "date": parsed_date,
    })
def parse_data(data: dict) -> pd.Series:
    """Map the API payload *data* onto the standard vaccination fields."""
    return pd.Series(data={
        "date": clean_date(data["updated"], "%Y/%m/%d"),
        "people_vaccinated": data["progress"],
        "people_fully_vaccinated": data["completed"],
        "vaccine": ", ".join(_get_vaccine_names(data, translate=True)),
    })
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read the three odometer counters and the report date from *soup*."""
    odometers = soup.find_all(class_="odometer")
    # Date appears as dd.mm.yyyy inside the counter element's text.
    raw_date = re.search(r"[\d\.]{10}", soup.find(class_="counter").text).group(0)
    total, first, second = (int(odometers[i]["data-count"]) for i in range(3))
    return pd.Series(data={
        "total_vaccinations": total,
        "people_vaccinated": first,
        "people_fully_vaccinated": second,
        "date": clean_date(raw_date, "%d.%m.%Y"),
    })
def parse_data(self, soup: BeautifulSoup) -> pd.Series:
    """Extract date and vaccination counts from the page text via regexes.

    Fields are added only when their pattern matches, so the returned
    Series may be partial.
    """
    fields = {}
    title_match = re.search(self.regex["title"], soup.text)
    if title_match:
        # The title omits the year; assume the current one.
        date_str = title_match.group(1)
        fields["date"] = clean_date(
            f"{date_str} {datetime.now().year}", "%d de %B %Y", lang="es"
        )
        fields["total_vaccinations"] = clean_count(title_match.group(2))
    body_match = re.search(self.regex["data"], soup.text)
    if body_match:
        fields["people_vaccinated"] = clean_count(body_match.group(1))
        fields["people_fully_vaccinated"] = clean_count(body_match.group(3))
    return pd.Series(fields)
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read dose counters from *soup*; total is the sum of both doses."""
    counters = soup.find_all(class_="counter")
    people_vaccinated = clean_count(counters[3].text)
    people_fully_vaccinated = clean_count(counters[4].text)
    assert people_vaccinated >= people_fully_vaccinated
    # The update banner carries the date as dd-mm-yyyy.
    banner_text = soup.find(class_="actualiza").text
    raw_date = re.search(r"\d{2}-\d{2}-\d{4}", banner_text).group(0)
    return pd.Series(data={
        "total_vaccinations": people_vaccinated + people_fully_vaccinated,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": clean_date(raw_date, "%d-%m-%Y"),
    })
def read(source: str) -> pd.Series:
    """Scrape cumulative vaccination figures from the Russian-language page.

    Browser-like headers are sent — presumably the site rejects bare
    requests (TODO confirm) — then date and three counts are pulled out of
    the summary paragraph with regexes.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    soup = BeautifulSoup(requests.get(source, headers=headers).content, "html.parser")
    text = soup.find("div", id="data").find("p").text
    # Date appears as dd.mm.yy inside "На сегодня (…)" ("as of today").
    date = re.search(r"На сегодня \(([\d\.]{8})\)", text).group(1)
    date = clean_date(date, "%d.%m.%y")
    # "привито хотя бы одним компонентом" = received at least one dose.
    people_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения[^)]*\) - привито хотя бы одним компонентом вакцины",
        text,
    ).group(1)
    people_vaccinated = clean_count(people_vaccinated)
    # "полностью привито" = fully vaccinated.
    people_fully_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения,?[^)]*\) - полностью привито", text
    ).group(1)
    people_fully_vaccinated = clean_count(people_fully_vaccinated)
    # "всего прививок сделано" = total doses administered.
    total_vaccinations = re.search(
        r"([\d\s]+) шт\. - всего прививок сделано", text
    ).group(1)
    total_vaccinations = clean_count(total_vaccinations)
    return pd.Series(
        {
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
        }
    )
def _propose_df(self):
    """Collect daily-report tweets matched by their opening phrase."""
    pattern = r"ባለፉት 24 .*"
    rows = []
    for tweet in self.tweets:
        if not re.search(pattern, tweet.full_text):
            continue
        tweet_date = clean_date(tweet.created_at)
        if self.stop_search(tweet_date):
            break
        rows.append({
            "date": tweet_date,
            "text": tweet.full_text,
            "source_url": self.build_post_url(tweet.id),
            # NOTE(review): deliberately takes the SECOND media item ([1]) —
            # confirm the relevant figure is always in that image.
            "media_url": tweet.extended_entities["media"][1]["media_url_https"],
        })
    return pd.DataFrame(rows)
def _propose_df(self):
    """Collect situation-report tweets, dated from their French date string."""
    pattern = r"Recevez la situation .* au (\d{1,2} [a-z]+ 202\d)\."
    rows = []
    for tweet in self.tweets:
        found = re.search(pattern, tweet.full_text)
        if not found:
            continue
        report_date = clean_date(found.group(1), "%d %B %Y", lang="fr")
        if self.stop_search(report_date):
            break
        rows.append({
            "date": report_date,
            "text": tweet.full_text,
            "source_url": self.build_post_url(tweet.id),
            "media_url": tweet.entities["media"][0]["media_url_https"]
            if "media" in tweet.entities else None,
        })
    return pd.DataFrame(rows)
def _propose_df(self):
    """Collect 'Comunicado N°' tweets, deduplicated by bulletin number."""
    pattern = r"Comunicado N° (\d{3,4}).*"
    rows = []
    for tweet in self.tweets:
        found = re.search(pattern, tweet.full_text)
        if not found:
            continue
        # Timestamps are converted to Panama local time before dating the row.
        local_date = clean_date(
            from_tz_to_tz(tweet.created_at, to_tz="America/Panama"))
        if self.stop_search(local_date):
            break
        rows.append({
            "date": local_date,
            "text": tweet.full_text,
            "source_url": self.build_post_url(tweet.id),
            "num": found.group(1),
        })
        self.tweets_relevant.append(tweet)
    # Several tweets can share a bulletin number; keep the last-seen row.
    frame = pd.DataFrame(rows)
    return frame.drop_duplicates(subset=["num"], keep="last")
def _propose_df(self):
    """Collect 'Vaccination Updates' tweets, dated from the text itself."""
    pattern = r"COVID-19 : Vaccination Updates\n\n(\d{1,2}\.\d{1,2}\.202\d).*"
    rows = []
    for tweet in self.tweets:
        found = re.search(pattern, tweet.full_text)
        if not found:
            continue
        report_date = clean_date(found.group(1), "%d.%m.%Y")
        if self.stop_search(report_date):
            break
        rows.append({
            "date": report_date,
            "text": tweet.full_text,
            "source_url": self.build_post_url(tweet.id),
            "media_url": tweet.entities["media"][0]["media_url_https"]
            if "media" in tweet.entities else None,
        })
    return pd.DataFrame(rows)
def _parse_ds_data(df: pd.DataFrame, source: str) -> pd.Series:
    """Build the standard metrics Series from the national totals in *df*.

    The date is the most recent registered-vaccine date across rows,
    excluding "Sanidad Exterior".
    """
    # Fix: the original evaluated this same dropna().max() expression on a
    # bare first line and discarded the result (dead statement); removed.
    return pd.Series(
        data={
            "total_vaccinations": df.loc["Totales", "Dosis administradas (2)"].item(),
            "people_vaccinated": df.loc["Totales", "Nº Personas con al menos 1 dosis"].item(),
            "people_fully_vaccinated": df.loc["Totales", "Nº Personas vacunadas(pauta completada)"].item(),
            "date": clean_date(df.loc[
                ~df.index.isin(["Sanidad Exterior"]),
                "Fecha de la última vacuna registrada (2)",
            ].dropna().max()),
            "source_url": source,
            "vaccine": ", ".join(_get_vaccine_names(df, translate=True)),
        })
def _propose_df(self):
    """Collect the minister's daily #COVID19 update tweets."""
    pattern = r"Minister of Health Lizzie Nkosi's #COVID19 update on (\d{1,2} [a-zA-Z]+ 202\d)"
    rows = []
    for tweet in self.tweets:
        found = re.search(pattern, tweet.full_text)
        if not found:
            continue
        report_date = clean_date(found.group(1), "%d %B %Y")
        if self.stop_search(report_date):
            break
        rows.append({
            "date": report_date,
            "text": tweet.full_text,
            "source_url": self.build_post_url(tweet.id),
            "media_url": tweet.extended_entities["media"][0]["media_url_https"],
        })
    return pd.DataFrame(rows)
def main(paths):
    """Scrape Guatemala's vaccination dashboard and record today's figures.

    Opens the dashboard in headless Chrome, navigates to the vaccination
    tab (syringe icon), reads both dose counters and the date shown in the
    logo element, then persists everything via increment().
    """
    data = {
        "location": "Guatemala",
        "source_url": "https://gtmvigilanciacovid.shinyapps.io/3869aac0fb95d6baf2c80f19f2da5f98",
        "vaccine": "Moderna, Oxford/AstraZeneca",
    }
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.maximize_window()  # For maximizing window
        driver.implicitly_wait(20)  # gives an implicit wait for 20 seconds
        driver.get(data["source_url"])
        driver.find_element_by_class_name("fa-syringe").click()
        # The report date is embedded in the logo element's text.
        date = driver.find_element_by_class_name("logo").text
        dose1 = (driver.find_element_by_id(
            "dosisaplicadas1").find_element_by_tag_name("h3").text)
        dose2 = (driver.find_element_by_id(
            "dosisaplicadas2").find_element_by_tag_name("h3").text)
    data["people_vaccinated"] = clean_count(dose1)
    data["people_fully_vaccinated"] = clean_count(dose2)
    # Doses = first doses + second doses.
    data["total_vaccinations"] = (data["people_vaccinated"] +
                                  data["people_fully_vaccinated"])
    date = re.search(r"\d+/\d+/202\d", date).group(0)
    data["date"] = clean_date(date, "%d/%m/%Y")
    increment(
        paths=paths,
        location=data["location"],
        total_vaccinations=data["total_vaccinations"],
        people_vaccinated=data["people_vaccinated"],
        people_fully_vaccinated=data["people_fully_vaccinated"],
        date=data["date"],
        source_url=data["source_url"],
        vaccine=data["vaccine"],
    )
def _propose_df(self):
    """Collect vaccination-report tweets (Spanish), parsing both totals.

    Missing figures become pd.NA; people_fully_vaccinated is derived as
    total doses minus people vaccinated (NA-propagating when either side
    is missing).
    """
    regex = r"VACUNACIÓN #COVID19 \| Reporte del (\d{1,2}\.\d{1,2}\.202\d) - \d{1,2}:\d{1,2}"
    data = []
    for tweet in self.tweets:
        match = re.search(regex, tweet.full_text)
        if match:
            regex_doses = r"Total Dosis Administradas: ([\d\.]+)"
            doses_match = re.search(regex_doses, tweet.full_text)
            if doses_match:
                total_vaccinations = clean_count(doses_match.group(1))
            else:
                total_vaccinations = pd.NA
            regex_people = r"Total personas vacunadas: ([\d\.]+)"
            people_match = re.search(regex_people, tweet.full_text)
            if people_match:
                people_vaccinated = clean_count(people_match.group(1))
            else:
                people_vaccinated = pd.NA
            people_fully_vaccinated = total_vaccinations - people_vaccinated
            dt = clean_date(match.group(1), "%d.%m.%Y")
            if self.stop_search(dt):
                break
            data.append(
                {
                    "date": dt,
                    "total_vaccinations": total_vaccinations,
                    "people_vaccinated": people_vaccinated,
                    "people_fully_vaccinated": people_fully_vaccinated,
                    "text": tweet.full_text,
                    # Fix: the original left a debug placeholder (1) here with
                    # the real call commented out; restore the post URL,
                    # consistent with the other scrapers in this file.
                    "source_url": self.build_post_url(tweet.id),
                    "media_url": tweet.extended_entities["media"][0][
                        "media_url_https"
                    ],
                }
            )
    df = pd.DataFrame(data)
    return df
def _propose_df(self):
    """Collect daily-update tweets, dated from the global-update date in the text."""
    pattern = (
        r"Trouvez ci-bas les données du \d{1,2} [a-zA-Z]+ et la mise à jour globale à la date du (\d{1,2}-\d{1,2}"
        r"-202\d)\."
    )
    rows = []
    for tweet in self.tweets:
        found = re.search(pattern, tweet.full_text)
        if not found:
            continue
        report_date = clean_date(found.group(1), "%d-%m-%Y")
        if self.stop_search(report_date):
            break
        rows.append({
            "date": report_date,
            "text": tweet.full_text,
            "source_url": self.build_post_url(tweet.id),
            "media_url": tweet.extended_entities["media"][0]["media_url_https"],
        })
    return pd.DataFrame(rows)