예제 #1
0
 def _propose_df(self):
     """Propose a vaccination-data DataFrame parsed from the account's tweets.

     Three patterns are tried in decreasing order of detail: a full update
     (dose counts + date), a date-only update, and a bare mention of the
     vaccination update.  Iteration stops as soon as ``self.stop_search``
     reports that a parsed date is past the search window.
     """
     pattern_full = (
         r"COVID-19 Vaccination Update:\n\n1st and second dose — (([a-zA-Z]+) (\d{1,2})(?:th|nd|rd|st) (202\d)), in 36 States \+ the FCT\. \n\n([0-9,]+) eligible "
         r"Nigerians have been vaccinated with first dose while ([0-9,]+) of Nigerians vaccinated with 1st dose have collected their 2nd dose\."
     )
     pattern_dated = r"COVID-19 Vaccination Update for (([a-zA-Z]+) (\d{1,2})(?:th|nd|rd|st),? (202\d)), in 36 States \+ the FCT\. "
     pattern_bare = r"COVID-19 Vaccination Update"
     records = []
     for tweet in self.tweets:
         full = re.search(pattern_full, tweet.full_text)
         dated = re.search(pattern_dated, tweet.full_text)
         bare = re.search(pattern_bare, tweet.full_text)
         if full:
             dose1 = clean_count(full.group(5))
             dose2 = clean_count(full.group(6))
             dt = clean_date(" ".join(full.group(2, 3, 4)), "%B %d %Y")
             if self.stop_search(dt):
                 break
             records.append({
                 "date": dt,
                 "total_vaccinations": dose1 + dose2,
                 "people_vaccinated": dose1,
                 "people_fully_vaccinated": dose2,
                 "text": tweet.full_text,
                 "source_url": self.build_post_url(tweet.id),
                 "media_url": tweet.extended_entities["media"][0]["media_url_https"],
             })
         elif dated:
             dt = clean_date(" ".join(dated.group(2, 3, 4)), "%B %d %Y")
             if self.stop_search(dt):
                 break
             records.append({
                 "date": dt,
                 "text": tweet.full_text,
                 "source_url": self.build_post_url(tweet.id),
                 "media_url": tweet.extended_entities["media"][0]["media_url_https"],
             })
         elif bare:
             records.append({
                 "text": tweet.full_text,
                 "source_url": self.build_post_url(tweet.id),
                 "media_url": tweet.extended_entities["media"][0]["media_url_https"],
             })
     return pd.DataFrame(records)
예제 #2
0
 def parse_data(self, soup: BeautifulSoup) -> pd.Series:
     """Parse the newest situation-report PDF linked from *soup* into a Series."""
     # Locate the latest report and pull its raw text.
     pdf_path = self._parse_last_pdf_link(soup)
     text = self._extract_text_from_pdf(pdf_path)
     # Per-vaccine dose table -> aggregate metrics.
     df_vax = self._parse_vaccines_table_as_df(text)
     dose1_total = df_vax.doses_1.sum()
     dose2_total = df_vax.doses_2.sum()
     vaccine_names = ", ".join(df_vax.vaccine.map(vaccines_mapping))
     # Report date, printed as dd.mm.yyyy after "Situation Report".
     date_match = re.search(r"Situation Report\s+([\d\.]{10})", text)
     report_date = clean_date(date_match.group(1), "%d.%m.%Y")
     return pd.Series(
         data={
             "total_vaccinations": dose1_total + dose2_total,
             "people_vaccinated": dose1_total,
             "people_fully_vaccinated": dose2_total,
             "date": report_date,
             "source_url": pdf_path,
             "vaccine": vaccine_names,
             "location": self.location,
         })
예제 #3
0
def read(source: str) -> pd.Series:
    """Scrape the vaccination counters rendered on the dashboard at *source*."""
    options = Options()
    options.add_argument("--headless")

    with webdriver.Chrome(options=options) as driver:
        driver.get(source)
        # Give the page time to render its counters.
        time.sleep(3)

        for h5 in driver.find_elements_by_tag_name("h5"):
            # Each figure sits in the <div> immediately preceding its <h5> label.
            if "Primera dosis" in h5.text:
                people_vaccinated = clean_count(
                    h5.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Total dosis aplicadas" in h5.text:
                total_vaccinations = clean_count(
                    h5.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Población completamente vacunada" in h5.text:
                people_fully_vaccinated = clean_count(
                    h5.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Acumulados al" in h5.text:
                date = clean_date(h5.text, "Acumulados al %d de %B de %Y",
                                  "es")

    return pd.Series(data={
        "date": date,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_vaccinations": total_vaccinations,
    })
예제 #4
0
def connect_parse_data(source: str) -> pd.Series:
    """Scrape dose counters and the 'as of' date from the dashboard at *source*."""
    options = Options()
    options.add_argument("--headless")

    with webdriver.Chrome(options=options) as driver:
        driver.get(source)
        # The dashboard is slow to populate; wait before reading.
        time.sleep(10)

        raw_date = driver.find_element_by_class_name(
            "as_of").find_element_by_tag_name("span").text
        date = clean_date(raw_date, "%d.%m.%Y")

        for block in driver.find_elements_by_class_name("counter_block"):
            # "1 ДОЗУ" / "2 ДОЗИ" label the first- and second-dose counters.
            if "1 ДОЗУ" in block.text:
                people_vaccinated = block.find_element_by_tag_name("h2").text
            if "2 ДОЗИ" in block.text:
                people_fully_vaccinated = block.find_element_by_tag_name(
                    "h2").text

    return pd.Series(data={
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": date,
    })
예제 #5
0
def connect_parse_data(source: str, source_old: str) -> pd.Series:
    """Scrape vaccination counters from the new dashboard, cross-checking the
    running total against the old dashboard before trusting its date.

    Args:
        source: URL of the current dashboard (the three counters).
        source_old: URL of the legacy dashboard (sanity check + report date).

    Raises:
        ValueError: when the two dashboards disagree on total vaccinations.
    """
    op = Options()
    op.add_argument("--headless")

    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        time.sleep(5)

        total_vaccinations = driver.find_element_by_id("counter1").text
        people_vaccinated = driver.find_element_by_id("counter2").text
        people_fully_vaccinated = driver.find_element_by_id("counter3").text

        driver.get(source_old)
        time.sleep(5)

        # Sanity check: both dashboards must report the same cumulative total.
        total_vaccinations_old = driver.find_element_by_id("counter1").text
        if total_vaccinations != total_vaccinations_old:
            # Fixed message: the implicit string concatenation previously
            # produced "ConsiderIntroducing" (missing space between literals).
            raise ValueError(
                "Both dashboards may not be synced and hence may refer to different timestamps. Consider "
                "introducing the timestamp manually.")
        date = driver.find_element_by_id("pupdateddate").text
        date = clean_date(date, "Updated %d %b, %Y")

    data = {
        "total_vaccinations": clean_count(total_vaccinations),
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": date,
    }
    return pd.Series(data=data)
예제 #6
0
 def _propose_df(self):
     """Propose a DataFrame of vaccination totals mined from tweets.

     The tweet's creation timestamp is used as the record date; the date in
     the tweet text only anchors the regex match.
     """
     regex = r"COVID-19 update: As at (\d{1,2} [a-zA-Z]+ 202\d), .* a total of ([\d ]+) people have been vaccinated"
     data = []
     for tweet in self.tweets:
         match = re.search(regex, tweet.full_text)
         if match:
             people_vaccinated = clean_count(match.group(2))
             # Fixed: a clean_date() of group(1) was computed and immediately
             # overwritten below — dead code removed; created_at stays the date.
             dt = tweet.created_at.strftime("%Y-%m-%d")
             if self.stop_search(dt):
                 break
             data.append({
                 "date": dt,
                 "people_vaccinated": people_vaccinated,
                 "text": tweet.full_text,
                 "source_url": self.build_post_url(tweet.id),
                 "media_url": tweet.entities["media"][0]["media_url_https"]
                 if "media" in tweet.entities else None,
             })
     return pd.DataFrame(data)
예제 #7
0
    def _parse_data(self, worksheet):
        """Extract cumulative vaccination metrics from spreadsheet *worksheet*.

        Scans every cell for the three known row labels; derives
        people_vaccinated as total − fully + single-dose (single-dose schemes
        count once in totals but complete a person).

        Raises:
            ValueError: when any expected label is missing from the sheet.
        """
        # Initialize so a missing label triggers the explicit error below
        # instead of a NameError (previous behavior).
        total_vaccinations = None
        people_fully_vaccinated = None
        unique_doses = None
        date_str = None

        for row in worksheet.values():
            for value in row:
                if "Total dosis aplicadas al " in str(value):
                    total_vaccinations = row[-1]
                    # The label ends with the report date as dd-mm-yyyy.
                    date_raw = re.search(r"[\d-]{10}$", value).group(0)
                    date_str = clean_date(date_raw, "%d-%m-%Y")
                elif value == "Esquemas completos segundas + únicas dosis":
                    people_fully_vaccinated = row[-1]
                elif value == "Total únicas dosis acumuladas":
                    unique_doses = row[-1]

        if (total_vaccinations is None or people_fully_vaccinated is None
                or unique_doses is None or date_str is None):
            raise ValueError(
                "Date is not where it is expected be! Check worksheet")
        return pd.Series({
            "date": date_str,
            "total_vaccinations": total_vaccinations,
            "people_fully_vaccinated": people_fully_vaccinated,
            "people_vaccinated": total_vaccinations - people_fully_vaccinated + unique_doses,
        })
예제 #8
0
def parse_date(soup: BeautifulSoup) -> str:
    """Return the 'as of' date from the 'Vaccination Data' heading in *soup*.

    Raises:
        ValueError: when no <h3> contains "Vaccination Data".
    """
    for h3 in soup.find_all("h3"):
        if "Vaccination Data" in h3.text:
            break
    else:
        # Previously the loop fell through and the regex ran against the last
        # <h3> on the page (or h3 was unbound) — fail loudly instead.
        raise ValueError("'Vaccination Data' heading not found; page layout may have changed")
    date = re.search(r"as of (\d+ \w+ \d+)", h3.text).group(1)
    date = clean_date(date, "%d %b %Y")
    return date
예제 #9
0
def parse_date(filename):
    """Return the cleaned report date found on page 1 of the PDF *filename*.

    The date is expected alone on a line in d.m.yyyy form.
    """
    # Read pdf (for date)
    with open(filename, mode="rb") as f:
        reader = PyPDF2.PdfFileReader(f)
        page = reader.getPage(0)
        text = page.extractText()
    # Get date. Dots are now escaped: the previous pattern's bare '.' matched
    # any character, so e.g. "12a3b2021" would have been accepted.
    match = re.search(r"\n(?P<count>\d{1,2}\.\d{1,2}\.\d{4})\n", text)
    return clean_date(match.group("count"), "%d.%m.%Y")
예제 #10
0
 def read(self) -> pd.Series:
     """Fetch the parsed payload and assemble the standard metrics Series."""
     payload = self._parse_data()
     metrics = {
         "total_vaccinations": payload["Doses_Administered"],
         "people_vaccinated": payload["Administered_Dose1_Recip"],
         "people_fully_vaccinated": payload["Series_Complete_Yes"],
         "date": clean_date(payload["Date"], "%Y-%m-%d"),
         "vaccine": self._parse_vaccines(payload),
     }
     return pd.Series(metrics)
예제 #11
0
 def _parse_date(self, soup: BeautifulSoup) -> str:
     """Return the 'last updated' date parsed from the page's <p> elements.

     Raises:
         ValueError: when the date paragraph is missing or ambiguous.
     """
     matches = [
         p for p in soup.find_all("p")
         if p.find(text=re.compile(self.regex["date"]))
     ]
     # Exactly one paragraph must carry the date. The previous `> 1` check let
     # the zero-match case fall through to an IndexError below.
     if len(matches) != 1:
         raise ValueError("Format of source has changed")
     date_str = clean_date(matches[0].text, "ажурирано %d.%m.%Y")
     return date_str
예제 #12
0
 def read(self):
     """Fetch the first ArcGIS feature's attributes and map them to a Series."""
     attributes = requests.get(
         self.source_url).json()["features"][0]["attributes"]
     # NOTE(review): "Vaccine_total_last24" reads like a 24-hour delta being
     # mapped to people_fully_vaccinated — confirm against the source schema.
     return pd.Series({
         "total_vaccinations": attributes["Vaccine_total"],
         "people_fully_vaccinated": attributes["Vaccine_total_last24"],
         # Timestamp arrives in epoch milliseconds.
         "date": clean_date(datetime.fromtimestamp(attributes["Date"] / 1000)),
     })
예제 #13
0
 def _parse_date(self, driver):
     """Drive the dashboard to its data-export view and read the report date.

     Clicks through the export UI (with fixed waits for each step to render),
     switches to the newly opened window, and parses the date shown there.
     """
     driver.find_element_by_id("tabZoneId87").click()
     time.sleep(1)
     driver.find_element_by_id("download-ToolbarButton").click()
     time.sleep(2)
     # Plain string literal: the previous f-string had no placeholders.
     driver.find_element_by_xpath(
         "//button[contains(text(),'Data')]").click()
     time.sleep(4)
     # The export opens in a second window; switch to it.
     window_after = driver.window_handles[1]
     driver.switch_to.window(window_after)
     time.sleep(2)
     date_str = driver.find_element_by_tag_name("tbody").text
     return clean_date(date_str, "%m/%d/%Y")
예제 #14
0
def parse_data(data: dict) -> pd.Series:
    """Map the API payload *data* onto the standard vaccination Series."""
    return pd.Series(
        data={
            "date": clean_date(data["updated"], "%Y/%m/%d"),
            "people_vaccinated": data["progress"],
            "people_fully_vaccinated": data["completed"],
            "vaccine": ", ".join(_get_vaccine_names(data, translate=True)),
        })
예제 #15
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read the three odometer counters and the report date from *soup*."""
    odometers = soup.find_all(class_="odometer")

    # The date (dd.mm.yyyy) is embedded in the counter element's text.
    raw_date = re.search(r"[\d\.]{10}", soup.find(class_="counter").text).group(0)
    report_date = clean_date(raw_date, "%d.%m.%Y")

    total, dose1, dose2 = (int(o["data-count"]) for o in odometers[:3])
    return pd.Series(
        data={
            "total_vaccinations": total,
            "people_vaccinated": dose1,
            "people_fully_vaccinated": dose2,
            "date": report_date,
        })
예제 #16
0
def read(source: str) -> pd.Series:
    """Query the vaccination API at *source* and return the headline figures."""
    payload = requests.get(source).json()
    vaccination = payload["topBlock"]["vaccination"]

    return pd.Series({
        "date": clean_date(payload["timestamp"], "%Y-%m-%d %H:%M:%S"),
        "people_vaccinated": vaccination["tot_dose_1"],
        "people_fully_vaccinated": vaccination["tot_dose_2"],
        "total_vaccinations": vaccination["total_doses"],
    })
예제 #17
0
 def parse_data_pdf(self, link) -> dict:
     """Extract people vaccinated and the report date from the PDF at *link*."""
     text = self._get_pdf_text(link)
     pattern = (
         r"([\d,]+) people have been vaccinated against COVID-19 as of (\d{1,2})(?:th|nd|st|rd) ([a-zA-Z]+) (202\d)"
     )
     match = re.search(pattern, text)
     people_vaccinated = clean_count(match.group(1))
     report_date = clean_date(" ".join(match.group(2, 3, 4)), "%d %B %Y")
     # The report gives one figure: totals equal first doses, second doses 0.
     return {
         "total_vaccinations": people_vaccinated,
         "people_vaccinated": people_vaccinated,
         "people_fully_vaccinated": 0,
         "date": report_date,
         "source_url": link,
     }
예제 #18
0
 def parse_data(self, soup: BeautifulSoup) -> pd.Series:
     """Scrape date, total doses, and people counts from the page text."""
     fields = {}
     title_match = re.search(self.regex["title"], soup.text)
     if title_match:
         # The title carries day + month only; assume the current year.
         fields["date"] = clean_date(
             f"{title_match.group(1)} {datetime.now().year}",
             "%d de %B %Y",
             lang="es")
         fields["total_vaccinations"] = clean_count(title_match.group(2))
     data_match = re.search(self.regex["data"], soup.text)
     if data_match:
         fields["people_vaccinated"] = clean_count(data_match.group(1))
         fields["people_fully_vaccinated"] = clean_count(data_match.group(3))
     return pd.Series(fields)
예제 #19
0
def parse_date(df: dict) -> str:
    """Return the report date encoded in the "Journée du d.m.yyyy" column header.

    Works against the old layout where headers sit in the first data row.

    Raises:
        ValueError: unless exactly one column matches the date pattern.
    """
    # Old layout: headers live in row 0.
    colnames = df.loc[0]
    # New layout (kept for the migration):
    # colnames = df.columns
    # Fixed: the previous `date = df.loc[0, "Unnamed: 1"].replace(...)` result
    # was immediately discarded (dead code) and forced a spurious dependency
    # on an "Unnamed: 1" column.
    matches = [
        re.search(r"Journée du (\d{1,2}.\d{1,2}.\d{4})", col)
        for col in colnames.astype(str)
    ]
    hits = [m for m in matches if m is not None]
    if len(hits) != 1:
        raise ValueError("Something changed in the columns!")
    return clean_date(hits[0].group(1), "%d.%m.%Y")
예제 #20
0
 def _propose_df(self):
     """Propose a DataFrame of dated vaccination-update tweets."""
     pattern = r"COVID-19 : Vaccination Updates\n\n(\d{1,2}\.\d{1,2}\.202\d).*"
     rows = []
     for tweet in self.tweets:
         hit = re.search(pattern, tweet.full_text)
         if not hit:
             continue
         dt = clean_date(hit.group(1), "%d.%m.%Y")
         # Stop once we reach tweets older than the search window.
         if self.stop_search(dt):
             break
         media = tweet.entities["media"][0]["media_url_https"] if "media" in tweet.entities else None
         rows.append({
             "date": dt,
             "text": tweet.full_text,
             "source_url": self.build_post_url(tweet.id),
             "media_url": media,
         })
     return pd.DataFrame(rows)
예제 #21
0
 def _propose_df(self):
     """Propose a DataFrame of dated situation-update tweets (French dates)."""
     pattern = r"Recevez la situation .* au (\d{1,2} [a-z]+ 202\d)\."
     rows = []
     for tweet in self.tweets:
         hit = re.search(pattern, tweet.full_text)
         if not hit:
             continue
         dt = clean_date(hit.group(1), "%d %B %Y", lang="fr")
         # Stop once we reach tweets older than the search window.
         if self.stop_search(dt):
             break
         media = tweet.entities["media"][0]["media_url_https"] if "media" in tweet.entities else None
         rows.append({
             "date": dt,
             "text": tweet.full_text,
             "source_url": self.build_post_url(tweet.id),
             "media_url": media,
         })
     return pd.DataFrame(rows)
예제 #22
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read the three dashboard counters and the source date from *soup*."""
    counters = soup.find_all(class_="counter")
    total_vaccinations = int(counters[0].text)
    people_vaccinated = int(counters[1].text)
    people_fully_vaccinated = int(counters[2].text)
    # Sanity: doses >= people with 1+ dose >= fully vaccinated.
    assert total_vaccinations >= people_vaccinated
    assert people_vaccinated >= people_fully_vaccinated

    source_line = soup.find(class_="fuente").text
    raw_date = re.search(r"\d{2}-\d{2}-\d{4}", source_line).group(0)
    report_date = clean_date(raw_date, "%d-%m-%Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": report_date,
    })
예제 #23
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Derive dose totals from the two 'count' widgets and the report date."""
    counts = soup.find_all(class_="count")
    people_vaccinated = int(counts[0]["data-count"])
    people_fully_vaccinated = int(counts[1]["data-count"])
    # Sanity: at least-one-dose count must cover the fully vaccinated.
    assert people_vaccinated >= people_fully_vaccinated
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    heading = soup.find(class_="reportdate").text
    raw_date = re.search(r"\d+ \w+ 202\d", heading).group(0)
    report_date = clean_date(raw_date, "%d %b %Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": report_date,
    })
예제 #24
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Download the newest situation-report PDF linked in *soup* and parse it.

    Raises:
        ValueError: when no situation-report link is present on the page.
    """
    # Get path to newest pdf
    links = soup.find(class_="rt-article").find_all("a")
    for link in links:
        if "sitrep-sl-en" in link["href"]:
            pdf_path = "https://www.epid.gov.lk" + link["href"]
            break
    else:
        # Previously fell through with `pdf_path` unbound, crashing below
        # with a NameError; fail with a clear message instead.
        raise ValueError("No situation-report PDF link found; page layout may have changed")

    tf = tempfile.NamedTemporaryFile()

    with open(tf.name, mode="wb") as f:
        f.write(requests.get(pdf_path).content)

    with open(tf.name, mode="rb") as f:
        reader = PyPDF2.PdfFileReader(f)
        page = reader.getPage(0)
        text = page.extractText().replace("\n", "")

    # Each vaccine row reports "<dose1> <dose2>".
    covishield_data = re.search(r"Covishield Vaccine +(\d+) (\d+)", text)
    covishield_dose1 = clean_count(covishield_data.group(1))
    covishield_dose2 = clean_count(covishield_data.group(2))

    sinopharm_data = re.search(r"Sinopharm Vaccine +(\d+) (\d+)", text)
    sinopharm_dose1 = clean_count(sinopharm_data.group(1))
    sinopharm_dose2 = clean_count(sinopharm_data.group(2))

    total_vaccinations = covishield_dose1 + covishield_dose2 + sinopharm_dose1 + sinopharm_dose2
    people_vaccinated = covishield_dose1 + sinopharm_dose1
    people_fully_vaccinated = covishield_dose2 + sinopharm_dose2

    # Report date printed as dd.mm.yyyy after "Situation Report".
    regex = r"Situation Report\s+([\d\.]{10})"
    date = re.search(regex, text).group(1)
    date = clean_date(date, "%d.%m.%Y")

    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
            "source_url": pdf_path,
        })
예제 #25
0
 def _parse_data(self, source: str) -> pd.Series:
     """Parse the raw ministry spreadsheet at *source* into a metrics Series."""
     df = self._get_data_raw(source)
     # Parse metrics.
     # "合計" = total row; "接種回数" = doses administered;
     # "内1回目"/"内2回目" = of which first/second doses.
     # NOTE(review): .item() vs .sum() — presumably "接種回数" selects a single
     # cell while the dose-split labels select several; confirm against the
     # actual frame produced by _get_data_raw.
     total_vaccinations = df.loc["合計", "接種回数"].item()
     people_vaccinated = df.loc["合計", "内1回目"].sum()
     people_fully_vaccinated = df.loc["合計", "内2回目"].sum()
     # Parse date: the index mixes datetimes with other labels, so keep only
     # the datetime entries and use the most recent one.
     date = clean_date(
         max(dt for dt in df.index.values if isinstance(dt, datetime)))
     # Parse vaccines
     vaccines = self._parse_vaccines(df)
     return pd.Series(
         data={
             "total_vaccinations": total_vaccinations,
             "people_vaccinated": people_vaccinated,
             "people_fully_vaccinated": people_fully_vaccinated,
             "date": date,
             "vaccine": vaccines
         })
예제 #26
0
def read(source: str) -> pd.Series:
    """Scrape vaccination figures from the stats page at *source*.

    Sends browser-like headers, then parses the figures out of the first
    paragraph of the ``#data`` div with Russian-language regexes.
    """
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    soup = BeautifulSoup(
        requests.get(source, headers=headers).content, "html.parser")
    text = soup.find("div", id="data").find("p").text

    # "На сегодня (dd.mm.yy)" — the report date.
    report_date = clean_date(
        re.search(r"На сегодня \(([\d\.]{8})\)", text).group(1), "%d.%m.%y")

    # "… привито хотя бы одним компонентом вакцины" — at least one dose.
    dose1 = clean_count(re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения[^)]*\) - привито хотя бы одним компонентом вакцины",
        text).group(1))

    # "… полностью привито" — fully vaccinated.
    dose2 = clean_count(re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения,?[^)]*\) - полностью привито",
        text).group(1))

    # "… всего прививок сделано" — total doses administered.
    doses_total = clean_count(re.search(
        r"([\d\s]+) шт\. - всего прививок сделано", text).group(1))

    return pd.Series({
        "total_vaccinations": doses_total,
        "people_vaccinated": dose1,
        "people_fully_vaccinated": dose2,
        "date": report_date,
    })
예제 #27
0
 def _propose_df(self):
     """Propose a DataFrame of vaccination reports parsed from tweets.

     Tweets matching the report header are mined for total doses and people
     vaccinated; a figure missing from the tweet is recorded as ``pd.NA``.
     """
     regex = r"VACUNACIÓN #COVID19 \| Reporte del (\d{1,2}\.\d{1,2}\.202\d) - \d{1,2}:\d{1,2}"
     data = []
     for tweet in self.tweets:
         match = re.search(regex, tweet.full_text)
         if not match:
             continue
         regex_doses = r"Total Dosis Administradas: ([\d\.]+)"
         doses_match = re.search(regex_doses, tweet.full_text)
         total_vaccinations = clean_count(doses_match.group(1)) if doses_match else pd.NA
         regex_people = r"Total personas vacunadas: ([\d\.]+)"
         people_match = re.search(regex_people, tweet.full_text)
         people_vaccinated = clean_count(people_match.group(1)) if people_match else pd.NA
         # pd.NA propagates through the subtraction when either figure is missing.
         people_fully_vaccinated = total_vaccinations - people_vaccinated
         dt = clean_date(match.group(1), "%d.%m.%Y")
         if self.stop_search(dt):
             break
         data.append({
             "date": dt,
             "total_vaccinations": total_vaccinations,
             "people_vaccinated": people_vaccinated,
             "people_fully_vaccinated": people_fully_vaccinated,
             "text": tweet.full_text,
             # Fixed: was hard-coded to 1 (debug leftover) with the real call
             # left commented out; restored, matching the sibling scrapers.
             "source_url": self.build_post_url(tweet.id),
             "media_url": tweet.extended_entities["media"][0]["media_url_https"],
         })
     return pd.DataFrame(data)
예제 #28
0
def read(source: str) -> pd.Series:
    """Scrape the three counters and the last-update date from *source*."""
    soup = get_soup(source)

    counters = soup.find_all(class_="counter")
    partial = clean_count(counters[0].text)
    full = clean_count(counters[1].text)
    doses_total = clean_count(counters[2].text)
    # People with at least one dose = partially + fully vaccinated.
    at_least_one = partial + full

    raw_date = soup.find("span", id="last-update").text
    report_date = clean_date(
        re.search(r"\d+.*202\d", raw_date).group(0), "%d %B, %Y")

    return pd.Series(data={
        "total_vaccinations": doses_total,
        "people_vaccinated": at_least_one,
        "people_fully_vaccinated": full,
        "date": report_date,
        "source_url": source,
    })
예제 #29
0
 def _propose_df(self):
     """Propose a DataFrame of the minister's dated #COVID19 update tweets."""
     pattern = r"Minister of Health Lizzie Nkosi's #COVID19 update on (\d{1,2} [a-zA-Z]+ 202\d)"
     rows = []
     for tweet in self.tweets:
         hit = re.search(pattern, tweet.full_text)
         if not hit:
             continue
         dt = clean_date(hit.group(1), "%d %B %Y")
         # Stop once we reach tweets older than the search window.
         if self.stop_search(dt):
             break
         rows.append({
             "date": dt,
             "text": tweet.full_text,
             "source_url": self.build_post_url(tweet.id),
             "media_url": tweet.extended_entities["media"][0]["media_url_https"],
         })
     return pd.DataFrame(rows)
예제 #30
0
def connect_parse_data(source: str) -> pd.Series:
    """Read dose counters and the report date from the page at *source*."""
    soup = get_soup(source)
    widgets = soup.find_all(class_="repart-stlucia")
    people_vaccinated = clean_count(widgets[0].text)
    people_fully_vaccinated = clean_count(widgets[1].text)
    # Total doses = first doses + second doses.
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    heading = soup.find(class_="h2-blue").text
    report_date = clean_date(
        re.search(r"\w+ +\d+, +202\d", heading).group(0), "%B %d, %Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": report_date,
    })