示例#1
0
def read(source_daily: str, source_weekly: str) -> pd.DataFrame:
    """Read vaccination metrics from the daily and weekly report pages.

    Args:
        source_daily (str): URL of the daily report page.
        source_weekly (str): URL of the weekly report page.

    Returns:
        pd.DataFrame: Two records — the weekly one (all metrics) and the
            daily one (total_vaccinations only).

    Raises:
        ValueError: If the daily page no longer contains the expected block.
    """
    # Daily
    soup_daily = get_soup(source_daily)
    for div in soup_daily.find_all("div"):
        if div.text == "Vaccine doses administered":
            dose_block = div.parent.findChildren()[1]
            break
    else:
        # Previously a missing block fell through to a NameError on dose_block.
        raise ValueError("Daily page layout changed: dose block not found.")
    date_daily = parse_date_daily(dose_block)
    total_vaccinations_d = parse_data_daily(dose_block)

    # Weekly
    soup_weekly = get_soup(source_weekly)
    date_weekly = parse_date_weekly(soup_weekly)
    total_vaccinations_w, people_vaccinated, people_fully_vaccinated = parse_data_weekly(
        soup_weekly)

    df = pd.DataFrame.from_records([{
        "date": date_weekly,
        "total_vaccinations": total_vaccinations_w,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "source_url": source_weekly
    }, {
        "date": date_daily,
        "total_vaccinations": total_vaccinations_d,
        "source_url": source_daily
    }])
    return df
示例#2
0
 def read(self, last_update: str) -> pd.DataFrame:
     """Locate the newest monthly report page and parse its data."""
     yearly_page = get_soup(self.source_url)
     # The first link inside the content block points at the latest monthly report.
     content = yearly_page.find("div", class_="col-lg-12", id="content-detail")
     monthly_url = content.find("a")["href"]
     monthly_page = get_soup(monthly_url)
     return self._parse_data(monthly_page, last_update)
示例#3
0
def read(source: str) -> pd.Series:
    """Follow the infogram embed on `source` and extract date and metrics."""
    landing_soup = get_soup(source)
    infogram_url = parse_infogram_link(landing_soup)
    infogram_soup = get_soup(infogram_url)
    data = parse_infogram_data(infogram_soup)
    record = {"date": parse_infogram_date(data), "source_url": source}
    record.update(parse_infogram_vaccinations(data))
    return pd.Series(record)
示例#4
0
def read(source: str) -> pd.Series:
    """Follow the infogram embed on `source` and extract people_vaccinated and date."""
    landing_soup = get_soup(source)
    infogram_url = parse_infogram_link(landing_soup)
    infogram_soup = get_soup(infogram_url)
    data = parse_infogram_data(infogram_soup)
    record = {
        "people_vaccinated": parse_infogram_people_vaccinated(data),
        "date": parse_infogram_date(data),
        "source_url": source,
    }
    return pd.Series(record)
示例#5
0
def read(source: str):
    """Locate the metrics PDF linked from `source` and parse it into a Series."""
    soup = get_soup(source)
    url = parse_pdf_link(soup, source)
    # Guard against the source linking to a non-PDF document.
    if not url.endswith(".pdf"):
        raise ValueError(f"File reporting metrics is not a PDF: {url}!")
    return pd.Series(parse_data(url))
示例#6
0
def parse_vaccinations(elem) -> dict:
    """Extract vaccination metrics from the news article linked by `elem`.

    Args:
        elem: A tag inside a news card; the card's first <a> links to the
            article that is fetched and scanned.

    Returns:
        dict: Any of ``total_vaccinations`` / ``people_vaccinated`` /
            ``people_fully_vaccinated`` found in the article text; metrics
            whose pattern does not match are simply omitted.
    """
    # Get news text
    url = elem.find_parent(class_="card").find("a").get("href")
    soup = get_soup(url)
    text = "\n".join([p.text for p in soup.find("article").find_all("p")])

    # Find metrics. Patterns match the exact phrasing used by the source;
    # stale commented-out variants of these patterns have been removed.
    metrics = dict()
    total_vaccinations = re.search(r"疫苗劑數為(?P<count>[\d,]*)劑", text)
    people_vaccinated = re.search(r"已接種人數共有(?P<count>[\d,]*)人", text)
    people_fully_vaccinated = re.search(r"已完成接種2劑有(?P<count>[\d,]*)人", text)

    if total_vaccinations:
        metrics["total_vaccinations"] = clean_count(
            total_vaccinations.group(1))
    if people_vaccinated:
        metrics["people_vaccinated"] = clean_count(people_vaccinated.group(1))
    if people_fully_vaccinated:
        metrics["people_fully_vaccinated"] = clean_count(
            people_fully_vaccinated.group(1))
    return metrics
示例#7
0
 def read(self) -> pd.DataFrame:
     """Load data from the "Date" sheet of the linked workbook."""
     soup = utils.get_soup(self.source_url)
     link = self._parse_file_link(soup)
     # Debug print and dead commented-out code removed; only the "Date"
     # sheet is currently consumed.
     df = utils.read_xlsx_from_url(link, sheet_name="Date")
     return df
示例#8
0
def read(source: str) -> pd.Series:
    """Fetch the report page, locate its PDF and parse the metrics table."""
    page_url = f"{source}/Category/Page/9jFXNbCe-sFK9EImRRi2Og"
    soup = get_soup(page_url)
    pdf_url = parse_pdf_link(source, soup)
    table = parse_table(pdf_url)
    record = {
        "total_vaccinations": parse_total_vaccinations(table),
        "date": parse_date(table),
    }
    return pd.Series(record)
示例#9
0
 def _parse_pdf_link(self, soup) -> str:
     """Resolve the report PDF link, retrying while the viewer page loads.

     Args:
         soup: Parsed landing page containing the download link.

     Returns:
         str: Absolute URL of the PDF behind the viewer button.

     Raises:
         ValueError: If the viewer button never appears after 10 attempts
             (previously this fell through to a TypeError on ``a['href']``).
     """
     a = soup.find(class_="download").find("a")
     url_pdf = f"{self.source_url}{a['href']}"
     # The viewer page is rendered asynchronously; poll it a bounded number
     # of times until the button shows up.
     for _ in range(10):
         soup = get_soup(url_pdf)
         a = soup.find(class_="viewer-button")
         if a is not None:
             break
     else:
         raise ValueError("PDF viewer button not found after 10 attempts.")
     return f"{self.source_url}{a['href']}"
示例#10
0
def read(source: str) -> pd.Series:
    """Scrape dose metrics and report date from the page at `source`."""
    soup = get_soup(source)
    total_vaccinations, people_fully_vaccinated = parse_data(soup)
    record = {
        "total_vaccinations": total_vaccinations,
        "people_fully_vaccinated": people_fully_vaccinated,
        "source_url": source,
        "date": parse_date(soup),
    }
    return pd.Series(record)
示例#11
0
 def read_1(self):
     """Read the single HTML table published at `source_url_1`."""
     soup = get_soup(self.source_url_1)
     tables = pd.read_html(str(soup), header=0)
     # The page is expected to carry exactly one table; anything else means
     # the layout changed.
     if len(tables) != 1:
         raise ValueError(
             f"Only one table should be present. {len(tables)} tables detected."
         )
     return tables[0]
示例#12
0
 def read(self) -> pd.Series:
     """Read all three dose metrics plus the report date from the source page."""
     soup = get_soup(self.source_url)
     doses_total, doses_first, doses_second = self._parse_metrics(soup)
     record = {
         "total_vaccinations": doses_total,
         "people_vaccinated": doses_first,
         "people_fully_vaccinated": doses_second,
         "source_url": self.source_url,
         "date": self._parse_date(soup),
     }
     return pd.Series(record)
示例#13
0
 def read(self, last_update: str) -> pd.DataFrame:
     """Walk paginated listing pages, collecting records newer than `last_update`."""
     records = []
     offset = 0
     # Pages advance in steps of 5 entries, up to the configured page limit.
     while offset < 5 * self._num_max_pages:
         soup = get_soup(f"{self.source_url}/(offset)/{offset}/")
         page_records, proceed = self.parse_data(soup, last_update)
         records.extend(page_records)
         if not proceed:
             break
         offset += 5
     return pd.DataFrame(records)
示例#14
0
 def read(self) -> pd.Series:
     """Load dose metrics and report date from the source page.

     Returns:
         pd.Series: people_vaccinated, people_fully_vaccinated,
             total_vaccinations and date.
     """
     soup = get_soup(self.source_url)
     people_vaccinated, people_fully_vaccinated = self.parse_vaccinated(soup)
     # Parse the date once and reuse it (previously parsed twice and the
     # first result was discarded); also avoid re-wrapping the Series.
     date_str = self.parse_date(soup)
     return pd.Series({
         "people_vaccinated": people_vaccinated,
         "people_fully_vaccinated": people_fully_vaccinated,
         # NOTE(review): total is computed as first + second doses — confirm
         # this matches the source's definition of the two counters.
         "total_vaccinations": people_vaccinated + people_fully_vaccinated,
         "date": date_str,
     })
示例#15
0
    def vaccines_approved(self, location: str = None, original_names: bool = False) -> list:
        """Get list of approved vaccines in a country (or all if None specified).

        Args:
            location (str, optional): Country name. If None, retrieves all approved vaccines. Defaults to None.
            original_names (bool, optional): Set to True to keep vaccine from web. Defaults to False.

        Returns:
            list: Approved vaccines, or None if `location` cannot be resolved.
        """
        if not location:
            # No country given: return the global list.
            all_soup = get_soup(self.all_vaccines_url)
            return self._parse_vaccines_all(all_soup, original_names)
        try:
            country_url = self.get_country_url(location)
            country_soup = get_soup(country_url)
            return self._parse_vaccines_location(country_soup, original_names)
        except ValueError:
            return None
示例#16
0
 def parse_metrics(self) -> tuple:
     """Extract first- and second-dose counts from the counters table.

     Returns:
         tuple: (dose_1, dose_2). The larger value is reported as dose_1 —
             assumes first doses always exceed second doses; TODO confirm.

     Raises:
         ValueError: If the layout no longer exposes exactly two cells.
     """
     soup = get_soup(self.source_url)
     elems = soup.find(class_="vacunometro-cifras").find_all("td")
     if len(elems) != 2:
         # Fixed message: the old one said "More than two elemnts ...
         # class='vacunados'", which matched neither the condition (!= 2)
         # nor the class actually queried.
         raise ValueError(
             "Something changed in source layout. Expected exactly two "
             f"elements in class='vacunometro-cifras', found {len(elems)}."
         )
     values = [clean_count(elem.text) for elem in elems]
     dose_1 = max(values)
     dose_2 = min(values)
     return dose_1, dose_2
示例#17
0
def connect_parse_data(source: str) -> pd.Series:
    """Read the single vaccination counter published at `source`."""
    soup = get_soup(source)
    people_vaccinated = clean_count(soup.find(class_="count-up").text)
    # The page exposes only one counter, so totals mirror people vaccinated.
    record = {
        "total_vaccinations": people_vaccinated,
        "people_vaccinated": people_vaccinated,
    }
    return pd.Series(data=record)
示例#18
0
def read(source: str) -> pd.Series:
    """Extract vaccination metrics from the dashboard at `source`.

    Returns:
        pd.Series: total_vaccinations, people_vaccinated,
            people_fully_vaccinated and source_url.

    Raises:
        ValueError: If the expected counter label is not found (previously
            this fell through to a NameError on ``container``).
    """
    soup = get_soup(source)

    container = None
    for label in soup.find_all(class_="number-label"):
        if label.text == "Total vaccins administrés":
            container = label.parent.parent
            break
    if container is None:
        raise ValueError(
            "Layout changed: 'Total vaccins administrés' label not found.")

    return pd.Series(
        data={
            "total_vaccinations": parse_total_vaccinations(container),
            "people_vaccinated": parse_people_vaccinated(container),
            "people_fully_vaccinated": parse_people_fully_vaccinated(
                container),
            "source_url": source,
        })
示例#19
0
def read(source_daily: str, source_weekly: str) -> pd.DataFrame:
    """Combine the daily and weekly report pages into one dataframe."""
    # Daily report: only the running dose total is available.
    soup_daily = get_soup(source_daily)
    record_daily = {
        "date": parse_date_daily(soup_daily),
        "total_vaccinations": parse_data_daily(soup_daily),
        "source_url": source_daily,
    }

    # Weekly report: full metric breakdown.
    soup_weekly = get_soup(source_weekly)
    weekly_date = parse_date_weekly(soup_weekly)
    weekly_total, weekly_people, weekly_full = parse_data_weekly(soup_weekly)
    record_weekly = {
        "date": weekly_date,
        "total_vaccinations": weekly_total,
        "people_vaccinated": weekly_people,
        "people_fully_vaccinated": weekly_full,
        "source_url": source_weekly,
    }

    return pd.DataFrame.from_records([record_weekly, record_daily])
示例#20
0
 def parse_data(self, soup: BeautifulSoup, last_update: str) -> tuple:
     """Collect records newer than `last_update` from one listing page.

     Returns:
         tuple: (records, proceed) — proceed is False once an entry at or
         before `last_update` is reached, signalling the caller to stop
         paginating.
     """
     records = []
     for elem in self.get_elements(soup):
         if elem["date"] <= last_update:
             # Listing is ordered newest-first; nothing newer remains.
             return records, False
         news_soup = get_soup(elem["link"])
         records.append({
             "source_url": elem["link"],
             "date": elem["date"],
             **self.parse_data_news_page(news_soup),
         })
     return records, True
示例#21
0
 def parse_metrics(self) -> tuple:
     """Extract first- and second-dose counts from the labelled counters.

     Returns:
         tuple: (dose_1, dose_2) as cleaned integers.

     Raises:
         ValueError: If the layout does not expose exactly two counters, an
             unexpected label appears, or a dose label is missing.
     """
     soup = get_soup(self.source_url)
     elems = soup.find_all(class_="vacunados")
     if len(elems) != 2:
         # Fixed message: the old one said "More than two elemnts" (typo,
         # and it also fires when fewer than two are found).
         raise ValueError(
             "Something changed in source layout. Expected exactly two "
             f"elements with class='vacunados', found {len(elems)}."
         )
     dose_1 = dose_2 = None
     for elem in elems:
         spans = elem.find_all("span")
         if spans[0].text == "1ra Dosis":
             dose_1 = spans[1].text
         elif spans[0].text == "2da Dosis":
             dose_2 = spans[1].text
         else:
             raise ValueError(
                 "Something changed in source layout. Name different than '1ra Dosis' or '2da Dosis'"
             )
     if dose_1 is None or dose_2 is None:
         # Both labels must appear exactly once (previously a duplicated
         # label surfaced as a NameError below).
         raise ValueError("Missing '1ra Dosis' or '2da Dosis' counter.")
     return clean_count(dose_1), clean_count(dose_2)
示例#22
0
def read(source: str, last_update: str, num_pages_limit: int = 10):
    """Scrape paginated news until entries at or before `last_update` appear.

    Returns a post-processed DataFrame of records newer than the cutoff, or
    None when nothing new was found.
    """
    records = []
    for page_nr in range(1, num_pages_limit):
        page_soup = get_soup(f"{source}/{page_nr}/")
        page_records = parse_data(page_soup)
        if not page_records:
            continue
        records.extend(page_records)
        # Stop once a page contains an entry at or before the cutoff date.
        if any(record["date"] <= last_update for record in page_records):
            break
    fresh = [record for record in records if record["date"] >= last_update]
    if fresh:
        return postprocess(pd.DataFrame(fresh))
    return None
示例#23
0
def read(source: str) -> pd.Series:
    """Read the total-vaccinations counter and report date from `source`."""
    soup = get_soup(source)

    total_vaccinations = clean_count(
        soup.find(class_="stats-decoration-title").text)

    date_text = soup.find(class_="stats-decoration-text").text
    raw_date = re.search(r"\d+ \w+ 202\d", date_text).group(0)
    date = str(pd.to_datetime(raw_date).date())

    record = {
        "total_vaccinations": total_vaccinations,
        # Source publishes a single counter; people_vaccinated mirrors it
        # and fully vaccinated is reported as zero.
        "people_vaccinated": total_vaccinations,
        "people_fully_vaccinated": 0,
        "date": date,
    }
    return pd.Series(data=record)
示例#24
0
def read(source: str) -> pd.Series:
    """Extract dose counters from the info boxes on the page at `source`.

    Returns:
        pd.Series: total_vaccinations, people_vaccinated,
            people_fully_vaccinated and date (today in Asia/Dhaka).
    """
    soup = get_soup(source)

    # Query the info boxes once instead of re-running find_all per metric.
    boxes = soup.find_all(class_="info-box-number")
    people_vaccinated = clean_count(
        re.search(r"^[\d,]+", boxes[2].text).group(0))
    people_fully_vaccinated = clean_count(
        re.search(r"^[\d,]+", boxes[3].text).group(0))
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    # No date on the page; use today's date in the source's timezone.
    date = localdate("Asia/Dhaka")

    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
        })
示例#25
0
def read(source: str) -> pd.Series:
    """Scrape dose counters and last-update date from the dashboard."""
    soup = get_soup(source)

    counters = soup.find_all(class_="counter")
    partially_vaccinated = clean_count(counters[0].text)
    fully_vaccinated = clean_count(counters[1].text)
    total_doses = clean_count(counters[2].text)

    raw_date = soup.find("span", id="last-update").text
    date = clean_date(re.search(r"\d+.*202\d", raw_date).group(0), "%d %B, %Y")

    record = {
        "total_vaccinations": total_doses,
        # At least one dose = partially + fully vaccinated.
        "people_vaccinated": partially_vaccinated + fully_vaccinated,
        "people_fully_vaccinated": fully_vaccinated,
        "date": date,
        "source_url": source,
    }
    return pd.Series(data=record)
示例#26
0
def connect_parse_data(source: str) -> pd.Series:
    """Parse dose counters and the report date from the page at `source`.

    Returns:
        pd.Series: total_vaccinations, people_vaccinated,
            people_fully_vaccinated and date.
    """
    soup = get_soup(source)

    # Query the counters once instead of re-running find_all per metric.
    counters = soup.find_all(class_="repart-stlucia")
    people_vaccinated = clean_count(counters[0].text)
    people_fully_vaccinated = clean_count(counters[1].text)
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    date = soup.find(class_="h2-blue").text
    date = re.search(r"\w+ +\d+, +202\d", date).group(0)
    date = clean_date(date, "%B %d, %Y")

    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
    return pd.Series(data=data)
示例#27
0
def read(source: str) -> pd.Series:
    """Extract dose counters and the last-updated date from the info boxes.

    Returns:
        pd.Series: total_vaccinations, people_vaccinated,
            people_fully_vaccinated and date.
    """
    soup = get_soup(source)

    # Query the info boxes once instead of re-running find_all per metric.
    boxes = soup.find_all(class_="info-box-number")
    people_vaccinated = clean_count(
        re.search(r"^[\d,]+", boxes[2].text).group(0))
    people_fully_vaccinated = clean_count(
        re.search(r"^[\d,]+", boxes[3].text).group(0))
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    # Footer carries the "Last updated: ..." timestamp; strip the prefix.
    date = soup.find(class_="main_foot").find("span").text.replace(
        "Last updated: ", "")

    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
        })
示例#28
0
def parse_data(source: str) -> pd.Series:
    """Parse the cumulative-dose table from the page at `source`."""
    soup = get_soup(source)
    html_table = str(soup.find_all("table")[2])
    df = pd.read_html(html_table, header=0)[0]

    assert len(df) <= 6, "New rows in the vaccine table!"

    # Cumulative doses per vaccine, as an int array (one row per dose number).
    def doses(vaccine: str):
        return df.loc[df["백신"] == vaccine, "누적 접종(C)"].values.astype(int)

    astrazeneca = doses("아스트라제네카")
    pfizer = doses("화이자")
    johnson = doses("얀센2)")

    # Janssen is single-dose, so its one figure counts as both first dose
    # and full vaccination.
    data = {
        "people_vaccinated": astrazeneca[0] + pfizer[0] + johnson[0],
        "people_fully_vaccinated": astrazeneca[1] + pfizer[1] + johnson[0],
        "total_vaccinations": astrazeneca.sum() + pfizer.sum() + johnson[0],
        "source_url": source,
    }
    return pd.Series(data=data)
示例#29
0
def read(source: str) -> pd.Series:
    """Extract vaccination totals and report date from the labelled blocks.

    Returns:
        pd.Series: total_vaccinations, people_vaccinated,
            people_fully_vaccinated, date and source_url.

    Raises:
        ValueError: If any of the expected metric blocks is missing
            (previously this surfaced as a NameError on an unbound local).
    """
    soup = get_soup(source)

    total_vaccinations = people_vaccinated = people_fully_vaccinated = None
    date = None
    for block in soup.find(class_="main").find_all(class_="w3-center"):
        title = block.find("p").text

        if title == "ΣΥΝΟΛΟ ΕΜΒΟΛΙΑΣΜΩΝ":
            total_vaccinations = clean_count(block.find_all("p")[1].text)
            date_match = re.search(r"[\d/]{8,10}", block.find_all("p")[2].text)
            date = clean_date(date_match.group(0), "%d/%m/%Y")
        elif title == "ΣΥΝΟΛΟ 1ης ΔΟΣΗΣ":
            people_vaccinated = clean_count(block.find_all("p")[1].text)
        elif title == "ΣΥΝΟΛΟ 2ης ΔΟΣΗΣ":
            people_fully_vaccinated = clean_count(block.find_all("p")[1].text)

    if any(v is None for v in (
            total_vaccinations, people_vaccinated, people_fully_vaccinated,
            date)):
        raise ValueError("Source layout changed: expected metric blocks missing.")

    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
        "source_url": source,
    }
    return pd.Series(data=data)
示例#30
0
 def load_data(self) -> pd.DataFrame:
     """Load original data from the "Date" sheet of the linked workbook."""
     page_soup = utils.get_soup(self.source_url)
     file_url = self._parse_file_link(page_soup)
     df = utils.read_xlsx_from_url(file_url, sheet_name="Date")
     return df