def connect_parse_data(source: str) -> pd.Series:
    """Scrape Jamaica's six dashboard counters and return headline metrics.

    Args:
        source: URL of the dashboard page.

    Returns:
        pd.Series with total_vaccinations, people_vaccinated,
        people_fully_vaccinated, total_boosters and date.

    Raises:
        ValueError: if the number of counters on the page changes.
    """
    soup = get_soup(source)
    counters = soup.find_all(class_="elementor-counter-number")
    # Explicit raise instead of `assert` so the check survives `python -O`,
    # matching the error style used by the other scrapers in this file.
    if len(counters) != 6:
        raise ValueError("New counter in dashboard?")
    total_vaccinations = clean_count(counters[0]["data-to-value"])
    first_doses = clean_count(counters[1]["data-to-value"])
    second_doses = clean_count(counters[2]["data-to-value"])
    unique_doses = clean_count(counters[3]["data-to-value"])
    booster_shots = clean_count(counters[4]["data-to-value"])
    immunocompromised_doses = clean_count(counters[5]["data-to-value"])
    # Single-dose ("unique") vaccines count towards both initiated and completed.
    people_vaccinated = first_doses + unique_doses
    people_fully_vaccinated = second_doses + unique_doses
    total_boosters = booster_shots + immunocompromised_doses
    date = localdate("America/Jamaica")
    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            "date": date,
        }
    )
def read(source: str):
    """Locate the metrics PDF linked from the landing page and parse it."""
    soup = get_soup(source)
    url = parse_pdf_link(soup, source)
    # Guard against the site swapping the PDF for some other document type.
    if not url.endswith(".pdf"):
        raise ValueError(f"File reporting metrics is not a PDF: {url}!")
    return pd.Series(parse_data(url))
def read(source: str) -> pd.Series:
    """Parse cumulative vaccination metrics from the Greek-language dashboard."""
    soup = get_soup(source)
    for block in soup.find(class_="main").find_all(class_="w3-center"):
        paragraphs = block.find_all("p")
        title = paragraphs[0].text
        if title == "ΣΥΝΟΛΟ ΕΜΒΟΛΙΑΣΜΩΝ":
            total_vaccinations = clean_count(paragraphs[1].text)
            date_match = re.search(r"[\d/]{8,10}", paragraphs[2].text)
            date = clean_date(date_match.group(0), "%d/%m/%Y")
        elif title == "ΣΥΝΟΛΟ 1ης ΔΟΣΗΣ":
            people_vaccinated = clean_count(paragraphs[1].text)
        elif title == "ΣΥΝΟΛΟ 2ης ΔΟΣΗΣ":
            people_fully_vaccinated = clean_count(paragraphs[1].text)
        elif title == "ΣΥΝΟΛΟ 3ης ΔΟΣΗΣ":
            total_boosters = clean_count(paragraphs[1].text)
    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            "date": date,
            "source_url": source,
        }
    )
def read(self, last_update: str) -> pd.DataFrame:
    """Fetch the newest monthly report page and parse entries newer than `last_update`."""
    # Build and fetch the URL for the current month's report.
    month_url = self.base_url_template.format(self._current_month)
    month_soup = get_soup(month_url)
    # Extract the relevant links/records from the page.
    return self._parse_data(month_soup, last_update)
def _parse_single_doses(self):
    """Return the count of completed single-shot (Johnson & Johnson) vaccinations."""
    url = "http://103.247.238.92/webportal/pages/covid19-vaccination-johnson.php"
    raw = self._parse_metrics_raw(get_soup(url, timeout=30), raise_err=False)
    # Sanity check: a one-dose vaccine must not report a separate first dose.
    if raw["people_vaccinated"] != 0:
        raise ValueError("First dose for one dose vaccines should be 0!")
    return raw["people_fully_vaccinated"]
def pipe_total_vax_bfill(self, df: pd.DataFrame, n_days: int) -> pd.DataFrame:
    """Backfill total_vaccinations for the most recent `n_days` from daily zip reports."""
    page = get_soup(self.source_url_ref)
    recent_links = self._get_zip_links(page)[:n_days]
    return self._backfill_total_vaccinations(df, recent_links)
def parse_vaccinations(elem) -> dict:
    """Extract vaccination metrics from the news article linked by `elem`.

    Args:
        elem: soup element inside a news card; the parent card holds the article link.

    Returns:
        dict with whichever of total_vaccinations / people_vaccinated /
        people_fully_vaccinated appear in the article text.
    """
    # Get news text
    url = elem.find_parent(class_="card").find("a").get("href")
    soup = get_soup(url, verify=False)
    text = "\n".join([p.text for p in soup.find("article").find_all("p")])
    # Find metrics. The articles' wording changes over time, so each pattern may
    # legitimately fail to match; missing metrics are simply omitted.
    # (Dead commented-out regex variants from earlier wordings removed.)
    metrics = dict()
    total_vaccinations = re.search(r"疫苗劑數為(?P<count>[\d,]*)劑", text)
    people_vaccinated = re.search(r"已接種人數共有(?P<count>[\d,]*)人", text)
    people_fully_vaccinated = re.search(r"已接種第2劑的?有([\d,]{6,})", text)
    if total_vaccinations:
        metrics["total_vaccinations"] = clean_count(total_vaccinations.group(1))
    if people_vaccinated:
        metrics["people_vaccinated"] = clean_count(people_vaccinated.group(1))
    if people_fully_vaccinated:
        metrics["people_fully_vaccinated"] = clean_count(people_fully_vaccinated.group(1))
    return metrics
def read(self) -> pd.Series:
    """Scrape the source page and return its metrics together with the report date."""
    page = get_soup(self.source_url)
    record = {**self.parse_metrics(page)}
    record["date"] = self.parse_date(page)
    return pd.Series(record)
def read(self):
    """Walk paginated news pages until the parser signals there is nothing further."""
    page_nr = 1
    while page_nr <= self._num_max_pages:
        soup = get_soup(f"{self.source_url}/{page_nr}/", verify=False)
        data, proceed = self.parse_data(soup)
        if not proceed:
            # Parser found what it needed (or an older entry): stop paging.
            break
        page_nr += 1
    return pd.Series(data)
def _parse_data(self) -> pd.Series:
    """Fetch the dashboard and assemble metrics plus report date into a series."""
    page = get_soup(self.source_url)
    record = {**self._parse_metrics(page)}
    record["date"] = self._parse_date(page)
    return pd.Series(data=record)
def read(self) -> pd.Series:
    """Follow the embedded Infogram dashboard and extract all headline metrics."""
    landing = get_soup(self.source_url)
    infogram_url = self.parse_infogram_link(landing)
    infogram_soup = get_soup(infogram_url)
    data = self.parse_infogram_data(infogram_soup)
    record = {
        "date": self.parse_infogram_date(data),
        "source_url": self.source_url,
        "total_vaccinations": self._parse_total_vaccinations(data),
        "people_vaccinated": self._parse_people_vaccinated(data),
        "people_fully_vaccinated": self._parse_people_fully_vaccinated(data),
        "total_boosters": self._parse_boosters(data),
    }
    return pd.Series(record)
def read(self, last_updated: str) -> pd.DataFrame:
    """Collect metrics from news entries published after `last_updated`."""
    soup = get_soup(self.source_url)
    records = []
    for news_info in self._parse_news_info(soup):
        # Stop at the first entry dated before the cutoff.
        if news_info["date"] < last_updated:
            break
        records.append(self._parse_metrics(news_info))
    return pd.DataFrame(records)
def _parse_pdf_link(self, soup) -> str:
    """Locate the vaccination-statistics PDF and resolve its viewer link.

    Args:
        soup: parsed landing page containing the download list.

    Returns:
        Absolute URL of the PDF viewer resource.

    Raises:
        ValueError: if the expected download link or the viewer button is missing.
    """
    pdf_anchor = None
    for a in soup.find(class_="download").find_all("a"):
        if "疫苗接種統計資料" in a["title"]:
            pdf_anchor = a
            break
    if pdf_anchor is None:
        # Previously, when no title matched, the loop variable silently leaked and
        # the *last* link on the page was used. Fail loudly instead.
        raise ValueError("Vaccination statistics PDF link not found on page")
    url_pdf = f"{self.source_url}{pdf_anchor['href']}"
    # The viewer page renders asynchronously; retry a bounded number of times.
    for _ in range(10):
        soup = get_soup(url_pdf)
        button = soup.find(class_="viewer-button")
        if button is not None:
            return f"{self.source_url}{button['href']}"
    # Previously a missing button produced an opaque TypeError on `None['href']`.
    raise ValueError("Viewer button not found after 10 attempts")
def _parse_metrics(self, news_info: dict):
    """Extract the three headline dose counts from a single news article."""
    soup = get_soup(news_info["link"])
    text = clean_string(soup.text)
    total, first, second = re.search(self.regex["metrics"], text).group(1, 2, 3)
    return {
        "total_vaccinations": clean_count(total),
        "people_vaccinated": clean_count(first),
        "people_fully_vaccinated": clean_count(second),
        "source_url": news_info["link"],
        "date": news_info["date"],
    }
def vaccines_approved(self, location: str = None, original_names: bool = False) -> list:
    """Get list of approved vaccines in a country (or all if None specified).

    Args:
        location (str, optional): Country name. If None, retrieves all approved
            vaccines. Defaults to None.
        original_names (bool, optional): Set to True to keep vaccine names as
            published on the web. Defaults to False.

    Returns:
        list: Approved vaccines, or None if the location lookup/parse fails.
    """
    if location:
        try:
            country_soup = get_soup(self.get_country_url(location))
            return self._parse_vaccines_location(country_soup, original_names)
        except ValueError:
            # Unknown location (or unparsable page): signal "no data".
            return None
    all_soup = get_soup(self.all_vaccines_url)
    return self._parse_vaccines_all(all_soup, original_names)
def parse_data(self, soup: BeautifulSoup) -> tuple:
    """Parse the newest news element; return (record, stop_flag)."""
    elem = self.get_last_element(soup)
    if elem is None:
        # Nothing left to parse: tell the caller to stop paging.
        return None, True
    news_soup = get_soup(elem["link"], verify=False)
    record = {
        "source_url": elem["link"],
        "date": elem["date"],
        **self.parse_data_news_page(news_soup),
    }
    return record, False
def read(self, last_update: str) -> pd.DataFrame:
    """Iterate news pages, collecting records until `last_update` is reached."""
    records = []
    for cnt in range(self._num_max_pages):
        url = f"{self.source_url}/hirek?page={cnt}/"
        page = get_soup(url)
        page_records, proceed = self.parse_data(page, last_update)
        records.extend(page_records)
        if not proceed:
            break
    return pd.DataFrame(records)
def read(self) -> pd.Series:
    """Scrape metrics and vaccine names; stamp with today's date in Dhaka."""
    soup = get_soup(self.source_url, timeout=30)
    metrics = self._parse_metrics(soup)
    vaccines = self._parse_vaccines(soup)
    record = {**metrics, "date": localdate("Asia/Dhaka"), "vaccine": vaccines}
    return pd.Series(data=record)
def connect_parse_data(source: str) -> pd.Series:
    """Scrape the headline counter; total doses are set equal to people vaccinated."""
    soup = get_soup(source)
    people_vaccinated = clean_count(soup.find(class_="count-up").text)
    return pd.Series(
        data={
            "total_vaccinations": people_vaccinated,
            "people_vaccinated": people_vaccinated,
        }
    )
def read(self) -> pd.Series:
    """Collect the four headline metrics plus date and source URL."""
    soup = get_soup(self.source_url)
    total, first, second, boosters = self._parse_metrics(soup)
    return pd.Series({
        "total_vaccinations": total,
        "people_vaccinated": first,
        "people_fully_vaccinated": second,
        "total_boosters": boosters,
        "source_url": self.source_url,
        "date": self._parse_date(soup),
    })
def parse_data(self, soup: BeautifulSoup, last_update: str) -> tuple:
    """Parse news elements newer than `last_update`.

    Returns:
        (records, proceed): `proceed` is False once an entry at or before the
        cutoff is reached, so the caller can stop paginating.
    """
    records = []
    for elem in self.get_elements(soup):
        # Guard clause: reached already-processed news, stop here.
        if elem["date"] <= last_update:
            return records, False
        news_soup = get_soup(elem["link"])
        records.append({
            "source_url": elem["link"],
            **self.parse_data_news_page(news_soup),
        })
    return records, True
def read(self) -> pd.DataFrame:
    """Read the historical xlsx series; cache the latest HTML-table figures on self.latest."""
    soup = get_soup(self.source_url)
    # Get latest figures from HTML table
    table = pd.read_html(str(soup.find("table")))[0]
    date_raw = re.search(
        r"Data in this section is as at 11:59pm ([\d]+ [A-Za-z]+ 202\d)", soup.text
    ).group(1)

    def _cumulative(label):
        # "Cumulative total" cell for the row whose first column equals `label`.
        return table.loc[table["Unnamed: 0"] == label, "Cumulative total"].item()

    self.latest = pd.DataFrame(
        {
            "total_vaccinations": _cumulative("Total doses"),
            "people_vaccinated": _cumulative("First dose"),
            "people_fully_vaccinated": _cumulative("Second dose"),
            "date": [clean_date(date_raw, fmt="%d %B %Y", lang="en")],
        }
    )
    # The full time series comes from the downloadable workbook.
    link = self._parse_file_link(soup)
    return read_xlsx_from_url(link, sheet_name="Date")
def main(paths):
    """Build the Netherlands vaccination time series from the coronadashboard payload."""
    soup = get_soup(URL)
    # The dashboard is a Next.js app: all data sits in the __NEXT_DATA__ JSON blob.
    script = soup.find("script", id="__NEXT_DATA__")
    data = json.loads(script.string)
    # Daily administered doses ("estimated" series), keyed by unix timestamp.
    doses = (pd.DataFrame.from_records(
        data["props"]["pageProps"]["selectedNlData"]
        ["vaccine_administered_total"]["values"]).rename(
            columns={
                "date_unix": "date",
                "estimated": "total_vaccinations"
            }).drop(columns=["reported", "date_of_insertion_unix"]))
    doses["date"] = pd.to_datetime(doses.date, unit="s").dt.date.astype(str)
    # Coverage series: people partially-or-fully and fully vaccinated.
    coverage = (pd.DataFrame.from_records(
        data["props"]["pageProps"]["selectedNlData"]["vaccine_coverage"]
        ["values"]).rename(
            columns={
                "date_end_unix": "date",
                "fully_vaccinated": "people_fully_vaccinated",
                "partially_or_fully_vaccinated": "people_vaccinated",
            }).drop(columns=[
                "date_of_insertion_unix", "partially_vaccinated", "date_start_unix"
            ]))
    coverage["date"] = pd.to_datetime(coverage.date, unit="s").dt.date.astype(str)
    # Outer-join the two series on date so gaps in either remain visible.
    df = (pd.merge(
        doses, coverage, on="date", how="outer",
        validate="one_to_one").sort_values("date").assign(
            location="Netherlands",
            source_url=
            "https://coronadashboard.government.nl/landelijk/vaccinaties",
        ).pipe(enrich_vaccine_name))
    # Drop rows where doses would be fewer than people vaccinated (inconsistent
    # data); rows with either side missing are kept.
    df = df[(df.total_vaccinations >= df.people_vaccinated) |
            (df.total_vaccinations.isna()) | (df.people_vaccinated.isna())]
    df.to_csv(paths.tmp_vax_out("Netherlands"), index=False)
def read(source: str) -> pd.Series:
    """Scrape dose counts from the info boxes; total is the sum of both doses."""
    soup = get_soup(source)
    boxes = soup.find_all(class_="info-box-number")
    # The leading number in each box text is the cumulative count.
    dose1 = clean_count(re.search(r"^[\d,]+", boxes[2].text).group(0))
    dose2 = clean_count(re.search(r"^[\d,]+", boxes[3].text).group(0))
    return pd.Series(
        data={
            "total_vaccinations": dose1 + dose2,
            "people_vaccinated": dose1,
            "people_fully_vaccinated": dose2,
            "date": localdate("Asia/Dhaka"),
        }
    )
def main() -> tuple:
    """Build the Spain hospital/ICU admissions dataset.

    Returns:
        (df, METADATA): long-format DataFrame with date/indicator/value/entity
        columns plus the module METADATA dict. (The previous annotation said
        pd.DataFrame, but a 2-tuple has always been returned.)
    """
    soup = get_soup(METADATA["source_url_ref"])
    url = soup.find(class_="informacion").find("a")["href"]
    url = "https://www.mscbs.gob.es/profesionales/saludPublica/ccayes/alertasActual/nCov/" + url
    df = pd.read_csv(
        url,
        usecols=[
            "Fecha", "Unidad", "OCUPADAS_COVID19", "INGRESOS_COVID19", "Provincia", "CCAA"
        ],
        encoding="Latin-1",
        sep=";",
    )
    df["Fecha"] = clean_date_series(df.Fecha, "%d/%m/%Y")
    # One row per (date, unit, province, region); drop duplicates and rows without a unit.
    df = df.drop_duplicates(subset=["Fecha", "Unidad", "Provincia", "CCAA"],
                            keep="first").dropna(subset=["Unidad"])
    df.loc[df.Unidad.str.contains("U. Críticas"), "Unidad"] = "ICU"
    # Aggregate provinces/regions, then pivot units into columns.
    df = (df.drop(columns=["Provincia", "CCAA"]).groupby(
        ["Fecha", "Unidad"], as_index=False).sum().sort_values("Unidad").pivot(
            index="Fecha", columns="Unidad").reset_index().sort_values("Fecha"))
    df.columns = ["date", "hosp_stock", "icu_stock", "hosp_flow", "icu_flow"]
    # Admissions (flows) are daily; convert to rolling weekly sums.
    df["hosp_flow"] = df.hosp_flow.rolling(7).sum()
    df["icu_flow"] = df.icu_flow.rolling(7).sum()
    df = df.melt("date", var_name="indicator").dropna(subset=["value"])
    df["indicator"] = df.indicator.replace({
        "hosp_flow": "Weekly new hospital admissions",
        "icu_flow": "Weekly new ICU admissions",
        "hosp_stock": "Daily hospital occupancy",
        "icu_stock": "Daily ICU occupancy",
    })
    df["entity"] = METADATA["entity"]
    return df, METADATA
def read(source: str, last_update: str, num_pages_limit: int = 10):
    """Scan paginated news listings and post-process records newer than `last_update`."""
    records = []
    for page_nr in range(1, num_pages_limit):
        # Fetch one listing page.
        url = f"{source}/{page_nr}/"
        soup = get_soup(url, verify=False)
        page_records = parse_data(soup)
        if page_records:
            records.extend(page_records)
            # Stop once any record at or before the cutoff shows up.
            if any(record["date"] <= last_update for record in page_records):
                break
    if pd.Series([r.get("total_vaccinations") for r in records]).notnull().any():
        records = [record for record in records if record["date"] >= last_update]
        if records:
            return postprocess(pd.DataFrame(records))
    return None
def read() -> tuple:
    """Download the Danish regional zip and load hospitalisation flow/stock CSVs.

    Returns:
        (flow, stock): daily new admissions and daily occupancy DataFrames.
        (The previous annotation said pd.DataFrame, but two frames have always
        been returned.)
    """
    soup = get_soup(METADATA["source_url_hosp"])
    zip_url = soup.find("accordions").find("a").get("href")
    with tempfile.TemporaryDirectory() as tf:
        r = requests.get(zip_url)
        # Fail fast on HTTP errors instead of surfacing a confusing BadZipFile later.
        r.raise_for_status()
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(tf)
        flow = pd.read_csv(
            os.path.join(tf, "Regionalt_DB", "06_nye_indlaeggelser_pr_region_pr_dag.csv"),
            encoding="ISO 8859-1",
            sep=";",
            usecols=["Dato", "Indlæggelser"],
        )
        stock = pd.read_csv(
            os.path.join(tf, "Regionalt_DB", "15_indlagte_pr_region_pr_dag.csv"),
            encoding="ISO 8859-1",
            sep=";",
            usecols=["Dato", "Indlagte"],
        )
    return flow, stock
def connect_parse_data(source: str) -> pd.Series:
    """Scrape per-vaccine dose counters (AstraZeneca and Pfizer) and derive totals.

    Raises:
        ValueError: if second-dose counts exceed first-dose counts (scrape error).
    """
    soup = get_soup(source)
    counters = soup.find_all(class_="yellow")
    az_dose1 = clean_count(counters[0].text)
    az_dose2 = clean_count(counters[1].text)
    pfizer_dose1 = clean_count(counters[2].text)
    pfizer_dose2 = clean_count(counters[3].text)
    # Explicit raises instead of `assert` so the sanity checks survive `python -O`.
    if az_dose1 < az_dose2:
        raise ValueError("AstraZeneca second doses exceed first doses")
    if pfizer_dose1 < pfizer_dose2:
        raise ValueError("Pfizer second doses exceed first doses")
    people_vaccinated = az_dose1 + pfizer_dose1
    people_fully_vaccinated = az_dose2 + pfizer_dose2
    total_vaccinations = people_vaccinated + people_fully_vaccinated
    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": localdate("America/St_Lucia"),
    }
    return pd.Series(data=data)
def connect_parse_data(source: str) -> pd.Series:
    """Parse the vaccination-count table from the Japanese dashboard.

    Raises:
        ValueError: if the table layout changes or totals do not add up.
    """
    soup = get_soup(source)
    df = pd.read_html(str(soup.find(class_="vaccination-count")))[0]
    # Explicit raises instead of `assert` so the checks survive `python -O`.
    if df.shape != (3, 7):
        raise ValueError(f"Unexpected vaccination-count table shape: {df.shape}")
    values = df.iloc[:, 2].values
    total_vaccinations = values[0]
    people_vaccinated = values[1]
    people_fully_vaccinated = values[2]
    if total_vaccinations != people_vaccinated + people_fully_vaccinated:
        raise ValueError("Total doses do not equal dose1 + dose2")
    # NOTE: the original code read the page's date text (class "aly_tx_center")
    # and then immediately overwrote it — a dead store, removed here. The local
    # date in Tokyo is what is actually reported.
    date = localdate("Asia/Tokyo")
    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
    return pd.Series(data=data)
def read(source: str) -> pd.Series:
    """Fetch the source page and delegate parsing of its metrics."""
    return parse_data(get_soup(source))