Example #1
 def _get_text_from_url(self, url: str) -> str:
     """Extract text from the url."""
     soup = get_soup(url, verify=False)
     element = soup.find("div", class_="pageDescription")
     text = element.get_text(strip=True).replace(",", "")
     return text
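All of these snippets rely on a shared get_soup helper. As a reference point, here is a minimal sketch of what it is assumed to do (wrap requests and BeautifulSoup, forwarding kwargs such as verify); names and defaults here are assumptions, not the project's actual implementation:

import requests
from bs4 import BeautifulSoup


def get_soup(url: str, **request_kwargs) -> BeautifulSoup:
    """Fetch a page and return its parsed HTML (assumed behavior of the shared helper)."""
    response = requests.get(url, **request_kwargs)  # kwargs such as verify=False pass through
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")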
Example #2
    def read(self):
        soup = get_soup(self.source_url)

        count = self._parse_count(soup)
        date = self._parse_date(soup)

        return {"count": count, "date": date}
Example #3
 def find_article(self) -> str:
     soup = get_soup(self.feed_url)
     for link in soup.find_all("item"):
         for elem in link.children:
             if "local-covid-19-situation" in elem:
                 return elem
     raise ValueError("No matching article found! The feed may have changed its format.")
Example #4
def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing",
                        "automated_sheets", "Nigeria.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "http://covid19.ncdc.gov.ng/"

    soup = get_soup(source_url)

    element = soup.find("div", class_="col-xl-3").find("span")
    cumulative_total = clean_count(element.text)

    if cumulative_total > data["Cumulative total"].max():

        new = pd.DataFrame({
            "Date": [localdate("Africa/Lagos")],
            "Cumulative total": cumulative_total,
            "Country": "Nigeria",
            "Units": "samples tested",
            "Source URL": source_url,
            "Source label": "Nigeria Centre for Disease Control",
        })

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
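Example #4 also leans on two small utilities, clean_count and localdate. Plausible sketches, under the assumption that clean_count strips digit separators before casting and localdate formats today's date in a given timezone:

import re
from datetime import datetime
from zoneinfo import ZoneInfo


def clean_count(value: str) -> int:
    """Parse a count such as '1,234,567' or '1 234 567' into an int (assumed behavior)."""
    return int(re.sub(r"[^\d]", "", value))


def localdate(tz: str) -> str:
    """Today's date as YYYY-MM-DD in the given timezone (assumed behavior)."""
    return datetime.now(ZoneInfo(tz)).strftime("%Y-%m-%d")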
Example #5
def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing",
                        "automated_sheets", "Azerbaijan.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://koronavirusinfo.az/az/page/statistika/azerbaycanda-cari-veziyyet"

    soup = get_soup(source_url)

    element = soup.find_all("div",
                            class_="gray_little_statistic")[5].find("strong")
    cumulative_total = clean_count(element.text)

    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame({
            "Cumulative total": cumulative_total,
            "Date": [localdate("Asia/Baku")],
            "Country": "Azerbaijan",
            "Units": "tests performed",
            "Source URL": source_url,
            "Source label": "Cabinet of Ministers of Azerbaijan",
        })

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
Example #6
def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing",
                        "automated_sheets", "Kenya.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "http://covidkenya.org/"

    soup = get_soup(source_url)

    element = soup.find("div", class_="elementor-element-b36fad5").find(
        class_="elementor-text-editor")
    cumulative_total = clean_count(element.text)

    date_raw = soup.select(".elementor-element-75168b2 p")[0].text
    date = extract_clean_date(
        date_raw,
        regex=r"\[Updated on ([A-Za-z]+ \d+) \[\d\d:\d\d\]",
        date_format="%B %d",
        replace_year=2021)

    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame({
            "Cumulative total": cumulative_total,
            "Date": [date],
            "Country": "Kenya",
            "Units": "samples tested",
            "Source URL": source_url,
            "Source label": "Kenya Ministry of Health",
        })

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
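extract_clean_date, used above and in several later examples, pulls a date out of free text with a capture-group regex and normalizes it. A sketch of the assumed semantics, including the replace_year handling needed when the source omits the year:

import re
from datetime import datetime


def extract_clean_date(text: str, regex: str, date_format: str, replace_year: int = None) -> str:
    """Find a date in free text and normalize it to YYYY-MM-DD (assumed semantics)."""
    raw = re.search(regex, text).group(1)
    date = datetime.strptime(raw, date_format)
    if replace_year is not None:
        date = date.replace(year=replace_year)  # e.g. "%B %d" strings carry no year
    return date.strftime("%Y-%m-%d")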
Example #7
def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Lebanon.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://corona.ministryinfo.gov.lb/"

    soup = get_soup(source_url)

    element = soup.find("h1", class_="s-counter3")
    cumulative_total = clean_count(element.text)

    date_raw = soup.select(".last-update strong")[0].text
    date = extract_clean_date(date_raw, regex=r"([A-Za-z]+ \d+)", date_format="%b %d", replace_year=2021)

    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date],
                "Country": "Lebanon",
                "Units": "tests performed",
                "Source URL": source_url,
                "Source label": "Lebanon Ministry of Health",
            }
        )

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
Example #8
def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing",
                        "automated_sheets", "Tunisia.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://onmne.tn"

    soup = get_soup(source_url)

    cumulative_total = json.loads(
        soup.find("span", class_="vcex-milestone-time").attrs["data-options"]
    )["endVal"]

    # Parsing "%B" month names here assumes a French locale for this source.
    date = soup.select("p span")[0].text.replace("Chiffres clés mis à jour le ", "")
    date = pd.to_datetime(date, format="%d %B %Y").strftime("%Y-%m-%d")

    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame({
            "Cumulative total": cumulative_total,
            "Date": [Date],
            "Country": "Tunisia",
            "Units": "people tested",
            "Source URL": source_url,
            "Source label": "Tunisia Ministry of Health",
        })

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
Example #9
def read(source: str) -> pd.Series:
    soup = get_soup(source)

    text = soup.find("div", id="data").find("p").text

    date = re.search(r"На сегодня \(([\d\.]{8})\)", text).group(1)
    date = clean_date(date, "%d.%m.%y")

    people_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения[^)]*\) - привито хотя бы одним компонентом вакцины",
        text,
    ).group(1)
    people_vaccinated = clean_count(people_vaccinated)

    people_fully_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения,?[^)]*\) - полностью привито",
        text).group(1)
    people_fully_vaccinated = clean_count(people_fully_vaccinated)

    total_vaccinations = re.search(r"([\d\s]+) шт\. - всего прививок сделано",
                                   text).group(1)
    total_vaccinations = clean_count(total_vaccinations)

    total_boosters = re.search(r"([\d\s]+) чел\. - прошли ревакцинацию",
                               text).group(1)
    total_boosters = clean_count(total_boosters)

    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_boosters": total_boosters,
        "date": date,
    })
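clean_date, as used here, parses a date string with an explicit format and normalizes it; a minimal sketch under that assumption:

from datetime import datetime


def clean_date(raw: str, fmt: str) -> str:
    """Normalize a date string with a known format to YYYY-MM-DD (assumed behavior)."""
    return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")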
Example #10
 def _get_text_from_url(self, url: str) -> str:
     """Extract text from URL."""
     soup = get_soup(url)
     text = soup.get_text()
     text = re.sub(r"(\d)\.(\d)", r"\1\2", text)
     text = re.sub(r"\s+", " ", text)
     return text
Example #11
 def _get_text_and_date_from_url(self, url: str) -> tuple:
     """Extract text from the url."""
     soup = get_soup(url)
     date = self._parse_date(soup)
     text = soup.find(class_="news-detail").text.replace("\n", " ").replace("\xa0", "")
     text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)  # join digit groups split by spaces
     return text, date
Example #12
 def _parse_count(self, soup):
     # Read all tables on the page and pick the one with the expected columns
     tables = pd.read_html(str(soup))
     columns = {"Тип", "Общо", "Нови"}
     for table in tables:
         if not columns.difference(table.columns) and "RT PCR" in table["Тип"].tolist():
             return table.loc[table["Тип"] == "Общо", "Общо"].item()
     raise ValueError("Table not found! It may have changed its format.")
Example #13
 def _get_relevant_table(self, url: str) -> str:
     """Get the table with the relevant data."""
     soup = get_soup(url)
     tables = soup.find_all("table")
     table = [
         table for table in tables
         if table.findChild("caption").text == "Tests COVID-19"
     ][0]
     return str(table)
Example #14
 def _get_records(self, url: str) -> list:
     soup = get_soup(url)
     elem = soup.find(id="newsContent")
     elems = elem.find_all("table")
     records = [{
         "Date": self._parse_date(elem),
         "Cumulative total": self._parse_metric(elem),
     } for elem in elems]
     return records
Example #15
 def _load_data(self, data_id):
     """Load data from source"""
     url = f"{self.source_url}{data_id}"
     soup = get_soup(url)
     match = re.search(self.regex["element"], str(soup))
     if not match:
         raise ValueError(
             "Website Structure Changed, please update the script")
     data = json.loads(match.group(1))
     return data
Example #16
 def _parse_data(self):
     soup = get_soup(self.source_url)
     date_raw = soup.select("span+ span")[0].text
     return {
         "count": clean_count(soup.select(".bg-success:nth-child(1) .info-box-number")[0].text),
         "date": clean_date(date_raw, "%d/%m/%Y"),
     }
Example #17
 def _parse_data(self):
     soup = get_soup(self.source_url)
     date_raw = soup.select_one("#lastupdated ul li").text
     return {
         "count": clean_count(soup.select_one("#renderbody table th span").text),
         "date": extract_clean_date(date_raw, regex=r"(\d+/\d+/20\d+).*", date_format="%d/%m/%Y"),
     }
Example #18
    def read(self) -> pd.Series:
        data = []

        for cnt in range(1, self._num_max_pages + 1):
            url = f"{self._base_url}{self._url_subdirectory}{cnt}"
            soup = get_soup(url)
            data, proceed = self._parse_data(soup)
            if not proceed:
                break

        return pd.Series(data)
Example #19
def read(source: str) -> pd.Series:
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    # Pass the headers through; get_soup is assumed to forward request kwargs
    # (as with verify=False in Example #1).
    soup = get_soup(source, headers=headers)
    return parse_data(soup)
Example #20
    def read(self) -> pd.Series:
        """Read data from source."""
        data = []

        for cnt in range(1, self._num_max_pages + 1):
            url = f"{self.source_url}{cnt}/"
            soup = get_soup(url)
            for _ in range(self._num_rows_per_page):
                data, proceed = self._parse_data(soup)
                if not proceed:
                    return pd.Series(data)
        return pd.Series(data)
Example #21
def main():
    soup = get_soup(METADATA["source_url_ref"])
    records = json.loads(soup.find("cv-stats-virus")[":charts-data"])

    df = (
        pd.DataFrame.from_records(records, columns=["date", "hospitalized"])
        .rename(columns={"hospitalized": "value"})
        .assign(entity=METADATA["entity"], indicator="Weekly new hospital admissions")
    )
    df["date"] = clean_date_series(df.date, "%d.%m.%Y")
    df = df[df.value > 0].sort_values("date")
    df["value"] = df.value.rolling(7).sum()
    df = df.dropna(subset=["value"])

    return df, METADATA
Example #22
 def _parse_data(self, soup: BeautifulSoup) -> dict:
     """Get data from the source page."""
     # Get relevant link
     url = self._get_relevant_link(soup)
     # Extract text from url
     text = self._get_text_from_url(url)
     # Extract date from text
     soup = get_soup(url)
     date = self._parse_date_from_text(soup)
     # Extract metrics from text
     count = self._parse_metrics(text)
     record = {
         "source_url": url,
         "date": date,
         "count": count,
     }
     return record
Example #23
    def read(self) -> pd.DataFrame:
        """Read data from source"""
        body = str(get_soup(self.source_url))

        # Get count
        count = 0
        if "Totaal Testen" in body:
            count = int(body.split("Totaal Testen")[0].split('data-counter-value="')[-1].split('"')[0])
        # Get negative results
        negative = 0
        if "Totaal negatieve" in body:
            negative = int(body.split("Totaal negatieve")[0].split('data-counter-value="')[-1].split('"')[0])

        df = pd.DataFrame(
            {
                "Date": [localdate("America/Paramaribo")],
                "Daily change in cumulative total": [count],
                "positive": [count - negative],
            }
        )
        return df
Example #24
 def _load_data(self, data_id: str) -> pd.DataFrame:
     """Load data from source"""
     url = f"{self.source_url}{data_id}"
     soup = get_soup(url)
     match = re.search(self.regex["element"], str(soup))
     if not match:
         raise ValueError(
             "Website Structure Changed, please update the script")
     data = json.loads(match.group(1))
     data = data["elements"]["content"]["content"]["entities"]
     data = [
         data[idx] for idx in data
         if re.search(self.regex["title"], str(data[idx].values()))
     ][0]
     data_list = data["props"]["chartData"]["data"]
     frames = []
     for frame in data_list:
         col = frame.pop(0)
         col[0] = "Date"
         frames.append(pd.DataFrame(frame, columns=col))
     # Build the table in one pass (DataFrame.append was removed in pandas 2.0).
     df = pd.concat(frames, ignore_index=True)
     return df
Example #25
 def _get_data_id_from_source(self, source_url: str) -> str:
     """Get Data ID from source"""
     soup = get_soup(source_url)
     data_id = soup.find(class_="infogram-embed")["data-id"]
     return data_id
Example #26
 def read(self) -> pd.Series:
     """Read data from source."""
     soup = get_soup(self.source_url)
     data = self._parse_data(soup)
     return pd.Series(data)
Example #27
def read(source: str) -> pd.Series:
    soup = get_soup(source)
    return parse_data(soup)
Example #28
def main(paths):
    url = "https://e.infogram.com/c3bc3569-c86d-48a7-9d4c-377928f102bf"
    soup = get_soup(url)

    for script in soup.find_all("script"):
        if "infographicData" in str(script):
            json_data = str(script).replace("<script>window.infographicData=", "").replace(";</script>", "")
            json_data = json.loads(json_data)
            break
    else:
        raise ValueError("infographicData script not found! The page may have changed.")

    metric_entities = {
        "total_vaccinations": "7287c058-7921-4abc-a667-ce298827c969",
        "people_vaccinated": "8d14f33a-d482-4176-af55-71209314b07b",
        "people_fully_vaccinated": "16a69e30-01fd-4806-920c-436f8f29e9bf",
        "total_boosters": "209af2de-9927-4c51-a704-ddc85e28bab9",
    }
    data = {}

    for metric, entity in metric_entities.items():
        entities = json_data["elements"]["content"]["content"]["entities"]
        value = entities[entity]["props"]["chartData"]["data"][0][0][0]
        value = re.search(r'18px;">([\d\.]+)', value).group(1)
        value = clean_count(value)
        data[metric] = value

    date = json_data["updatedAt"][:10]

    increment(
        paths=paths,
        location="Iceland",
        total_vaccinations=data["total_vaccinations"],
        people_vaccinated=data["people_vaccinated"],
        people_fully_vaccinated=data["people_fully_vaccinated"],
        total_boosters=data["total_boosters"],
        date=date,
        source_url="https://www.covid.is/tolulegar-upplysingar-boluefni",
        vaccine=", ".join(sorted(VACCINE_MAPPING.values())),
    )

    # By manufacturer
    entities = json_data["elements"]["content"]["content"]["entities"]
    data = entities["e329559c-c3cc-48e9-8b7b-1a5f87ea7ad3"]["props"]["chartData"]["data"][0]
    df = pd.DataFrame(data[1:]).reset_index(drop=True)
    df.columns = ["date"] + data[0][1:]

    df = df.melt("date", var_name="vaccine", value_name="total_vaccinations")

    df["date"] = pd.to_datetime(df["date"], format="%d.%m.%y").astype(str)
    df["total_vaccinations"] = pd.to_numeric(df["total_vaccinations"],
                                             errors="coerce").fillna(0)
    df["total_vaccinations"] = df.sort_values("date").groupby(
        "vaccine", as_index=False)["total_vaccinations"].cumsum()
    df["location"] = "Iceland"

    assert set(df["vaccine"].unique()) == set(
        VACCINE_MAPPING.keys()
    ), f"Vaccines present in data: {df['vaccine'].unique()}"
    df = df.replace(VACCINE_MAPPING)

    df.to_csv(paths.tmp_vax_out_man("Iceland"), index=False)
    export_metadata(df, "Ministry of Health", url, paths.tmp_vax_metadata_man)
Example #29
 def _parse_date(self):
     soup = get_soup(self.source_url)
     return extract_clean_date(
         soup.text, r"Reporte (?:[Vv]acunación|COVID-19) (\d\d-\d\d-20\d\d)", "%d-%m-%Y"
     )
Example #30
 def read(self) -> pd.Series:
     soup = get_soup(self.source_url)
     return self._parse_data(soup)