示例#1
0
import scrape_common as sc

# Jura (JU) COVID-19 test-data scraper: locate the weekly report PDF
# linked from the cantonal page and extract week, year, test counts and
# the positivity rate from its first two pages.
# NOTE(review): `BeautifulSoup` and `re` are used below but not imported
# in this view — presumably imported further up the file.
base_url = 'https://www.jura.ch'
url = f'{base_url}/fr/Autorites/Coronavirus/Chiffres-H-JU/Evolution-des-cas-COVID-19-dans-le-Jura.html'
d = sc.download(url, silent=True)
# NOTE(review): both replace() arguments render as a plain space; the first
# is presumably a non-breaking space being normalized — confirm encoding.
d = d.replace(' ', ' ')
soup = BeautifulSoup(d, 'html.parser')

# The report link is identified by a title attribute mentioning "PDF".
pdf_url = soup.find('a', title=re.compile(r'.*PDF.*')).get('href')
if not pdf_url.startswith('http'):
    # Relative link: prefix with the site base URL.
    pdf_url = f'{base_url}{pdf_url}'
pdf_url = pdf_url.replace('?download=1', '')

pdf = sc.download_content(pdf_url, silent=True)

td = sc.TestData(canton='JU', url=pdf_url)

# Page 1 carries the epidemiological week number and the year.
content = sc.pdftotext(pdf, page=1)
td.week = sc.find(r'Situation semaine épidémiologique (\d+)', content)
td.year = sc.find(r'Du \d+.* (\d{4})', content)

# Page 2 carries the total test count and the positivity rate.
content = sc.pdftotext(pdf, page=2)
td.total_tests = sc.find(r'Nombre de tests\d?\s+(\d+)', content)
res = re.match(
    r'.*Nombre de tests positifs .*\s+(\d+)\s+\((\d+\.?\d?)%\s?\d?\)', content,
    re.DOTALL | re.MULTILINE)
assert res, 'failed to find number of positive tests and positivity rate'
td.positive_tests = res[1]
td.positivity_rate = res[2]

print(td)
示例#2
0
# Extract the weekly chart series embedded in the page's JavaScript.
# NOTE(review): `content`, `url`, `re` and `sc` are defined/imported
# outside this view.


def _extract_series(label, pattern):
    """Match `pattern` against the page content and return the captured
    comma-separated chart data as a list of strings."""
    res = re.search(pattern, content)
    assert res, f'failed to extract {label}, got {res}'
    return res[1].split(',')


weeks = _extract_series('weeks', r".*categories: \[('KW.*)\],")
negative_tests = _extract_series(
    'negative tests',
    r".*name: 'Anzahl negativer Tests.?',\s+color: '.*',\s+data: \[(.*)\],")
positive_tests = _extract_series(
    'positive tests',
    r".*name: 'Anzahl positiver Tests.?',\s+color: '.*',\s+data: \[(.*)\],")
# Fix: error message previously misspelled "positivtiy".
positivity_rate = _extract_series(
    'positivity rate',
    r".*name: 'Positivitätsrate',\s+color: '.*',\s+data: \[(.*)\],")

# All four series must align week-for-week.
assert len(weeks) == len(negative_tests) == len(positive_tests) == len(positivity_rate), f'Expected same length for weeks {len(weeks)}, neg. tests {len(negative_tests)}, pos. tests {len(positive_tests)}, pos. rate {len(positivity_rate)}'

# The chart starts in 2020; the first week 1 marks the rollover into 2021.
year = '2020'
for week, neg, pos, rate in zip(weeks, negative_tests, positive_tests, positivity_rate):
    td = sc.TestData(canton='TG', url=url)
    td.week = sc.find(r'KW (\d+)', week)
    if int(td.week) == 1:
        year = '2021'
    td.year = year
    td.positive_tests = int(pos)
    td.negative_tests = int(neg)
    td.positivity_rate = float(rate)
    print(td)
示例#3
0
def create_bs_test_data(date):
    """Build a BL TestData record whose start and end date are both `date`.

    NOTE(review): the function name says "bs" but the canton is 'BL' —
    confirm which is intended.
    """
    td = sc.TestData(canton='BL', url=main_url)
    td.start_date = td.end_date = date
    return td
示例#4
0
    return None


# Aargau (AG): weekly lab-test numbers from the cantonal XLS.
xls_url = sac.get_ag_xls_url()
xls = sc.xlsdownload(xls_url, silent=True)

rows = sc.parse_xls(xls,
                    sheet_name='1.4 Labortests',
                    header_row=1,
                    enable_float=True)
# The sheet starts in 2020; the first week 1 marks the rollover into 2021.
year = '2020'
for row in rows:
    count = row['Anzahl Tests']
    # Skip empty rows; a repeated header marks the end of the data table.
    if not count:
        continue
    if count == 'Anzahl Tests':
        break

    td = sc.TestData(canton='AG', url=xls_url)
    td.week = int(row['Kalenderwoche'])
    if td.week == 1:
        year = '2021'
    td.year = year
    td.positive_tests = get_value_int(row['Positive Tests'])
    td.negative_tests = get_value_int(row['Negative Tests'])
    td.total_tests = int(count)
    td.positivity_rate = get_value_float(row['Positivitätsrate'])
    td.pcr_positivity_rate = get_value_float(row['F'])
    td.ag_positivity_rate = get_value_float(row['G'])
    if td:
        print(td)
示例#5
0
        content, re.DOTALL)
    # Fallback layout: some PDFs break words with stray spaces
    # ("Labortes ts", "S chnelltes ts"); this looser pattern tolerates them.
    if not res:
        res = re.match(
            r'.*Labortes\s?ts\s\(PCR\s-\sund\sS\s?chnelltes\s?ts\s?\)\s+(\d.*\n)?Total\s+\d+\s+\d+\.?\d?\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s',
            content, re.DOTALL)
    assert res, f'PCR tests for week {week1} or {week2} could not be extracted!'
    # Capture groups 2 and 3 hold the totals of the two reported weeks.
    total_tests1 = res[2]
    total_tests2 = res[3]

    #res = re.match(r'.*Positivit.tsrate\s+\*?\s+\d.*%\s+(\d.*)%\s+(\d.*)%', content, re.DOTALL)
    # The positivity-rate row may carry footnote asterisks and optional
    # percent signs; capture the two weekly rates.
    res = re.match(
        r'.*Positivit.tsrate\s+\*+?\s+\d+\.?\d?%?\s+(\d+\.?\d?)%?\s+(\d+\.?\d?)%?',
        content, re.DOTALL)
    assert res, f'Positivity rate for week {week1} or {week2} could not be extracted!'
    pos_rate1 = res[1]
    pos_rate2 = res[2]

    # Emit one record per week — the Solothurn PDF reports two weeks at once.
    data = sc.TestData(canton='SO', url=pdf_url)
    data.week = week1
    data.year = year1
    data.total_tests = total_tests1
    data.positivity_rate = pos_rate1
    print(data)

    data = sc.TestData(canton='SO', url=pdf_url)
    data.week = week2
    data.year = year2
    data.total_tests = total_tests2
    data.positivity_rate = pos_rate2
    print(data)
示例#6
0
from datetime import timedelta

# weekly data: each BL bulletin reports the previous week's PCR figures
bulletin_urls = sbc.get_all_bl_bulletin_urls()
for bulletin_url in bulletin_urls:
    bulletin_content = sc.download(bulletin_url, silent=True)
    soup = BeautifulSoup(bulletin_content, 'html.parser')
    content = soup.find(string=re.compile(r'Per heute .*')).string
    content = sbc.strip_bl_bulletin_numbers(content)

    # The bulletin is dated "today"; its test figures refer to one week earlier.
    date = sc.date_from_text(sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', content))
    date -= timedelta(days=7)

    td = sc.TestData(canton='BL', url=bulletin_url)
    td.week = date.isocalendar()[1]
    td.year = date.year
    td.total_tests = sc.find(r'In der Vorwoche wurden (\d+) PCR-Tests',
                             content)
    td.positivity_rate = sc.find(
        r'von diesen waren (\d+\.?,?\d?) Prozent positiv', content)
    # Only emit complete records; normalize the German decimal comma.
    if td.total_tests and td.positivity_rate:
        td.positivity_rate = td.positivity_rate.replace(',', '.')
        print(td)

# daily data
main_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft/covid-19-bl-tests'
main_content = sc.download(main_url, silent=True)
soup = BeautifulSoup(main_content, 'html.parser')
示例#7
0
from io import StringIO
import scrape_common as sc


def prettify_positivity_rate(positivity_rate):
    """Round a positivity rate to one decimal place.

    Falsy inputs (None, '', 0) are passed through as None.
    """
    if not positivity_rate:
        return None
    rate = float(positivity_rate)
    return round(10 * rate) / 10


# Basel-Stadt (BS): daily test numbers from the cantonal open-data CSV export.
url = 'https://data.bs.ch/explore/dataset/100094/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B'
data = sc.download(url, silent=True)

# TestData attribute -> CSV column; empty cells become None.
_COLUMNS = (
    ('positive_tests', 'Positive Tests'),
    ('negative_tests', 'Negative Tests'),
    ('total_tests', 'Total Tests'),
    ('positivity_rate', 'Anteil positive Tests in Prozent'),
    ('pcr_positive_tests', 'Positive PCR Tests'),
    ('pcr_negative_tests', 'Negative PCR Tests'),
    ('pcr_total_tests', 'Total PCR Tests'),
    ('pcr_positivity_rate', 'Anteil positive PCR Tests in Prozent'),
    ('ag_positive_tests', 'Positive Antigen Schnelltests'),
    ('ag_negative_tests', 'Negative Antigen Schnelltests'),
    ('ag_total_tests', 'Total Antigen Schnelltests'),
)

reader = csv.DictReader(StringIO(data), delimiter=';')
for row in reader:
    td = sc.TestData(canton='BS', url=url)
    td.start_date = row['Datum']
    td.end_date = row['Datum']
    for attr, column in _COLUMNS:
        setattr(td, attr, row[column] or None)
示例#8
0
#!/usr/bin/env python3

from bs4 import BeautifulSoup
import re
import scrape_common as sc

# Bern (BE): scrape the weekly test table from the cantonal corona page.
html_url = 'https://www.besondere-lage.sites.be.ch/besondere-lage_sites/de/index/corona/index.html'
d = sc.download(html_url, silent=True)

soup = BeautifulSoup(d, 'html.parser')
# The relevant table is identified by its summary attribute.
for t in soup.find_all('table', summary=re.compile(r'.*die Zahl der durchgef.hrten Tests pro.*')):
    header_cells = t.find('tr').find_all('th')
    headers = [" ".join(cell.stripped_strings) for cell in header_cells]

    # Only rows that contain data cells (skips the header row itself).
    data_rows = [r for r in t.find_all('tr') if r.find_all('td')]
    for row in data_rows:
        td = sc.TestData(canton='BE', url=html_url)
        tot_tests = None

        for col_num, cell in enumerate(row.find_all(['td'])):
            value = " ".join(cell.stripped_strings)
            if value:
                # Strip everything but digits and the decimal point.
                value = re.sub(r'[^\d\.]', '', value)

            # Dispatch on the column header.
            header = headers[col_num]
            if sc.find(r'^(Kalender.*)', header) is not None:
                td.week = value
                td.year = '2020'
            elif sc.find(r'^(Durchge.*Tests)', header):
                td.total_tests = int(value)
            elif sc.find(r'^(davon.*positiv)', header):
                td.positive_tests = int(value)
            elif sc.find(r'^(Positivit.ts.*)', header):
                td.positivity_rate = value
示例#9
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import scrape_common as sc
from scrape_fr_common import get_fr_xls

# Fribourg (FR): weekly PCR and antigen test numbers from the cantonal XLS.
xls_url, xls, main_url = get_fr_xls()
rows = sc.parse_xls(xls, header_row=0, sheet_name='tests', enable_float=True)

# The sheet starts in 2020; the first week 1 marks the rollover into 2021.
year = '2020'

for row in rows:
    week = row['semaine /Woche']
    if not week:
        continue

    if week == 1:
        year = '2021'

    td = sc.TestData(canton='FR', url=main_url)
    td.week = int(week)
    td.year = year
    pcr_tests = int(row['Tests PCR'])
    ag_tests = int(row['Tests AG'])
    td.pcr_total_tests = pcr_tests
    # Rates are stored as fractions; convert to rounded percentages.
    td.pcr_positivity_rate = round(row['Taux/Rate PCR'] * 100)
    td.ag_total_tests = ag_tests
    td.ag_positivity_rate = round(row['Taux/Rate AG'] * 100)
    td.total_tests = pcr_tests + ag_tests
    print(td)
示例#10
0
import os

import db_common as dc
import scrape_common as sc

__location__ = dc.get_location()

# Presumably counts stdin lines that failed to parse — usage is outside
# this view.
input_failures = 0

# NOTE(review): `sqlite3` and `sys` are used below but not imported in
# this view — presumably imported further up the file.
try:
    DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
    conn = sqlite3.connect(DATABASE_NAME)

    i = 0
    # Read serialized TestData records from stdin and insert them into
    # the local SQLite database.
    for line in sys.stdin:
        td = sc.TestData()
        if td.parse(line.strip()):
            c = conn.cursor()
            try:
                print(td)

                c.execute(
                    '''
                    INSERT INTO data (
                      canton,
                      start_date,
                      end_date,
                      week,
                      year,
                      pcr_positive_tests,
                      pcr_negative_tests,
示例#11
0
import csv
import datetime
from io import StringIO
import scrape_common as sc

# Zug (ZG): weekly numbers from the cantonal statistics CSV.
csv_url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-07-i2-k4-b1.csv'
d_csv = sc.download(csv_url, silent=True)
"""
"Woche","Geschlecht","Anzahl Fälle","Meta","Type","Content"
2020-05-25,"männlich","151",NA,NA,NA
2020-06-01,"männlich","117",NA,NA,NA
"""

reader = csv.DictReader(StringIO(d_csv), delimiter=',')
# Sum the per-gender counts into one total per week.
# Fix: this previously used collections.defaultdict(dict) although
# `collections` is not imported here, and the explicit membership check
# below means the default factory was never used — a plain dict suffices.
data = {}
for row in reader:
    if row['Woche'] == 'NA':
        continue
    date = sc.date_from_text(row['Woche'])
    if date not in data:
        data[date] = 0
    data[date] += int(row['Anzahl Fälle'])

# Each key is the week's start date; the week ends six days later.
# NOTE(review): the summed column is 'Anzahl Fälle' (number of cases) but
# is stored as total_tests — confirm the dataset really contains tests.
for day in data:
    td = sc.TestData(canton='ZG', url=csv_url)
    td.start_date = day.isoformat()
    td.end_date = (day + datetime.timedelta(days=6)).isoformat()
    td.total_tests = data[day]
    print(td)
示例#12
0
#!/usr/bin/env python3

import scrape_common as sc

# Liechtenstein (FL): weekly test numbers from the published XLS chart data.
url = 'https://www.llv.li/files/as/grafik_covid19_tests_pro_kw.xlsx'
xls = sc.xlsdownload(url, silent=True)
rows = sc.parse_xls(xls, header_row=52, sheet_name='gTests_AG')
for row in rows:
    week_label = row['B']
    if week_label is None:
        # skip the footer line
        continue
    td = sc.TestData(canton='FL', url=url)
    td.week = sc.find(r'KW (\d+)', week_label)
    td.year = '2020'
    td.negative_tests = row['Negativ']
    td.positive_tests = row['Positiv']
    print(td)
示例#13
0
# Geneva (GE): drive the infocovid dashboard with headless Chrome to reach
# the tests chart and grab its XLS export link.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
driver.implicitly_wait(5)

url = 'https://infocovid.smc.unige.ch/'
driver.get(url)
# Navigate: Graphiques -> Tests
driver.find_element_by_link_text('Graphiques').click()
driver.find_element_by_partial_link_text('Tests').click()
xls_url = sgc.get_link_from_element(driver, 'save_plot_nombre_tests_data')
assert xls_url, "Couldn't find tests XLS url"

xls = sc.xlsdownload(xls_url, silent=True)
for row in sc.parse_xls(xls, header_row=0, enable_float=True):
    td = sc.TestData(canton='GE', url=url)
    # week_res looks like "YY-WW"
    res = re.search(r'(\d{2})-(\d{2})', row['week_res'])
    assert res, f"failed to extract year and week from {row['week_res']}"
    td.week = int(res[2])
    td.year = f'20{res[1]}'
    td.positive_tests = int(row['positifs'])
    td.negative_tests = int(row['négatifs'])
    td.total_tests = int(row['total'])
    # 2020-02/03 values are empty
    td.positivity_rate = float(row['ratio']) if row['ratio'] else 0
    print(td)
示例#14
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import scrape_common as sc
import scrape_nw_common as snc

# Nidwalden (NW): daily test numbers from the cantonal overview page.
url, soup = snc.get_nw_page()

td = sc.TestData(canton='NW', url=url)

# Fix: use the `string=` keyword (bs4's current name for this argument,
# consistent with the other scrapers) instead of the deprecated `text=`.
item = soup.find(string=re.compile('Anzahl F.lle')).find_parent('p')
assert item, f"Could not find title item in {url}"

# The page is a daily snapshot, so start and end date coincide.
date = sc.find(r'Stand: (\d+\. .* 20\d{2})', item.text)
date = sc.date_from_text(date)
td.start_date = date.isoformat()
td.end_date = date.isoformat()

# Scan the table next to the title for the 24h tests row.
rows = item.find_next('table').findChildren('tr')
for row in rows:
    cols = row.findChildren('td')
    item = cols[0].text
    if re.match(r'Covid-19-Tests innert 24h.*', item, re.I):
        # Cell format: "<count> <rate>%"
        res = re.match(r'(\d+)\s+(\d+\.?\d?)%', cols[1].text)
        assert res
        td.total_tests = res[1]
        td.positivity_rate = res[2]

assert td
print(td)
示例#15
0
#!/usr/bin/env python3

import csv
from io import StringIO
import scrape_common as sc


# Zurich (ZH): weekly positivity data from the openZH GitHub CSV.
url = 'https://raw.githubusercontent.com/openZH/covid_19/master/fallzahlen_kanton_zh/COVID19_Anteil_positiver_Test_pro_KW.csv'
data = sc.download(url, silent=True)

for row in csv.DictReader(StringIO(data), delimiter=','):
    td = sc.TestData(canton='ZH', url=url)
    td.start_date = row['Woche_von']
    td.end_date = row['Woche_bis']
    td.week = row['Kalenderwoche']
    td.positive_tests = int(row['Anzahl_positiv'])
    td.negative_tests = int(row['Anzahl_negativ'])
    td.positivity_rate = float(row['Anteil_positiv'])
    print(td)