#!/usr/bin/env python3
"""Scrape weekly COVID-19 test data for canton Jura (JU) from the official PDF report."""
import re

from bs4 import BeautifulSoup

import scrape_common as sc

base_url = 'https://www.jura.ch'
url = f'{base_url}/fr/Autorites/Coronavirus/Chiffres-H-JU/Evolution-des-cas-COVID-19-dans-le-Jura.html'
d = sc.download(url, silent=True)
# replace non-breaking spaces (U+00A0) with plain spaces so the regexes below match
# NOTE(review): verify the first literal is still U+00A0 — it is invisible in source
d = d.replace(' ', ' ')
soup = BeautifulSoup(d, 'html.parser')

# the page links the weekly report as a PDF; the link may be relative
pdf_url = soup.find('a', title=re.compile(r'.*PDF.*')).get('href')
if not pdf_url.startswith('http'):
    pdf_url = f'{base_url}{pdf_url}'
pdf_url = pdf_url.replace('?download=1', '')
pdf = sc.download_content(pdf_url, silent=True)

td = sc.TestData(canton='JU', url=pdf_url)

# page 1 carries the epidemiological week number and the year
content = sc.pdftotext(pdf, page=1)
td.week = sc.find(r'Situation semaine épidémiologique (\d+)', content)
td.year = sc.find(r'Du \d+.* (\d{4})', content)

# page 2 carries the test counts and the positivity rate
content = sc.pdftotext(pdf, page=2)
td.total_tests = sc.find(r'Nombre de tests\d?\s+(\d+)', content)
res = re.match(
    r'.*Nombre de tests positifs .*\s+(\d+)\s+\((\d+\.?\d?)%\s?\d?\)',
    content,
    re.DOTALL | re.MULTILINE,
)
assert res, 'failed to find number of positive tests and positivity rate'
td.positive_tests = res[1]
td.positivity_rate = res[2]
print(td)
# NOTE(review): this chunk starts mid-script — `content`, `re`, `sc` and `url`
# are defined above the visible region.

# extract the week labels ('KW ..') from the embedded chart configuration
res = re.search(r".*categories: \[('KW.*)\],", content)
assert res, f'failed to extract weeks, got {res}'
weeks = res[1].split(',')

res = re.search(r".*name: 'Anzahl negativer Tests.?',\s+color: '.*',\s+data: \[(.*)\],", content)
assert res, f'failed to extract negative tests, got {res}'
negative_tests = res[1].split(',')

res = re.search(r".*name: 'Anzahl positiver Tests.?',\s+color: '.*',\s+data: \[(.*)\],", content)
assert res, f'failed to extract positive tests, got {res}'
positive_tests = res[1].split(',')

res = re.search(r".*name: 'Positivitätsrate',\s+color: '.*',\s+data: \[(.*)\],", content)
# fixed typo in the failure message ("positivtiy" -> "positivity")
assert res, f'failed to extract positivity rate, got {res}'
positivity_rate = res[1].split(',')

# all four chart series must line up element-wise
assert len(weeks) == len(negative_tests) == len(positive_tests) == len(positivity_rate), \
    f'Expected same length for weeks {len(weeks)}, neg. tests {len(negative_tests)}, pos. tests {len(positive_tests)}, pos. rate {len(positivity_rate)}'

year = '2020'
for week, neg, pos, rate in zip(weeks, negative_tests, positive_tests, positivity_rate):
    td = sc.TestData(canton='TG', url=url)
    td.week = sc.find(r'KW (\d+)', week)
    # the series crosses the year boundary: week 1 marks the roll-over to 2021
    if int(td.week) == 1:
        year = '2021'
    td.year = year
    td.positive_tests = int(pos)
    td.negative_tests = int(neg)
    td.positivity_rate = float(rate)
    print(td)
def create_bs_test_data(date):
    """Build a single-day TestData record for canton BL pointing at main_url."""
    record = sc.TestData(canton='BL', url=main_url)
    record.start_date = record.end_date = date
    return record
# NOTE(review): chunk starts mid-file — the `return None` below is the tail of
# a helper (presumably get_value_int/get_value_float — confirm above this view).
    return None


xls_url = sac.get_ag_xls_url()
xls = sc.xlsdownload(xls_url, silent=True)
year = '2020'
rows = sc.parse_xls(xls, sheet_name='1.4 Labortests', header_row=1, enable_float=True)
for row in rows:
    # skip rows without a test count
    if not row['Anzahl Tests']:
        continue
    # a repeated header line marks the end of the data area
    if row['Anzahl Tests'] == 'Anzahl Tests':
        break
    td = sc.TestData(canton='AG', url=xls_url)
    td.week = int(row['Kalenderwoche'])
    # week numbers restart at 1 when the series crosses into 2021
    if td.week == 1:
        year = '2021'
    td.year = year
    td.positive_tests = get_value_int(row['Positive Tests'])
    td.negative_tests = get_value_int(row['Negative Tests'])
    td.total_tests = int(row['Anzahl Tests'])
    td.positivity_rate = get_value_float(row['Positivitätsrate'])
    # unnamed columns F/G — presumably PCR / antigen positivity rates; verify against the XLS
    td.pcr_positivity_rate = get_value_float(row['F'])
    td.ag_positivity_rate = get_value_float(row['G'])
    if td:
        print(td)
# NOTE(review): chunk starts mid-statement — these are the closing arguments of
# a re.match(...) call whose pattern lies above the visible region.
    content, re.DOTALL)
if not res:
    # fallback pattern: the extracted PDF text apparently splits words inside
    # "Labortests (PCR- und Schnelltests)", hence the \s? gaps — confirm against a sample PDF
    res = re.match(
        r'.*Labortes\s?ts\s\(PCR\s-\sund\sS\s?chnelltes\s?ts\s?\)\s+(\d.*\n)?Total\s+\d+\s+\d+\.?\d?\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s',
        content, re.DOTALL)
assert res, f'PCR tests for week {week1} or {week2} could not be extracted!'
total_tests1 = res[2]
total_tests2 = res[3]

#res = re.match(r'.*Positivit.tsrate\s+\*?\s+\d.*%\s+(\d.*)%\s+(\d.*)%', content, re.DOTALL)
res = re.match(
    r'.*Positivit.tsrate\s+\*+?\s+\d+\.?\d?%?\s+(\d+\.?\d?)%?\s+(\d+\.?\d?)%?',
    content, re.DOTALL)
assert res, f'Positivity rate for week {week1} or {week2} could not be extracted!'
pos_rate1 = res[1]
pos_rate2 = res[2]

# the PDF reports two consecutive weeks side by side -> emit one record per week
data = sc.TestData(canton='SO', url=pdf_url)
data.week = week1
data.year = year1
data.total_tests = total_tests1
data.positivity_rate = pos_rate1
print(data)

data = sc.TestData(canton='SO', url=pdf_url)
data.week = week2
data.year = year2
data.total_tests = total_tests2
data.positivity_rate = pos_rate2
print(data)
from datetime import timedelta

# weekly data: one bulletin per week, each reporting the previous week's tests
for bulletin_url in sbc.get_all_bl_bulletin_urls():
    bulletin_html = sc.download(bulletin_url, silent=True)
    bulletin_soup = BeautifulSoup(bulletin_html, 'html.parser')
    text = bulletin_soup.find(string=re.compile(r'Per heute .*')).string
    text = sbc.strip_bl_bulletin_numbers(text)

    report_date = sc.date_from_text(sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', text))
    # the bulletin describes the week before its publication date
    report_date -= timedelta(days=7)

    td = sc.TestData(canton='BL', url=bulletin_url)
    td.week = report_date.isocalendar()[1]
    td.year = report_date.year
    td.total_tests = sc.find(r'In der Vorwoche wurden (\d+) PCR-Tests', text)
    td.positivity_rate = sc.find(
        r'von diesen waren (\d+\.?,?\d?) Prozent positiv', text)
    if td.total_tests and td.positivity_rate:
        # normalize the decimal separator (German comma -> dot)
        td.positivity_rate = td.positivity_rate.replace(',', '.')
        print(td)

# daily data
main_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft/covid-19-bl-tests'
main_content = sc.download(main_url, silent=True)
soup = BeautifulSoup(main_content, 'html.parser')
"""Scrape daily COVID-19 test data for canton Basel-Stadt (BS) from the open-data CSV."""
import csv  # added: csv.DictReader is used below but csv was not imported in this chunk
from io import StringIO

import scrape_common as sc


def prettify_positivity_rate(positivity_rate):
    """Round a positivity rate to one decimal place; falsy input yields None."""
    # NOTE(review): not referenced in the visible region — presumably used below the cut
    if not positivity_rate:
        return None
    return round(10 * float(positivity_rate)) / 10


url = 'https://data.bs.ch/explore/dataset/100094/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B'
data = sc.download(url, silent=True)
reader = csv.DictReader(StringIO(data), delimiter=';')
# NOTE(review): chunk is cut off inside this loop; further fields / print(td)
# lie beyond the visible region.
for row in reader:
    td = sc.TestData(canton='BS', url=url)
    td.start_date = row['Datum']
    td.end_date = row['Datum']
    # empty CSV cells become None rather than ''
    td.positive_tests = row['Positive Tests'] or None
    td.negative_tests = row['Negative Tests'] or None
    td.total_tests = row['Total Tests'] or None
    td.positivity_rate = row['Anteil positive Tests in Prozent'] or None
    td.pcr_positive_tests = row['Positive PCR Tests'] or None
    td.pcr_negative_tests = row['Negative PCR Tests'] or None
    td.pcr_total_tests = row['Total PCR Tests'] or None
    td.pcr_positivity_rate = row['Anteil positive PCR Tests in Prozent'] or None
    td.ag_positive_tests = row['Positive Antigen Schnelltests'] or None
    td.ag_negative_tests = row['Negative Antigen Schnelltests'] or None
    td.ag_total_tests = row['Total Antigen Schnelltests'] or None
#!/usr/bin/env python3 from bs4 import BeautifulSoup import re import scrape_common as sc html_url = 'https://www.besondere-lage.sites.be.ch/besondere-lage_sites/de/index/corona/index.html' d = sc.download(html_url, silent=True) soup = BeautifulSoup(d, 'html.parser') for t in soup.find_all('table', summary=re.compile(r'.*die Zahl der durchgef.hrten Tests pro.*')): headers = [" ".join(cell.stripped_strings) for cell in t.find('tr').find_all('th')] for row in [r for r in t.find_all('tr') if r.find_all('td')]: td = sc.TestData(canton='BE', url=html_url) tot_tests = None for col_num, cell in enumerate(row.find_all(['td'])): value = " ".join(cell.stripped_strings) if value: value = re.sub(r'[^\d\.]', '', value) if sc.find(r'^(Kalender.*)', headers[col_num]) is not None: td.week = value td.year = '2020' elif sc.find(r'^(Durchge.*Tests)', headers[col_num]): td.total_tests = int(value) elif sc.find(r'^(davon.*positiv)', headers[col_num]): td.positive_tests = int(value) elif sc.find(r'^(Positivit.ts.*)', headers[col_num]): td.positivity_rate = value
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape weekly COVID-19 test data for canton Fribourg (FR) from the official XLS."""
import scrape_common as sc
from scrape_fr_common import get_fr_xls

xls_url, xls, main_url = get_fr_xls()

year = '2020'
for row in sc.parse_xls(xls, header_row=0, sheet_name='tests', enable_float=True):
    week = row['semaine /Woche']
    # rows without a week number carry no data
    if not week:
        continue
    # the sheet rolls over from 2020 into 2021 when the week counter resets
    if week == 1:
        year = '2021'

    td = sc.TestData(canton='FR', url=main_url)
    td.week = int(week)
    td.year = year
    # rates come as fractions; store them as (rounded) percentages
    td.pcr_total_tests = int(row['Tests PCR'])
    td.pcr_positivity_rate = round(row['Taux/Rate PCR'] * 100)
    td.ag_total_tests = int(row['Tests AG'])
    td.ag_positivity_rate = round(row['Taux/Rate AG'] * 100)
    td.total_tests = td.pcr_total_tests + td.ag_total_tests
    print(td)
import os import db_common as dc import scrape_common as sc __location__ = dc.get_location() input_failures = 0 try: DATABASE_NAME = os.path.join(__location__, 'data.sqlite') conn = sqlite3.connect(DATABASE_NAME) i = 0 for line in sys.stdin: td = sc.TestData() if td.parse(line.strip()): c = conn.cursor() try: print(td) c.execute( ''' INSERT INTO data ( canton, start_date, end_date, week, year, pcr_positive_tests, pcr_negative_tests,
"""Scrape weekly COVID-19 numbers for canton Zug (ZG) from the statistics office CSV."""
import collections  # added: collections.defaultdict is used below but was never imported
import csv
import datetime
from io import StringIO

import scrape_common as sc

csv_url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-07-i2-k4-b1.csv'
d_csv = sc.download(csv_url, silent=True)

"""
"Woche","Geschlecht","Anzahl Fälle","Meta","Type","Content"
2020-05-25,"männlich","151",NA,NA,NA
2020-06-01,"männlich","117",NA,NA,NA
"""

reader = csv.DictReader(StringIO(d_csv), delimiter=',')
# sum the per-gender counts into one total per week start date.
# defaultdict(int) replaces the original defaultdict(dict) + manual
# "if date not in data: data[date] = 0" guard — same behavior, correct factory.
data = collections.defaultdict(int)
for row in reader:
    if row['Woche'] == 'NA':
        continue
    date = sc.date_from_text(row['Woche'])
    data[date] += int(row['Anzahl Fälle'])

for day in data:
    td = sc.TestData(canton='ZG', url=csv_url)
    td.start_date = day.isoformat()
    # each record covers a full week
    td.end_date = (day + datetime.timedelta(days=6)).isoformat()
    # NOTE(review): the CSV column is "Anzahl Fälle" (cases) — confirm that
    # storing it as total_tests is intended
    td.total_tests = data[day]
    print(td)
#!/usr/bin/env python3
"""Scrape weekly COVID-19 test counts for Liechtenstein (FL) from the official XLSX."""
import scrape_common as sc

url = 'https://www.llv.li/files/as/grafik_covid19_tests_pro_kw.xlsx'
xls = sc.xlsdownload(url, silent=True)
for row in sc.parse_xls(xls, header_row=52, sheet_name='gTests_AG'):
    # the footer line has an empty first column — skip it
    if row['B'] is None:
        continue
    td = sc.TestData(canton='FL', url=url)
    td.week = sc.find(r'KW (\d+)', row['B'])
    td.year = '2020'
    td.negative_tests = row['Negativ']
    td.positive_tests = row['Positiv']
    print(td)
# drive the JavaScript-only dashboard with headless Chrome
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
driver.implicitly_wait(5)

url = 'https://infocovid.smc.unige.ch/'
driver.get(url)

# navigate: Graphiques -> Tests
driver.find_element_by_link_text('Graphiques').click()
driver.find_element_by_partial_link_text('Tests').click()

xls_url = sgc.get_link_from_element(driver, 'save_plot_nombre_tests_data')
assert xls_url, "Couldn't find tests XLS url"

xls = sc.xlsdownload(xls_url, silent=True)
for row in sc.parse_xls(xls, header_row=0, enable_float=True):
    td = sc.TestData(canton='GE', url=url)
    # week_res looks like 'YY-WW'
    match = re.search(r'(\d{2})-(\d{2})', row['week_res'])
    assert match, f"failed to extract year and week from {row['week_res']}"
    td.week = int(match[2])
    td.year = f'20{match[1]}'
    td.positive_tests = int(row['positifs'])
    td.negative_tests = int(row['négatifs'])
    td.total_tests = int(row['total'])
    # 2020-02/03 values are empty
    td.positivity_rate = 0
    if row['ratio']:
        td.positivity_rate = float(row['ratio'])
    print(td)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape the daily COVID-19 test count and positivity rate for canton Nidwalden (NW)."""
import re

import scrape_common as sc
import scrape_nw_common as snc

url, soup = snc.get_nw_page()

td = sc.TestData(canton='NW', url=url)

# locate the paragraph carrying the "Stand: <date>" information
item = soup.find(text=re.compile('Anzahl F.lle')).find_parent('p')
assert item, f"Could not find title item in {url}"
date = sc.date_from_text(sc.find(r'Stand: (\d+\. .* 20\d{2})', item.text))
td.start_date = td.end_date = date.isoformat()

# scan the table that follows for the tests row
for row in item.find_next('table').findChildren('tr'):
    cols = row.findChildren('td')
    label = cols[0].text
    if re.match(r'Covid-19-Tests innert 24h.*', label, re.I):
        parsed = re.match(r'(\d+)\s+(\d+\.?\d?)%', cols[1].text)
        assert parsed
        td.total_tests = parsed[1]
        td.positivity_rate = parsed[2]

assert td
print(td)
#!/usr/bin/env python3
"""Scrape weekly COVID-19 test numbers for canton Zurich (ZH) from the openZH CSV."""
import csv
from io import StringIO

import scrape_common as sc

url = 'https://raw.githubusercontent.com/openZH/covid_19/master/fallzahlen_kanton_zh/COVID19_Anteil_positiver_Test_pro_KW.csv'
data = sc.download(url, silent=True)

for row in csv.DictReader(StringIO(data), delimiter=','):
    td = sc.TestData(canton='ZH', url=url)
    td.start_date = row['Woche_von']
    td.end_date = row['Woche_bis']
    td.week = row['Kalenderwoche']
    td.positive_tests = int(row['Anzahl_positiv'])
    td.negative_tests = int(row['Anzahl_negativ'])
    td.positivity_rate = float(row['Anteil_positiv'])
    print(td)