def parse_xlsx():
    html_url = 'https://www.vd.ch/toutes-les-actualites/hotline-et-informations-sur-le-coronavirus/point-de-situation-statistique-dans-le-canton-de-vaud/'
    d = sc.download(html_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find(href=re.compile(r'\.xlsx$')).get('href')
    assert xls_url, "URL is empty"
    xls = sc.xlsdownload(xls_url, silent=True)
    rows = sc.parse_xls(xls, header_row=2)
    is_first = True
    for row in rows:
        # skip rows without a proper date (headers, footers, notes)
        if not isinstance(row['Date'], datetime.datetime):
            continue
        if not is_first:
            print('-' * 10)
        is_first = False

        print('VD')
        sc.timestamp()
        print('Downloading:', html_url)
        print('Date and time:', row['Date'].date().isoformat())
        print('Confirmed cases:', row['Nombre total de cas confirmés positifs'])
        print('Hospitalized:', row['Hospitalisation en cours'])
        print('ICU:', row['Dont soins intensifs'])
        print('Deaths:', row['Décès'])
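
# The is_first / separator pattern above recurs in every scraper in this
# section. A generator-based equivalent is sketched here for illustration
# only (hypothetical helper, not part of scrape_common):
def with_separators(items, sep='-' * 10):
    """Yield items, printing a separator line between consecutive ones."""
    first = True
    for item in items:
        if not first:
            print(sep)
        first = False
        yield item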
        dd.hospitalized = res['hosp']
        dd.quarantined = res['quar']
        dd.quarantine_riskareatravel = res['qtravel']
        print(dd)
        is_first = False
"""

try:
    xls_url = soup.find(
        'a', string=re.compile(r'Coronaf.lle\s*im\s*Kanton\s*Schwyz'))['href']
except TypeError:
    print("Unable to determine xls url", file=sys.stderr)
    sys.exit(1)

xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls)
for row in rows:
    if not isinstance(row['Datum'], datetime.datetime):
        continue
    if not is_first:
        print('-' * 10)
    is_first = False

    # TODO: remove when source is fixed
    # handle wrong value on 2020-03-25, see issue #631
    if row['Datum'].date().isoformat() == '2020-03-25':
        row['Bestätigte Fälle (kumuliert)'] = ''

    dd = sc.DayData(canton='SZ', url=url)
    dd.datetime = row['Datum'].date().isoformat()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import datetime
import scrape_common as sc

xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20%20COVID-19%20Valais.xlsx'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=1)
is_first = True
for row in rows:
    if not isinstance(row['Date'], datetime.datetime):
        continue
    if not is_first:
        print('-' * 10)
    is_first = False

    dd = sc.DayData(canton='VS', url=xls_url)
    dd.datetime = row['Date'].date().isoformat()
    dd.cases = row['Cumul cas positifs']
    dd.hospitalized = row['Total hospitalisations COVID-19']
    dd.icu = row['Patients COVID-19 aux SI total']
    dd.vent = row['Patients COVID-19 intubés']
    dd.deaths = row['Cumul décès COVID-19']
    print(dd)
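
# scrape_common.DayData is not shown in this section. Inferred purely from
# usage (canton/url constructor, free-form data attributes, printable, and
# truthiness checks like `if dd:` in the AG scrapers below), it presumably
# behaves roughly like this sketch; the real class in scrape_common is
# authoritative:
class DayDataSketch:
    CORE_FIELDS = ('canton', 'url')

    def __init__(self, canton, url):
        self.canton = canton
        self.url = url

    def __bool__(self):
        # truthy only once at least one data field beyond canton/url is set
        return any(v not in (None, '')
                   for k, v in vars(self).items() if k not in self.CORE_FIELDS)

    def __str__(self):
        return '\n'.join(f'{k}: {v}' for k, v in vars(self).items())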
import re
import datetime
import sys

from bs4 import BeautifulSoup
import scrape_common as sc

d = sc.download('https://www.fr.ch/sante/covid-19/coronavirus-statistiques-evolution-de-la-situation-dans-le-canton', silent=True)
soup = BeautifulSoup(d, 'html.parser')
xls_url = soup.find(href=re.compile(r'\.xlsx$')).get('href')
assert xls_url, "URL is empty"
if not xls_url.startswith('http'):
    xls_url = f'https://www.fr.ch{xls_url}'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=0, sheet_name='Données sites internet')
is_first = True
for row in rows:
    if not isinstance(row['Date'], datetime.datetime):
        print(f"WARNING: {row['Date']} is not a valid date, skipping.", file=sys.stderr)
        continue
    if not is_first:
        print('-' * 10)
    is_first = False

    print('FR')
    sc.timestamp()
    print('Downloading:', xls_url)
    print('Date and time:', row['Date'].date().isoformat())
    print('Confirmed cases:', row['Total cas avérés'])
    'Jura bernois': 53721,
    'Biel/Bienne': 101313,
    'Seeland': 74467,
    'Oberaargau': 81759,
    'Emmental': 97218,
    'Bern-Mittelland': 414658,
    'Thun': 107491,
    'Obersimmental-Saanen': 16588,
    'Frutigen-Niedersimmental': 40375,
    'Interlaken-Oberhasli': 47387,
}

# fetch communes / cities of BE
xls_url = 'https://www.jgk.be.ch/jgk/de/index/gemeinden/gemeinden/gemeindedaten.assetref/dam/documents/JGK/AGR/de/Gemeinden/Gemeindedaten/agr_gemeinden_gemeindedaten_gemeinden_rk_de.xlsx'
xls = sc.xlsdownload(xls_url, silent=True)
xls_data = sc.parse_xls(xls, header_row=1, columns_to_parse=9)
communes = {}
for item in xls_data:
    commune = item['Gemeinde / Commune']
    # strip the canton suffix, kind of expected in this context
    commune = commune.replace(' (BE)', '')
    commune = commune.replace(' BE', '')
    district = item['Verwaltungskreis / Arrondissement administratif']
    communes[commune] = district
    assert district in district_ids, f'District {district} is unknown!'

# start getting and parsing the data
html_url = 'https://www.besondere-lage.sites.be.ch/besondere-lage_sites/de/index/corona/index.html'
d = sc.download(html_url, silent=True)
# normalize non-breaking spaces before parsing
d = d.replace('\xa0', ' ')
soup = BeautifulSoup(d, 'html.parser')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import scrape_common as sc
from scrape_fr_common import get_fr_xls

xls_url, xls = get_fr_xls()
rows = sc.parse_xls(xls, header_row=0, sheet_name='tests COVID19', enable_float=True)
for row in rows:
    td = sc.TestData(canton='FR', url=xls_url)
    td.week = sc.find(r'S (\d+)', row['Semaine'])
    td.year = '2020'
    tot = int(row['Total Testing Pop FR'])
    pos = int(row['Total POS Pop FR'])
    td.positive_tests = pos
    td.total_tests = tot
    print(td)
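
# scrape_fr_common.get_fr_xls is not included in this section. Judging by the
# inline FR scraper above, it presumably resolves the .xlsx link on the fr.ch
# statistics page and downloads it; a hedged sketch (page URL taken from that
# scraper, everything else assumed):
import re
from bs4 import BeautifulSoup
import scrape_common as sc

def get_fr_xls_sketch():
    base = 'https://www.fr.ch'
    d = sc.download(f'{base}/sante/covid-19/coronavirus-statistiques-evolution-de-la-situation-dans-le-canton', silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find(href=re.compile(r'\.xlsx$')).get('href')
    if not xls_url.startswith('http'):
        xls_url = f'{base}{xls_url}'
    return xls_url, sc.xlsdownload(xls_url, silent=True)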
def get_value_int(value):
    if value is not None and value != '':
        return int(value)
    return None


def get_value_float(value):
    if value is not None and value != '':
        return float(value)
    return None


xls_url = sac.get_ag_xls_url()
xls = sc.xlsdownload(xls_url, silent=True)

year = '2020'
rows = sc.parse_xls(xls, sheet_name='1.4 Labortests', header_row=1, enable_float=True)
for row in rows:
    if not row['Anzahl Tests']:
        continue
    # a repeated header row marks the end of the data block
    if row['Anzahl Tests'] == 'Anzahl Tests':
        break
    td = sc.TestData(canton='AG', url=xls_url)
    td.week = int(row['Kalenderwoche'])
    # the sheet runs into 2021: flip the year once week 1 shows up again
    if td.week == 1:
        year = '2021'
    td.year = year
    td.positive_tests = get_value_int(row['Positive Tests'])
    td.negative_tests = get_value_int(row['Negative Tests'])
    td.total_tests = int(row['Anzahl Tests'])
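
# The year rollover above assumes the rows are in chronological order: the
# year flips from 2020 to 2021 the first time 'Kalenderwoche' reads 1 again.
# A self-contained check of that logic on made-up week numbers:
def assign_years(weeks, start_year=2020):
    year, out = start_year, []
    for week in weeks:
        if week == 1:
            year = start_year + 1
        out.append((week, year))
    return out

assert assign_years([51, 52, 53, 1, 2]) == [
    (51, 2020), (52, 2020), (53, 2020), (1, 2021), (2, 2021)]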
#!/usr/bin/env python3

import datetime
import scrape_common as sc
import scrape_ag_common as sac

xls_url = sac.get_ag_xls_url()
xls = sc.xlsdownload(xls_url, silent=True)
is_first = True

# quarantine_riskareatravel
rows = sc.parse_xls(xls, sheet_name='5. Quarantäne nach Einreise', header_row=2)
for row in rows:
    if not isinstance(row['A'], datetime.datetime):
        continue
    dd = sc.DayData(canton='AG', url=xls_url)
    dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}"
    dd.quarantine_riskareatravel = row['Gesamtzahl aktuell betreuter Personen']
    if dd:
        if not is_first:
            print('-' * 10)
        is_first = False
        print(dd)

# quarantine + isolation
rows = sc.parse_xls(xls, sheet_name='2. Contact Tracing', header_row=2)
for row in rows:
    if not isinstance(row['A'], datetime.datetime):
    dd_test = sc.DayData(canton='GE', url=pdf_url)
    dd_test.datetime = week_end_date.isoformat()
    dd_test.tested = number_of_tests
    print(dd_test)

# xls
d = sc.download('https://www.ge.ch/document/covid-19-donnees-completes-debut-pandemie', silent=True)
soup = BeautifulSoup(d, 'html.parser')
xls_url = soup.find(title=re.compile(r'\.xlsx$')).get('href')
assert xls_url, "xls URL is empty"
if not xls_url.startswith('http'):
    xls_url = f'https://www.ge.ch{xls_url}'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=0, skip_rows=2)
for i, row in enumerate(rows):
    if not isinstance(row['Date'], datetime.datetime):
        print(f"WARNING: {row['Date']} is not a valid date, skipping.", file=sys.stderr)
        continue
    if row['Nombre cas COVID-19'] is None:
        print(f"WARNING: 'Nombre cas COVID-19' is empty on {row['Date']}, skipping.", file=sys.stderr)
        continue
    if not is_first:
        print('-' * 10)
    is_first = False

    # TODO: remove when source is fixed
    # handle wrong value on 2020-04-09, see issue #819
    if row['Date'].date().isoformat() == '2020-04-09':
#!/usr/bin/env python3

import scrape_common as sc

url = 'https://www.llv.li/files/as/grafik_covid19_tests_pro_kw.xlsx'
xls = sc.xlsdownload(url, silent=True)
rows = sc.parse_xls(xls, header_row=52, sheet_name='gTests_AG')
for row in rows:
    if row['B'] is None:
        # skip the footer line
        continue
    td = sc.TestData(canton='FL', url=url)
    td.week = sc.find(r'KW (\d+)', row['B'])
    td.year = '2020'
    td.negative_tests = row['Negativ']
    td.positive_tests = row['Positiv']
    print(td)
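
# sc.find (used here and in the FR tests scraper) is not shown in this
# section; from its call sites it presumably returns the first capture group
# of a regex match, or None when nothing matches. A sketch under that
# assumption:
import re

def find_sketch(pattern, text):
    m = re.search(pattern, str(text))
    return m[1] if m else None

assert find_sketch(r'KW (\d+)', 'KW 42') == '42'
assert find_sketch(r'KW (\d+)', 'Quelle: LLV') is None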
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
driver.implicitly_wait(5)

url = 'https://infocovid.smc.unige.ch/'
driver.get(url)

elem = driver.find_element_by_link_text('Graphiques')
elem.click()
elem = driver.find_element_by_partial_link_text('Tests')
elem.click()

xls_url = sgc.get_link_from_element(driver, 'save_plot_nombre_tests_data')
assert xls_url, "Couldn't find tests XLS url"
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=0, enable_float=True)
for row in rows:
    td = sc.TestData(canton='GE', url=url)
    res = re.search(r'(\d{2})-(\d{2})', row['week_res'])
    assert res, f"failed to extract year and week from {row['week_res']}"
    td.week = int(res[2])
    td.year = f'20{res[1]}'
    td.positive_tests = int(row['positifs'])
    td.negative_tests = int(row['négatifs'])
    td.total_tests = int(row['total'])
    # 2020-02/03 values are empty
    td.positivity_rate = 0
    if row['ratio']:
        td.positivity_rate = float(row['ratio'])
    print(td)
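
# Note: the find_element_by_* helpers used above were removed in Selenium 4.
# If this scraper is run against a current Selenium, the same clicks would be
# expressed as follows (sketch, same locators as above):
from selenium.webdriver.common.by import By

def open_tests_tab(driver):
    driver.find_element(By.LINK_TEXT, 'Graphiques').click()
    driver.find_element(By.PARTIAL_LINK_TEXT, 'Tests').click()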
import re
import datetime

from bs4 import BeautifulSoup
import scrape_common as sc

data_url = 'https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp'
d = sc.download(data_url, silent=True)
soup = BeautifulSoup(d, 'html.parser')
xls_url = soup.find('a', href=re.compile(r'\.xlsx$'))['href']
if not xls_url.startswith('http'):
    xls_url = f'https://www.ag.ch{xls_url}'
xls = sc.xlsdownload(xls_url, silent=True)
is_first = True

# quarantine + isolation
rows = sc.parse_xls(xls, sheet_name='2. Contact Tracing', header_row=2)
for row in rows:
    if not isinstance(row['A'], datetime.datetime):
        continue
    dd = sc.DayData(canton='AG', url=xls_url)
    dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}"
    dd.isolated = row['Gesamtzahl aktuell betreuter Personen']
    dd.quarantined = row['Gesamtzahl aktuell betreuter Personen5']
    if dd:
        if not is_first:
            print('-' * 10)
        is_first = False
        print(dd)

# cases + hospitalization