#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re

import scrape_common as sc
import scrape_ge_common as sgc

pdf_urls = sgc.get_ge_weekly_pdf_urls()
for pdf_url in pdf_urls:
    pdf = sc.download_content(pdf_url, silent=True)

    # page 1 carries the report week number and the year
    content = sc.pdftotext(pdf, page=1)
    week_number = sc.find(r'Situation semaine (\d+)', content)
    year = sc.find(r'au \d+(\w+)? \w+ (\d{4})', content, group=2)

    pages = int(sc.pdfinfo(pdf))
    for page in range(3, pages):
        content = sc.pdftotext(pdf, page=page)
        # remove the ' thousands separator to simplify pattern matching
        content = re.sub(r'(\d)\'(\d)', r'\1\2', content)

        if sc.find(r'(Dynamique et tendances épidémiologiques)', content):
            weekly_tests = sc.find(
                r'avec\s(\d+)\stests\s(effectués\s?)?(contre|\.)',
                content)
            res = re.match(
                r'.*taux\sde\spositivité.*\s\(?(\d+\.?\d?)%\)?\s(en|durant).*\d+\.?\d?%',
                content,
                re.MULTILINE | re.DOTALL)
            positivity_rate = None
            if res:
                positivity_rate = res[1]
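
# --- illustrative sketch, not part of the scraper ---
# sc.find's implementation isn't shown here; a minimal sketch of what it
# presumably does (an assumption, not the repo's actual code): wrap re.search
# and return the requested capture group, or None. `re` is imported above.
def find_sketch(pattern, text, group=1):
    res = re.search(pattern, text)
    return res[group] if res else None

# with the thousands-separator cleanup from above, "1'234" becomes "1234",
# so \d+ matches the whole number (the sample text is made up):
sample = "avec 1'234 tests effectués contre 987"
sample = re.sub(r"(\d)\'(\d)", r"\1\2", sample)
assert find_sketch(r'avec\s(\d+)\stests', sample) == '1234'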
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re

import arrow
from bs4 import BeautifulSoup

import scrape_common as sc


def parse_ju_date(date_str):
    return arrow.get(date_str, 'DD.MM.YYYY', locale='de').datetime.date()


base_url = 'https://www.jura.ch'
url = f'{base_url}/fr/Autorites/Coronavirus/Infos-Actualite/Statistiques-COVID/Evolution-des-cas-COVID-19-dans-le-Jura.html'
d = sc.download(url)
# normalize non-breaking spaces to regular spaces
d = d.replace('\xa0', ' ')
soup = BeautifulSoup(d, 'html.parser')

# locate the vaccination-data PDF linked from the statistics page
pdf_url = soup.find('a', title=re.compile(r'Donn.es de vaccination')).get('href')
if not pdf_url.startswith('http'):
    pdf_url = f'{base_url}{pdf_url}'
pdf_url = pdf_url.replace('?download=1', '')
pdf = sc.download_data(pdf_url)
pages = sc.pdfinfo(pdf)

vd = sc.VaccinationData(canton='JU', url=pdf_url)

content = sc.pdf_to_text(pdf, page=1, raw=True)
# remove the ' thousands separator to simplify pattern matching
content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)

# the second of the two dates is the end of the reporting period
res = re.search(r'\d+\.\d+\.\d{4}\s(\d+\.\d+\.\d{4})', content)
assert res
vd.date = parse_ju_date(res[1])

res = re.search(r'(\d+)\s+Nombre d\'injection', content)
assert res
vd.total_vaccinations = res[1]

content = sc.pdf_to_text(pdf, page=2, raw=True)
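
# --- illustrative sanity checks, not part of the scraper ---
# sample dates are made up; parse_ju_date and re come from above.
# arrow parses the Swiss DD.MM.YYYY format into a datetime.date:
assert str(parse_ju_date('15.03.2021')) == '2021-03-15'

# the date regex above deliberately captures the second of two dates
# (the end of the reporting period):
res = re.search(r'\d+\.\d+\.\d{4}\s(\d+\.\d+\.\d{4})', '08.03.2021 15.03.2021')
assert res[1] == '15.03.2021'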
#!/usr/bin/env python

import re

import scrape_common as sc
import scrape_vs_common as svc

# get the latest weekly PDF
url = svc.get_vs_latest_weekly_pdf_url()

# fetch the PDF
pdf = sc.download_content(url, silent=True)

week, year = svc.get_vs_weekly_general_data(pdf)

# the second-to-last page contains the district data
page = int(sc.pdfinfo(pdf)) - 2
content = sc.pdftotext(pdf, page=page, layout=True, rect=[0, 403, 420, 50], fixed=2)


# strip everything up to and including the "Anzahl Faelle" column + values
def strip_left_number(content):
    lines = content.split('\n')

    # find the narrowest end position of the left-hand number column
    pos = None
    for line in lines:
        res = re.search(r'\s+(\d+) ', line)
        if res is not None:
            if pos is None:
                pos = res.end()
            else:
                pos = min(pos, res.end())

    # cut each line at that position, keeping only the columns to the right
    new_content = []
    for line in lines:
        if pos is not None:
            line = line[pos:]
        new_content.append(line)
    return '\n'.join(new_content)
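
# --- illustrative run, not part of the scraper ---
# fabricated two-column layout text (values are made up): the left-hand
# number column is cut at the narrowest match, keeping only what follows.
sample = ' 123  Brig  45\n  67  Visp  8'
assert strip_left_number(sample) == ' Brig  45\n Visp  8'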