def get_vs_weekly_general_data(pdf):
    """Return (week, year) for the epidemiological report in a VS weekly PDF.

    The week number is read from the page-1 heading.  The year is taken from
    the START of the reporting week (printed end date minus 7 days), so a
    week that ends just after New Year is attributed to the previous year.
    """
    first_page = sc.pdftotext(pdf, page=1)
    week = sc.find(r'Epidemiologische Situation Woche (\d+)', first_page)
    # The first dd.mm.yyyy date on the page is the end of the reporting period.
    end_date = sc.date_from_text(sc.find(r'(\d+\.\d+\.\d{4})', first_page))
    start_date = end_date - datetime.timedelta(days=7)
    return week, start_date.year
from io import StringIO import scrape_common as sc import scrape_gl_common as sgc def split_whitespace(text): if not text: return [] text = re.sub(r'\s\s+', ' ', text) return text.split(' ') # weekly pdf pdf_url = sgc.get_gl_pdf_url() pdf = sc.download_content(pdf_url, silent=True) content = sc.pdftotext(pdf, page=1) pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content) pdf_date = sc.date_from_text(pdf_date) number_of_tests = sc.find(r'PCR-Tests\sKanton Glarus\s(\d+\'?\d+)\s', content).replace('\'', '') is_first = True if number_of_tests: dd = sc.DayData(canton='GL', url=pdf_url) dd.datetime = pdf_date dd.tested = number_of_tests is_first = False print(dd) content = sc.pdftotext(pdf, page=2, raw=True) dates = split_whitespace(
base_url = 'https://www.jura.ch'
url = f'{base_url}/fr/Autorites/Coronavirus/Chiffres-H-JU/Evolution-des-cas-COVID-19-dans-le-Jura.html'

# Fetch the overview page; normalize non-breaking spaces (U+00A0) so the
# regex searches below see plain spaces.
html = sc.download(url, silent=True)
html = html.replace('\xa0', ' ')
soup = BeautifulSoup(html, 'html.parser')

# The weekly report is linked via an anchor whose title mentions "PDF".
link = soup.find('a', title=re.compile(r'.*PDF.*'))
pdf_url = link.get('href')
if not pdf_url.startswith('http'):
    pdf_url = f'{base_url}{pdf_url}'
pdf_url = pdf_url.replace('?download=1', '')

pdf = sc.download_content(pdf_url, silent=True)
td = sc.TestData(canton='JU', url=pdf_url)

# Page 1 carries the epidemiological week number and the reporting period.
front_page = sc.pdftotext(pdf, page=1)
td.week = sc.find(r'Situation semaine épidémiologique (\d+)', front_page)
td.year = sc.find(r'Du \d+.* (\d{4})', front_page)

# Page 2 carries the test counts and the positivity rate.
stats_page = sc.pdftotext(pdf, page=2)
td.total_tests = sc.find(r'Nombre de tests\d?\s+(\d+)', stats_page)
res = re.match(
    r'.*Nombre de tests positifs .*\s+(\d+)\s+\((\d+\.?\d?)%\s?\d?\)',
    stats_page,
    re.DOTALL | re.MULTILINE)
assert res, 'failed to find number of positive tests and positivity rate'
td.positive_tests = res.group(1)
td.positivity_rate = res.group(2)
print(td)
def get_vs_weekly_general_data(pdf):
    """Return (week, year) for the epidemiological report in a VS weekly PDF.

    The year is derived from the START of the reporting week (printed end
    date minus 7 days) rather than straight from the end date: a week that
    ends just after New Year (e.g. week 53 ending in early January) belongs
    to the previous year, and taking the year from the end date would
    mislabel it.  This matches the sibling implementation that already does
    the 7-day subtraction.  Note the year is returned as an int.
    """
    # Function-scope import: this chunk's module header is not visible, so
    # bring datetime into scope locally to stay self-contained.
    import datetime

    content = sc.pdftotext(pdf, page=1)
    week = sc.find(r'Epidemiologische Situation Woche (\d+)', content)
    # The first dd.mm.yyyy date on the page is the end of the reporting period.
    end_date = sc.date_from_text(sc.find(r'(\d+\.\d+\.\d{4})', content))
    start_date = end_date - datetime.timedelta(days=7)
    return week, start_date.year
from io import StringIO import scrape_common as sc import scrape_gl_common as sgc def split_whitespace(text): if not text: return [] text = re.sub(r'\s\s+', ' ', text) return text.split(' ') # weekly pdf pdf_url = sgc.get_gl_pdf_url() pdf = sc.download_content(pdf_url, silent=True) content = sc.pdftotext(pdf, page=1) pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content) pdf_date = sc.date_from_text(pdf_date) number_of_tests = sc.find(r'PCR-Tests\sKanton Glarus\s(\d+\'?\d+)\s', content).replace('\'', '') is_first = True if number_of_tests: dd = sc.DayData(canton='GL', url=pdf_url) dd.datetime = pdf_date dd.tested = number_of_tests is_first = False print(dd) content = sc.pdftotext(pdf, page=2, layout=True) dates = split_whitespace(
import re import scrape_common as sc import scrape_vs_common as svc # get the latest weekly PDF url = svc.get_vs_latest_weekly_pdf_url() # fetch the PDF pdf = sc.download_content(url, silent=True) week, year = svc.get_vs_weekly_general_data(pdf) # second last page contains the district data page = int(sc.pdfinfo(pdf)) - 2 content = sc.pdftotext(pdf, page=page, layout=True, rect=[0, 403, 420, 50], fixed=2) # strip everything including the "Anzahl Faelle" column + values def strip_left_number(content): lines = content.split('\n') pos = None for line in lines: res = re.search(r'\s+(\d+) ', line) if res is not None: if pos is None: pos = res.end() else: pos = min(pos, res.end()) new_content = [] for line in lines: new_content.append(line[pos:])
import re

import scrape_common as sc
import scrape_vs_common as svc

# Walk through every published weekly PDF and extract the test statistics.
for url in svc.get_vs_weekly_pdf_urls():
    td = sc.TestData(canton='VS', url=url)
    pdf = sc.download_content(url, silent=True)
    td.week, td.year = svc.get_vs_weekly_general_data(pdf)

    # The statistics table moved between pages over time, so scan both.
    for page in range(4, 6):
        content = sc.pdftotext(pdf, page=page, raw=True)
        # Strip curly-quote thousands separators inside numbers (1‘234 -> 1234).
        content = re.sub(r'(\d)\‘(\d)', r'\1\2', content)
        content = re.sub(r'(\d)\’(\d)', r'\1\2', content)

        # Pull each statistic out of the page text with its own pattern.
        extractors = {
            'total_tests': r'Alle\s+Arten\s+von\s+Tests\s+(\d+)',
            'positivity_rate': r'Alle\s+Arten\s+von\s+Tests\s+\d+\s+(\d+\.\d+)%',
            'pcr_total_tests': r'PCR\s+(\d+)',
            'pcr_positivity_rate': r'PCR\s+\d+\s+(\d+\.\d+)%',
            'ag_total_tests': r'Antigentests\s+(\d+)',
            'ag_positivity_rate': r'Antigentests\s+\d+\s+(\d+\.\d+)%',
        }
        for attr, pattern in extractors.items():
            setattr(td, attr, sc.find(pattern, content))

        # Only report pages that actually contained the totals.
        if td.total_tests:
            print(td)