Exemplo n.º 1
0
# Fetch the main page JSON for the given content id.
url = f'https://sh.ch/CMS/content.jsp?contentid={content_id}&language=DE'
d = sc.jsondownload(url, silent=True)

# Collect the `contentid` attribute of every anchor whose text mentions
# "Lagebericht". `string=` is the current name of Beautiful Soup's
# deprecated `text=` filter argument.
soup = BeautifulSoup(d['data_post_content'], 'html.parser')
links = soup.find_all('a', string=re.compile(r'Lagebericht'))
content_ids = [link.get('contentid') for link in links]

# Matches a ’ (U+2019) sitting between two digits; compiled once because the
# same pattern is applied to every PDF below.
digit_separator = re.compile(r'(\d)\’(\d)')

# Fetch each report PDF and parse it.
for content_id in content_ids:
    url = f'https://sh.ch/CMS/content.jsp?contentid={content_id}&language=DE'
    pdf_url = shc.get_sh_url_from_json(url)
    pdf = sc.download_content(pdf_url, silent=True)

    td = sc.TestData(canton='SH', url=pdf_url)

    # The report date is searched for in the text of page 1.
    content = sc.pdftotext(pdf, page=1)
    date = sc.find(r'(\d+\..*\d{4})', content)
    date = sc.date_from_text(date)
    # The start of the reporting period is not explicitly stated in the
    # report, so it is taken to be 7 days before the report date.
    start_date = date - datetime.timedelta(days=7)
    td.start_date = start_date.isoformat()
    td.end_date = date.isoformat()

    content = sc.pdftotext(pdf, page=13)
    # Drop the ’ separator inside numbers to simplify pattern matching.
    content = digit_separator.sub(r'\1\2', content)
Exemplo n.º 2
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import datetime
import scrape_common as sc
import scrape_sh_common as shc

# Public landing page of the Schaffhausen health office; it is only printed
# as the download source further down.
main_url = 'https://sh.ch/CMS/Webseite/Kanton-Schaffhausen/Beh-rde/Verwaltung/Departement-des-Innern/Gesundheitsamt-3209198-DE.html'

# The landing page loads this content via JavaScript, so the CMS JSON
# endpoint is queried directly to obtain the spreadsheet URL.
# NOTE(review): presumably get_sh_url_from_json extracts the file link from
# the JSON response - confirm in scrape_sh_common.
json_url = 'https://sh.ch/CMS/content.jsp?contentid=3666465&language=DE'
xls_url = shc.get_sh_url_from_json(json_url)
xls = sc.xlsdownload(xls_url, silent=True)

# The first sheet row (index 0) is passed as the header row.
rows = sc.parse_xls(xls, header_row=0)
is_first = True
for row in rows:
    if not isinstance(row['Datum'], datetime.datetime):
        continue
    if not (row['Positiv']
            or row.search(r'Hospitalisation isoliert\s+bestätigt.*$') or
            row.search(r'Hospitalisiert.*Intensiv.*$') or row['Verstorben']):
        continue

    if not is_first:
        print('-' * 10)
    is_first = False

    print('SH')
    sc.timestamp()
    print('Downloading:', main_url)
    if isinstance(row['Uhrzeit'], datetime.datetime):