def get_sh_url_from_json(url):
    """Resolve the download URL of the file described by a sh.ch content JSON.

    Fetches the JSON document at *url* through ``sc.jsondownload``, decodes
    the JSON text stored under its ``data_filemeta`` key, and returns
    ``https://sh.ch`` followed by the ``url`` entry of that metadata.

    Abridged sample of such a document, as observed on 2020-04-24 (fields
    not used by this function are omitted)::

        {
            data_filetype: "xlsx",
            data_file_name: "Fallzahlen Corona Kanton Schaffhausen.xlsx",
            data_publication_date: "23.04.2020",
            data_custom_publication_date_time: "09:31",
            data_filemeta: "{"uploaded":1,
                             "fileName":"d4ffb019-a2ef-4782-87be-0aafb4b43558",
                             "key":"TEMPUPLOADFILES",
                             "url":"/CMS/get/file/d4ffb019-a2ef-4782-87be-0aafb4b43558",
                             "originalname":"Fallzahlen Corona Kanton Schaffhausen.xlsx",
                             "fileid":"d4ffb019-a2ef-4782-87be-0aafb4b43558",
                             "category":"null",
                             "title":"null",
                             "filesize":12286}",
            contentid: "3666465",
            language: "DE",
            ...
        }
    """
    content = sc.jsondownload(url, silent=True)
    # `data_filemeta` is itself a JSON-encoded string nested inside the
    # document, so it needs a second decoding step.
    file_meta = json.loads(content['data_filemeta'])
    # The metadata carries a site-relative path; prefix it with the host.
    return f"https://sh.ch{file_meta['url']}"
import datetime
import re

from bs4 import BeautifulSoup

import scrape_common as sc
import scrape_sh_common as shc

# Step 1: download the health office landing page and pull the content id
# out of its source using the regex below.
url = 'https://sh.ch/CMS/Webseite/Kanton-Schaffhausen/Beh-rde/Verwaltung/Departement-des-Innern/Gesundheitsamt-3209198-DE.html'
d = sc.download(url, silent=True)
content_id = sc.find(r"var contentid = '(\d+)';", d)
assert content_id

# Step 2: request the JSON document for that content id; its
# `data_post_content` field is parsed as HTML below.
url = f'https://sh.ch/CMS/content.jsp?contentid={content_id}&language=DE'
d = sc.jsondownload(url, silent=True)

# Step 3: collect the `contentid` attribute of every anchor whose text
# matches "Lagebericht".
soup = BeautifulSoup(d['data_post_content'], 'html.parser')
links = soup.find_all('a', text=re.compile(r'Lagebericht'))
content_ids = [link.get('contentid') for link in links]

# Step 4: for each collected id, resolve the file URL, download the file and
# create the TestData record for canton SH.
for content_id in content_ids:
    url = f'https://sh.ch/CMS/content.jsp?contentid={content_id}&language=DE'
    pdf_url = shc.get_sh_url_from_json(url)
    pdf = sc.download_content(pdf_url, silent=True)

    td = sc.TestData(canton='SH', url=pdf_url)
#!/usr/bin/env python3 # -*- coding: utf-8 -*- import datetime import json import scrape_common as sc # A JavaScript content loaded from https://sh.ch/CMS/Webseite/Kanton-Schaffhausen/Beh-rde/Verwaltung/Departement-des-Innern/Gesundheitsamt-3209198-DE.html m = sc.jsondownload('https://sh.ch/CMS/content.jsp?contentid=3666465&language=DE', silent=True) # 2020-04-24 """ { data_filetype: "xlsx", data_shareInAreaPage: "[]", data_kachellabel: "Fallzahlen Corona Kanton Schaffhausen.xlsx", data_areaPage_repositoryid: "3275", data_custom_author: "Gesundheitsamt Kanton Schaffhausen", data_tagarea: "[]", data_shareInDomain: "[]", data_zielgruppen: "", data_publication_date: "23.04.2020", data_idpath: "/1752/8540/1753/1765/1755/1763/2733/2747/3275/3666465", data_custom_publication_date_date: "23.04.2020", data_shareArticleProfileId: "", data_file_name: "Fallzahlen Corona Kanton Schaffhausen.xlsx", data_author: "MWETT", data_file_copyrights: "", data_custom_publication_timed: "[]", data_published: "published", data_addmodules: "",
#!/usr/bin/env python3 import datetime import scrape_common as sc json_url = 'https://services1.arcgis.com/YAuo6vcW85VPu7OE/arcgis/rest/services/Fallzahlen_Total_Kanton/FeatureServer/0/query?where=1%3D1&objectIds=&time=&resultType=none&outFields=*&returnHiddenFields=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnDistinctValues=false&cacheHint=false&orderByFields=Eingangs_Datum&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&sqlFormat=standard&f=pjson' data = sc.jsondownload(json_url, silent=True) # 2020-04-02 """ features: [ { attributes: { Eingangs_Datum: 1582675200000, Anzahl_Fälle_total__kumuliert_: 2, Neue_Faelle: 2, Neue_aktive_Fälle: 2, Anzahl_aktive_Fälle_total: 2, Anzahl_Personen_in_Isolation: 0, Anzahl_Personen_in_Quarantäne: 0, Verstorbene: 0, Verstorbene__kumuliert_: 0, Neue_Hospitalisierungen: 0, Hospitalisiert_Total: 0, Neu_Pflege: 0, Hospitalisiert_Pflege: 0, Neu_IPS: 0, Hospialisiert_IPS: 0, Neu_IPS_beatmet: 0, Hospitalisiert_IPS_beatmet: 0, FID: 1