assert district in district_ids, f'District {district} is unknown!'

# start getting and parsing the data
html_url = 'https://www.besondere-lage.sites.be.ch/besondere-lage_sites/de/index/corona/index.html'
d = sc.download(html_url, silent=True)
# replace non-breaking spaces with regular spaces
d = d.replace('\xa0', ' ')
soup = BeautifulSoup(d, 'html.parser')
tbody = soup.find('table', {'summary': 'Laufend aktualisierte Zahlen zu den Corona-Erkrankungen im Kanton Bern'}).find_next('tbody')
for row in tbody.find_all('tr'):
    tds = row.find_all('td')
    date_str = sc.find(r'(\d+\.\d+\.\d+)', tds[0].text)
    date = sc.date_from_text(date_str)

    dds = {}
    # district_ids and inhabitants are keyed by the same district names,
    # so zipping their items pairs each district with its id and population
    for (district, d_id), (_, population) in zip(district_ids.items(), inhabitants.items()):
        dd = sc.DistrictData(district=district, canton='BE')
        dd.url = html_url
        dd.district_id = d_id
        dd.population = population
        dd.date = date.isoformat()
        dd.new_cases = 0
        dds[district] = dd

    content = tds[2].text.strip()
    # fix "München-<br />\nbuchsee" style line breaks
    content = re.sub(r'-\n(\w)', r'-\1', content)
    # fix <br /> without - from above, but no number on the next line...
    content = re.sub(r'\n([A-Za-z])', r' \1', content)
    for item in content.split('\n'):
        res = re.match(r'(\d+) (.*)', item)
        assert res is not None, f'Unexpected item {item} for number / city'
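# A minimal doctest-style sketch (the cell text below is hypothetical, not
# from the live page) of what the first re.sub above repairs: a district
# name broken across a <br /> line break inside the table cell.
#
#   >>> import re
#   >>> content = '12 München-\nbuchsee\n3 Bern'
#   >>> re.sub(r'-\n(\w)', r'-\1', content)
#   '12 München-buchsee\n3 Bern'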
for header in trs[0]:
    week = sc.find(r'Woche (\d+)', header.string)
    if week is not None:
        weeks.append(week)
        if int(week) > 50:
            years.append('2020')
        else:
            years.append('2021')

for tr in trs[1:]:
    tds = tr.find_all('td')
    for i in range(len(weeks)):
        district = tds[0].string
        if district in inhabitants:
            dd = sc.DistrictData(canton='FR', district=district)
            dd.url = url
            dd.week = weeks[i]
            # TODO restore once all weeks are in 2021
            # dd.year = '20' + year
            dd.year = years[i]
            dd.new_cases = tds[i + 1].string
            dd.population = inhabitants[district]
            dd.district_id = district_ids[district]
            print(dd)

# daily data from xls
xls_url, xls, main_url = get_fr_xls()
rows = sc.parse_xls(xls, header_row=0)
for row in rows:
initial_cases = {
    'Arlesheim': 528,
    'Laufen': 65,
    'Liestal': 177,
    'Sissach': 81,
    'Waldenburg': 15,
}

# order dict by key to ensure the most recent entry is last
ordered_rows = OrderedDict(sorted(rows.items()))

for district, district_id in district_ids.items():
    last_total_cases_val = initial_cases[district]
    if district == 'Arlesheim':
        # 2020-05-31 is 527
        last_total_cases_val = 527
    for row_date, row in ordered_rows.items():
        dd = sc.DistrictData(canton='BL', district=district)
        dd.district_id = district_id
        dd.population = population[district]
        dd.url = main_url
        dd.date = row['date']
        dd.total_cases = row[district] + initial_cases[district]
        dd.new_cases = dd.total_cases - last_total_cases_val
        last_total_cases_val = dd.total_cases
        print(dd)
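# Hypothetical walk-through of the running-total bookkeeping above: the
# cumulative count is seeded from initial_cases, and new_cases is the
# difference between consecutive cumulative values (numbers invented).
#
#   >>> last = 528                  # initial_cases['Arlesheim']
#   >>> for total in (530, 534):    # successive dd.total_cases values
#   ...     print(total - last)     # dd.new_cases
#   ...     last = total
#   2
#   4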
import csv
from io import StringIO

import requests

import scrape_common as sc

# perma link to TG COVID dataset on opendata.swiss
r = requests.get(
    'https://opendata.swiss/api/3/action/ogdch_dataset_by_identifier',
    params={'identifier': 'gesundheit_04-2020_stat@kanton-thurgau'})
dataset = r.json()['result']
resource = next(
    res for res in dataset['resources']
    if res['name']['de'] == 'COVID19 Fallzahlen Kanton Thurgau auf Ebene Bezirk')
assert resource['download_url'], "Download URL not found"

d_csv = sc.download(resource['download_url'], silent=True, encoding='latin1')
reader = csv.DictReader(StringIO(d_csv), delimiter=';')
for row in reader:
    dd = sc.DistrictData(canton='TG')
    dd.district_id = row['DistrictId']
    dd.district = row['District']
    dd.population = row['Population']
    dd.week = row['Week']
    dd.year = row['Year']
    dd.new_cases = row['NewConfCases']
    dd.url = resource['download_url']
    print(dd)
population = [
    4440,
    10930,
    26910,
    28650,
    12360,
    49230,
    10860,
    47750,
    28910,
    47980,
    15260,
    13830,
    46840,
]

assert len(district_values) == 13, f'expected 13 district values, but got {len(district_values)} for {url}'

for i, value in enumerate(district_values):
    dd = sc.DistrictData(canton='VS', district=districts[i])
    dd.url = url
    dd.district_id = district_ids[i]
    dd.population = population[i]
    dd.week = week
    dd.year = year
    dd.new_cases = value
    print(dd)
    241: 'Jura bernois',
    242: 'Biel/Bienne',
    243: 'Seeland',
    244: 'Oberaargau',
    245: 'Emmental',
    246: 'Bern-Mittelland',
    247: 'Thun',
    248: 'Obersimmental-Saanen',
    249: 'Frutigen-Niedersimmental',
    250: 'Interlaken-Oberhasli',
}

url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit'
csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/7_d_inzidenz_verwaltungskreis.csv'
d = sc.download(csv_url, silent=True)
reader = csv.DictReader(StringIO(d), delimiter=',')
for row in reader:
    district_id = int(row['bfs_nummer'])
    dd = sc.DistrictData(district=district_ids[district_id], canton='BE')
    dd.url = url
    dd.district_id = district_id
    dd.population = row['einwohnerzahl']
    date = sc.date_from_text(row['datum'])
    week = date.isocalendar()[1]
    dd.week = week
    dd.year = date.year
    # convert the 7-day incidence per 100'000 inhabitants back to an
    # absolute number of new cases
    dd.new_cases = round(
        float(row['7_d_inzidenz']) / 100e3 * int(row['einwohnerzahl']))
    print(dd)
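# Worked example of the incidence conversion (values are made up): a 7-day
# incidence of 150 per 100'000 inhabitants in a district of 50'000 people
# corresponds to 75 absolute cases over that week.
#
#   >>> round(150.0 / 100e3 * 50000)
#   75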
    'Rheintal': 1723,
    'Werdenberg': 1724,
    'Sarganserland': 1725,
    'See-Gaster': 1726,
    'Toggenburg': 1727,
    'Wil': 1728,
}

url = 'https://www.sg.ch/ueber-den-kanton-st-gallen/statistik/covid-19/_jcr_content/Par/sgch_downloadlist/DownloadListPar/sgch_download.ocFile/KantonSG_C19-Faelle_download.csv'
d = sc.download(url, silent=True)
# strip the "header" / description lines
d = "\n".join(d.split("\n")[5:])
reader = csv.DictReader(StringIO(d), delimiter=';')
for row in reader:
    week = sc.find(r'W(\d+)', row['Kalenderwoche'])
    date = sc.date_from_text(row['Falldatum'])
    for key, value in inhabitants.items():
        dd = sc.DistrictData(canton='SG', district=key)
        dd.url = url
        dd.week = week
        dd.year = date.year
        dd.date = date.isoformat()
        dd.district_id = district_ids[key]
        dd.new_cases = row['Wahlkreis ' + key]
        dd.total_cases = row['Wahlkreis ' + key + ' (kumuliert)']
        dd.population = value
        print(dd)
    'Plessur': 1848,
    'Prättigau/Davos': 1849,
    'Surselva': 1850,
    'Viamala': 1851,
}

url = 'https://services1.arcgis.com/YAuo6vcW85VPu7OE/arcgis/rest/services/Fallzahlen_Pro_Region/FeatureServer/0/query?f=json&where=Datum%3E%3Dtimestamp%20%272020-02-01%2000%3A00%3A00%27&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=Region%20asc&resultOffset=0&resultRecordCount=10000&resultType=standard&cacheHint=true'
resp = requests.get(url=url)
json_data = resp.json()

print(sc.DistrictData.header())
for feature in json_data['features']:
    element = feature['attributes']
    dd = sc.DistrictData(canton='GR', district=element['Region'])
    dd.url = url
    date = datetime.datetime.utcfromtimestamp(element['Datum'] / 1000)
    dd.date = date.date().isoformat()
    dd.total_cases = element['Faelle__kumuliert_']
    dd.new_cases = element['Neue_Faelle']
    dd.total_deceased = element['Verstorbene__kumuliert_']
    dd.new_deceased = element['Verstorbene']
    if dd.district in inhabitants:
        dd.population = inhabitants[dd.district]
    if dd.district in district_ids:
        dd.district_id = district_ids[dd.district]
    print(dd)
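# The ArcGIS feed delivers `Datum` as epoch milliseconds, hence the division
# by 1000 above. Quick sanity check (the timestamp is an arbitrary example):
#
#   >>> import datetime
#   >>> datetime.datetime.utcfromtimestamp(1612137600000 / 1000).date().isoformat()
#   '2021-02-01'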
import os
import sqlite3
import sys

import db_common as dc
import scrape_common as sc

__location__ = dc.get_location()

input_failures = 0

try:
    DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
    conn = sqlite3.connect(DATABASE_NAME)

    i = 0
    for line in sys.stdin:
        dd = sc.DistrictData()
        if dd.parse(line.strip()):
            c = conn.cursor()
            try:
                print(dd)
                c.execute(
                    '''
                    INSERT INTO data (
                        DistrictId,
                        District,
                        Canton,
                        Date,
                        Week,
                        Year,
                        Population,
#cv2.waitKey(0)

custom_config = '--oem 3 --psm 6'
text_in_img = pytesseract.image_to_string(img, config=custom_config)

# delete the temp img file
os.remove(path)


def parse_line(line):
    # Tesseract tends to misread some digits as letters in this table;
    # translate the usual suspects back (O->0, B->8, F->7, T->7)
    in_str = "OBFT"
    out_str = "0877"
    tab = str.maketrans(in_str, out_str)
    match = re.match(r'^(.*)\s+(?:[_-]\s+)?(\S+)\s+(\S+)\s+(\S+)$', line)
    if match:
        return (int(match[3].replace("'", "").translate(tab)),
                int(match[4].translate(tab)))
    return (None, None)


for name, config in districts.items():
    for line in text_in_img.split('\n'):
        dd = sc.DistrictData(canton='AG', district=name)
        dd.district_id = config['district_id']
        dd.url = data_url
        if re.search(config['pattern'], line, flags=re.I):
            population, total_cases = parse_line(line)
            assert population == config['population'], f"Population number for {name} does not match, {population} != {config['population']}"
            dd.date = img_date.date().isoformat()
            dd.population = population
            dd.total_cases = total_cases
            break
    assert dd, f"No data found for district {name}, Text: {text_in_img}"
    print(dd)
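# Sketch of parse_line() on a hypothetical OCR'd row (the real line layout
# comes from the canton's published image): the last two whitespace-separated
# tokens are population and total cases, with misread letters mapped back to
# digits before the thousands apostrophe is stripped.
#
#   >>> parse_line("Aarau 1.5 78'049 1O34")
#   (78049, 1034)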