Пример #1
0
def scrape_ag():
    """Print infection-source records from the canton Aargau Excel workbook.

    Downloads the workbook, opens the sheet '3. Ansteckungsorte' and walks
    the rows starting at row index 56. For every non-empty count cell in the
    category columns, an ``sc.InfectionSourceData`` record is filled in and
    printed. Processing stops at the first row whose date cell is empty.
    """
    url = 'https://www.ag.ch/media/kanton_aargau/themen_1/coronavirus_1/daten_excel/Covid-19-Daten_Kanton_Aargau.xlsx'
    workbook = xlrd.open_workbook(file_contents=sc.download_content(url))
    datemode = workbook.datemode
    sheet = workbook.sheet_by_name('3. Ansteckungsorte')

    # Category columns are every second column starting at column 1. The
    # label is taken from row 1; an empty header cell falls back to the
    # spreadsheet column name.
    categories = {}
    for col_idx in range(1, sheet.ncols, 2):
        label = sheet.cell_value(1, col_idx)
        if not label:
            label = xlrd.formula.colname(col_idx)
        categories[col_idx] = str(label)

    for row_idx in range(56, sheet.nrows):
        raw_date = sheet.cell_value(row_idx, 0)
        if raw_date == '':
            # An empty date cell marks the end of the data rows.
            return
        day = xlrd.xldate_as_datetime(raw_date, datemode).date()
        for col_idx, label in categories.items():
            # or should we use total count?
            raw_count = sheet.cell_value(row_idx, col_idx)
            if raw_count == '':
                continue
            count_text = str(int(raw_count))
            record = sc.InfectionSourceData('AG', url)
            record.date = day.isoformat()
            record.source = label
            record.count = count_text
            print(record)
Пример #2
0
import csv
import re
from io import StringIO

import scrape_common as sc
import scrape_gl_common as sgc


def split_whitespace(text):
    """Split *text* on single spaces after collapsing whitespace runs.

    Every run of two or more whitespace characters is replaced by one space
    before the text is split on ``' '``. A lone whitespace character that is
    not a space (e.g. a single tab) is left in place and does not split.
    Falsy input (``None`` or an empty string) yields an empty list.
    """
    if text:
        collapsed = re.sub(r'\s\s+', ' ', text)
        return collapsed.split(' ')
    return []


# weekly pdf
# Download the weekly PDF, read the report date from page 1 and, if the
# PCR test count can be found there, print it as a DayData record.
pdf_url = sgc.get_gl_pdf_url()
pdf = sc.download_content(pdf_url, silent=True)
content = sc.pdftotext(pdf, page=1)
pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content)
pdf_date = sc.date_from_text(pdf_date)

number_of_tests = sc.find(r'PCR-Tests\sKanton Glarus\s(\d+\'?\d+)\s',
                          content)
# NOTE(review): sc.find presumably returns None when nothing matches (the
# guard below anticipates a falsy result) - confirm. Strip the thousands
# separator only when there is a match, so a missing number no longer
# raises AttributeError before the guard is reached.
if number_of_tests:
    number_of_tests = number_of_tests.replace('\'', '')
is_first = True
if number_of_tests:
    dd = sc.DayData(canton='GL', url=pdf_url)
    dd.datetime = pdf_date
    dd.tested = number_of_tests
    is_first = False
    print(dd)

# Page 2 is extracted in raw mode for the processing that follows.
content = sc.pdftotext(pdf, page=2, raw=True)