Example #1
def parse_v2(parses, html):
    table = fetchhelper.text_table(html.find('table'))
    ths = table[0]
    assert ('Bundesland' in ths[0])
    assert ('gesamt' in ths[-1])
    trs = table[1:]
    assert ('tigte' in trs[0][0])
    assert ('Todesf' in trs[1][0])
    assert ('Genesen' in trs[2][0])
    assert ('Hospital' in trs[3][0])
    assert ('Intensiv' in trs[4][0])
    assert ('Testungen' in trs[5][0])
    # One ParseData per data row; the names double as the output labels.
    labels = [
        'confirmed', 'deceased', 'recovered', 'hospital', 'intensivecare',
        'tests'
    ]

    areas = {
        'Bgld.': 'Burgenland',
        'Kt.': 'Kärnten',
        'Ktn.': 'Kärnten',
        'NÖ': 'Niederösterreich',
        'OÖ': 'Oberösterreich',
        'Sbg.': 'Salzburg',
        'Stmk.': 'Steiermark',
        'T': 'Tirol',
        'Vbg.': 'Vorarlberg',
        'W': 'Wien'
    }

    for i, tds in enumerate(trs):
        assert (len(ths) == len(tds))
        mo = re.search(r'Stand (\d\d\.\d\d\.\d\d\d\d), *(\d\d:\d\d) ?Uhr',
                       tds[0])
        if mo is None:
            print("cannot parse date", file=sys.stderr)
            sys.exit(1)
        parse = fetchhelper.ParseData(update, labels[i])
        datadate = parse.parsedtime = datetime.strptime(
            mo.group(1) + ' ' + mo.group(2),
            '%d.%m.%Y %H:%M').replace(tzinfo=datatz)
        with open(parse.parsedfile, 'w') as f:
            cw = csv.writer(f)
            cw.writerow(['Area', 'Date', 'Value'])
            for col in range(1, len(tds) - 1):
                area = areas[strip_footnote(ths[col])]
                count = cleannum(tds[col])
                cw.writerow([area, datadate.isoformat(), count])
        parse.deploy_timestamp()
        parses.append(parse)
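
The "Stand DD.MM.YYYY, HH:MM Uhr" extraction above can be exercised on its own. The following is a minimal sketch using only the regex and time zone from parse_v2; the helper name parse_stand and the sample cell text are made up for illustration and are not part of fetchhelper.

import re
from datetime import datetime
import dateutil.tz

datatz = dateutil.tz.gettz('Europe/Vienna')

def parse_stand(cell):
    # Pull "Stand DD.MM.YYYY, HH:MM Uhr" out of a table cell and return an aware datetime.
    mo = re.search(r'Stand (\d\d\.\d\d\.\d\d\d\d), *(\d\d:\d\d) ?Uhr', cell)
    if mo is None:
        return None
    return datetime.strptime(mo.group(1) + ' ' + mo.group(2),
                             '%d.%m.%Y %H:%M').replace(tzinfo=datatz)

print(parse_stand('Bestätigte Fälle, Stand 10.09.2020, 09:30 Uhr'))
# -> 2020-09-10 09:30:00+02:00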
Example #2
File: fetch.py Project: sth/covid-19-data
def clean_num(s):
    # Strip thousands separators and footnote stars, e.g. "1.234*" -> 1234.
    return int(s.replace('.', '').replace('*', ''))

area_map = {
    'Garching': 'Garching b. München',
    'Kirchheim': 'Kirchheim b. München',
    'Pullach': 'Pullach im Isartal',
}

def canonical_area(area):
    area = area.replace('*', '')
    return area_map.get(area, area)


title = html.find(text=re.compile('Fallzahlen nach Gemeinden')).find_parent('h2')

rows = fetchhelper.text_table(title.find_next_sibling('table'))

assert(len(rows[0]) == 2 or len(rows[0]) == 3)
if rows[0][0] == 'Kommune':
    rows = rows[1:]

with open(parse.parsedfile, 'w') as outf:
    cout = csv.writer(outf)
    header = ('Kommune', 'Timestamp', 'Confirmed')
    cout.writerow(header)
    for tds in rows:
        if not tds[0].strip() and not tds[1].strip():
            continue
        cout.writerow((canonical_area(tds[0]), datatime.isoformat(), clean_num(tds[1])))

parse.deploy_timestamp()
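
A quick check of the two helpers above; the sample inputs are made up, and the expected results follow directly from the code:

assert clean_num('1.234*') == 1234                         # thousands dot and footnote star stripped
assert canonical_area('Garching*') == 'Garching b. München'
assert canonical_area('Unterschleißheim') == 'Unterschleißheim'  # unknown names pass through unchanged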
Example #3
else:
    try:
        parse.parsedtime = datetime.datetime.strptime(
            mo.group(1), '%d. %m %Y, %H.%M').replace(tzinfo=datatz)
    except ValueError:
        parse.parsedtime = datetime.datetime.strptime(
            mo.group(1), '%d. %m %Y, %H:%M').replace(tzinfo=datatz)

tab = header.find_parent('table')
if tab is None:
    print("couldn't find table", file=sys.stderr)
    sys.exit(1)

with open(parse.parsedfile, 'w') as outf:
    cout = csv.writer(outf)
    rows = fetchhelper.text_table(tab)
    ths = rows[0]
    assert ('Landkreis' in ths[0])
    assert ('Gesamt' in ''.join(rows[-1])
            or 'Nordrhein-Westfalen' in ''.join(rows[-1]))
    rows = rows[1:-1]

    assert (len(ths) == 5)
    colnum = len(ths)
    assert ('Bestätigte' in ths[1])
    assert ('Todesfälle' in ths[2])
    assert ('Genesene' in ths[3])
    assert ('Inzidenz' in ths[4])
    cn_deaths = 2
    cn_recovered = 3
    cout.writerow(['Area', 'Date', 'EConfirmed', 'EDeaths', 'Recovered'])
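    # A hypothetical continuation (not taken from fetch.py): one way the column
    # indexes checked above could be used to emit rows, assuming a number-cleaning
    # helper like clean_num from Example #2.
    for tds in rows:
        cout.writerow([
            tds[0].strip(),                   # Area
            parse.parsedtime.isoformat(),     # Date
            clean_num(tds[1]),                # EConfirmed
            clean_num(tds[cn_deaths]),        # EDeaths
            clean_num(tds[cn_recovered]),     # Recovered
        ])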
Example #4
parse = fetchhelper.ParseData(update, 'data')
# page claims updates are at 16:30 and shortly before midnight
if datatime.time() < datetime.time(hour=16):
    parse.parsedtime = (datatime - datetime.timedelta(days=1)).replace(
        hour=23, minute=50)
elif datatime.time() < datetime.time(hour=23):
    parse.parsedtime = datatime.replace(hour=16, minute=30)
else:
    parse.parsedtime = datatime

txt = html.find(text=re.compile('Statistik nach Gemeinden'))
if not txt:
    print("iframe content doesn't look correct", file=sys.stderr)
    sys.exit(1)

rows = fetchhelper.text_table(html)

# The structure of the document is currently a mess. Let's wait and see whether it improves in the future.
for row in rows:
    if row[0] == '':
        del row[0]

headers = []
while rows[0][0] != 'Altomünster':
    headers.append(rows[0])
    del rows[0]

footers = []
while rows[-1][0] != 'Weichs':
    footers.insert(0, rows[-1])
    del rows[-1]
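
The publish-time rounding at the top of this example can be factored into a small standalone function for a sanity check. This is a sketch under the assumption stated in the comment above (updates at 16:30 and shortly before midnight); the name assumed_publish_time is made up:

import datetime

def assumed_publish_time(datatime):
    # Same rounding as above: before 16:00 use yesterday's ~23:50 update,
    # before 23:00 use today's 16:30 update, otherwise keep the fetch time.
    if datatime.time() < datetime.time(hour=16):
        return (datatime - datetime.timedelta(days=1)).replace(hour=23, minute=50)
    if datatime.time() < datetime.time(hour=23):
        return datatime.replace(hour=16, minute=30)
    return datatime

print(assumed_publish_time(datetime.datetime(2020, 9, 10, 9, 0)))
# -> 2020-09-09 23:50:00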
Example #5
datatz = dateutil.tz.gettz('Europe/Vienna')

update = fetchhelper.Updater(
    'https://www.sozialministerium.at/Informationen-zum-Coronavirus/Neuartiges-Coronavirus-(2019-nCov).html'
)
update.check_fetch(rawfile=args.rawfile)

html = BeautifulSoup(update.rawdata, 'html.parser')


def strip_footnote(s):
    return s.rstrip('*')


table = fetchhelper.text_table(html.find('table'))
# on 10.09.2020 there were additional empty cells
for tr in table:
    if tr[-1] == '':
        tr.pop()
ths = table[0]
assert ('Bundesland' in ths[0])
assert ('gesamt' in ths[-1])
trs = table[1:]
assert ('tigte' in trs[0][0])
assert ('Todesf' in trs[1][0])
assert ('Genesen' in trs[2][0])
assert ('Hospital' in trs[3][0])
assert ('Intensiv' in trs[4][0])
assert ('Testungen' in trs[5][0])
parse = [