Пример #1
def _marker_line(txt, marker, start):
    """Return ``(line, pos)`` for the next *marker* found from *start* on.

    *pos* is the index of *marker* in *txt*. *line* is the text from *pos*
    up to (not including) the next newline, with every run of whitespace
    collapsed to a single space.

    Raises:
        AssertionError: if no newline is found after the marker position.
    """
    pos = txt.find(marker, start)
    end = txt.find('\n', pos)
    assert end > pos
    return re.sub(r'\s+', ' ', txt[pos:end]), pos


def parse_weekly_data(filename):
    """Parse weekly PCR and antigen test figures from a report PDF.

    The PDF is converted to text with ``c.pdf_to_text``. The function then
    looks up the report year and week, the number of PCR and antigen tests
    (second-to-last column of the matching rows below 'Gemeldete Tests'),
    and the PCR and antigen positivity rates (last column of the matching
    rows below the positivity heading).

    Values that cannot be located stay at their ``''`` default.

    NOTE(review): this excerpt ends after the values are computed; the code
    that emits or returns them is not visible here, so none is added.

    Args:
        filename: path of the PDF, passed straight to ``c.pdf_to_text``.

    Raises:
        AssertionError: if a located row is not terminated by a newline.
    """
    txt = c.pdf_to_text(filename)
    # Join digits separated by one whitespace character: '937 488' -> '937488'.
    txt = re.sub(r'(\d)\s(\d)', r'\1\2', txt)

    year = c.search(r'Stand:\s\d+\.\d+\.(\d{4})', txt)
    week = int(c.search(r'Liechtenstein - Woche (\d+) ', txt))

    tot_tests = ''
    tot_antigen_tests = ''
    pos = txt.find('Gemeldete Tests')
    if pos > 0:
        # The PCR row comes first, the antigen row is searched after it.
        line, pos = _marker_line(txt, 'PCR', pos)
        tot_tests = c.txt_to_int(line.split(' ')[-2])

        line, pos = _marker_line(txt, 'Antigen-Schnelltests', pos)
        tot_antigen_tests = c.txt_to_int(line.split(' ')[-2])

    positivity_rate = ''
    antigen_positivity_rate = ''
    pos = txt.find('\nPositivit')
    if pos == -1:
        # Alternative heading for the same table.
        pos = txt.find('\nAnteil positiver Tests')
    if pos > 0:
        line, pos = _marker_line(txt, 'PCR', pos)
        positivity_rate = c.txt_to_float(
            line.split(' ')[-1].replace('%', ''))

        line, pos = _marker_line(txt, 'Antigen-Schnelltest', pos)
        # Convert the antigen token itself; the PCR rate above has already
        # been converted and is no longer a string.
        antigen_positivity_rate = c.txt_to_float(
            line.split(' ')[-1].replace('%', ''))

Пример #2
def parse_data(filename):
    # Parse one report PDF and print a single comma-separated line made of
    # the report date, total tests, positivity rate, isolated count,
    # quarantined count and travel-quarantine count.
    #
    # NOTE(review): several statements in this excerpt are cut off (closing
    # arguments and parentheses are missing), so it is not valid Python as
    # shown. Restore the missing parts from the original source before use.
    txt = c.pdf_to_text(filename)
    # Preferred form: date and time together ('Stand <date> ... Uhr').
    date_time = c.search(r'Stand (\d.*) Uhr', txt)
    if date_time is None:
        # Fallback: separate 'Stand: dd.mm.20yy' and 'Zeit: h:mm' fields.
        date = c.search(r'Stand\: (\d{2}\.\d{2}\.20\d{2})', txt)
        time = c.search(r'Zeit: (\d+:\d{2})', txt)
        if date is not None and time is not None:
            date_time = '{} {}'.format(date, time)
    # date_time is still None here when neither form matched.
    date = c.parse_date(date_time)

    tot_tests = parse_pcr_tot_tests(txt)

    # The positivity rate is looked up with three alternative phrasings,
    # tried in order until one yields a value.
    # NOTE(review): the c.search(...) calls below are truncated; presumably
    # each continued with the text argument (and a group index) - confirm.
    positivity_rate = c.txt_to_float(
        c.search(r'Bei (\d+)% dieser Tests fiel das Resultat positiv aus',
    if positivity_rate is None:
        positivity_rate = c.txt_to_float(
            c.search(r'Positivit.tsrate( \*+| \(%\)|\*+)?\s+(\d\.?\d?)[%\s]',
    if positivity_rate is None:
        positivity_rate = c.txt_to_float(
            c.search(r'Anteil positive Tests \(%\)(\d)?\s+(\d\.?\d?)[%\s]',

    # NOTE(review): the patterns below are passed straight to c.txt_to_int;
    # a wrapping c.search(..., txt) call seems to have been lost - confirm.
    isolated = c.txt_to_int(
            r'(\d+)\s+(F.lle|Personen aufgrund einer laborbest.tigten COVID-19 Erkrankung)? in\sIsolation',
    quarantined = c.txt_to_int(
            r'(\d+)\s?(in|Kontaktpersonen\sin\s.rztlich\sverordneter)? Quarant.ne',
    quarantined_travel = None
    if isolated is None or quarantined is None:
        # Fallback: read the values from the 'Total' row that follows the
        # 'Contact Tracing' heading.
        pos = txt.find('Contact Tracing')
        if pos > 0:
            # Groups 2-4 are the three number columns; each may contain one
            # whitespace character inside the number. Group 4 is optional.
            # NOTE(review): the closing parenthesis of re.compile( is missing.
            pcr = re.compile(
                r'Total\s?(\*+|\(%\))?\s+(\d+\s?\d+|\d+)\s+(\d+\s?\d+|\d+)\s+(\d+ ?\d+|\d+)?\n'
            #pcr = re.compile(r'Total\s?(\*+|\(%\))?\s+(\d+)\s+(\d+)\s+(\d+|\d+\s?\d+)?')
            res = pcr.search(txt, pos)
            if res is not None:
                isolated = c.txt_to_int(res[2])
                quarantined = c.txt_to_int(res[3])
                # NOTE(review): group 4 is optional in the pattern, so res[4]
                # can be None and .strip() would then raise AttributeError.
                quarantined_travel = c.txt_to_int(res[4].strip())

    # Missing values are printed as empty fields ('or' fallback to '').
    # NOTE(review): the format string has seven placeholders but only six
    # arguments are visible; the call is cut off after the sixth.
    print('{},{},{},{},{},{},{}'.format(date, tot_tests or '', positivity_rate
                                        or '', isolated or '', quarantined
                                        or '', quarantined_travel or '',
Пример #3
def parse_weekly_data(filename):
    # Parse the week number, the total number of PCR tests and the
    # positivity rate from a weekly report PDF and print them as one
    # comma-separated line.
    txt = c.pdf_to_text(filename)

    week = c.search(r'Liechtenstein - Woche (\d+) ', txt)

    tot_tests = None
    pcr_pos = txt.find('PCR-Tests')
    if pcr_pos > 0:
        # The value is on the line following the one that contains the
        # 'PCR-Tests' marker: skip to just past the next newline.
        pcr_pos = txt.find('\n', pcr_pos) + 1
        pcr_end_pos = txt.find('\n', pcr_pos)
        assert pcr_end_pos > pcr_pos
        line = txt[pcr_pos:pcr_end_pos]

        # Join digits separated by one whitespace character
        # ('937 488' -> '937488'), then collapse the remaining whitespace
        # runs so the line can be split into columns.
        line = re.sub(r'(\d)\s(\d)', r'\1\2', line)
        line = re.sub(r'\s+', r' ', line)
        # The total is taken from the second-to-last column.
        tot_tests = c.txt_to_int(line.split(' ')[-2])

    positivity_rate = None
    # The row starts at the beginning of a line with one of two headings.
    positivity_pos = txt.find('\nPositivit')
    if positivity_pos == -1:
        positivity_pos = txt.find('\nAnteil positive Tests')
    if positivity_pos > 0:
        # Step over the leading newline that was part of the search string.
        positivity_pos += 1
        positivity_end_pos = txt.find('\n', positivity_pos)
        assert positivity_end_pos > positivity_pos
        line = txt[positivity_pos:positivity_end_pos]
        line = re.sub(r'\s+', r' ', line)
        # The rate is the last column; drop the percent sign before parsing.
        positivity_rate = line.split(' ')[-1]
        positivity_rate = c.txt_to_float(positivity_rate.replace('%', ''))

    # Missing values are printed as empty fields ('or' fallback to '').
    # NOTE(review): the format string has four placeholders but only three
    # arguments are visible; the call is cut off in this excerpt.
    print('{},{},{},{}'.format(week, tot_tests or '', positivity_rate or '',
Пример #4
def parse_canton_data(canton, filename):
    """Extract one canton's test count and positivity rate from a report PDF.

    The PDF is converted to text with ``c.pdf_to_text``. The per-canton
    table is located between one of several known section headings and one
    of several known following headings. Within that slice the row starting
    with *canton* is searched for the second number after the canton code
    and for the last decimal number on the row.

    NOTE(review): this excerpt ends after the values are computed; the code
    that emits or returns them is not visible here, so none is added.

    Args:
        canton: canton abbreviation as it appears in the table (e.g. 'AG').
            It is inserted into a regular expression unescaped.
        filename: path of the PDF, passed straight to ``c.pdf_to_text``.
    """
    txt = c.pdf_to_text(filename)

    # Example of the header the report text starts with:
    #
    #   Coronavirus-Krankheit-2019 (COVID-19)
    #   Eidgenössisches Departement des Innern EDI
    #   Bundesamt für Gesundheit BAG
    #   Direktionsbereich Öffentliche Gesundheit
    #   Situationsbericht zur epidemiologischen Lage in der Schweiz
    #   und im Fürstentum Liechtenstein - Woche 28 (06.-12.07.2020)
    year = c.search(r'Stand:\s\d+\.\d+\.(\d{4})', txt)
    week = int(c.search(r'Liechtenstein - Woche (\d+)', txt))

    # Example rows of the table: canton, tests of previous week then
    # current week, followed by further columns:
    #
    #   AG 5478 3588 808 529 1.3 1.8
    #   AI 96 55 595 341 0.0 0.0
    #   AR 391 249 708 451 0.5 1.2
    #   BE 6924 4652 669 449 0.4 0.9
    start = txt.find('Anzahl PCR-Tests in der Schweiz')
    if start == -1:
        start = txt.find('Anzahl durchgeführte PCR-Tests in der Schweiz')
    if start == -1:
        start = txt.find('Anzahl durchgeführte Tests in der Schweiz')
    if start == -1:
        start = txt.find('Anzahl gemeldeter Tests, Anzahl Tests pro')
    if start > 0:
        # Move to the first table row (the AG row) below the heading.
        start = txt.find(r' AG ', start)
    else:
        # No heading found: search the end markers from the beginning; the
        # guard below then skips the table parsing.
        start = 0

    end = txt.find('Tabelle 4. Durchgeführte Tests nach Kalenderwoche', start)
    if end == -1:
        end = txt.find('Die Altersverteilung der', start)
    if end == -1:
        end = txt.find('Die Anzahl durchgeführter Tests', start)
        if end >= 0:
            end -= 1
    if end == -1:
        end = txt.find('Gemeldete Tests nach Alter und Geschlecht', start)

    if end > start > 0:
        tests_table = txt[start:end]
        # the numbers are sometimes separated with spaces for >1k values
        tests_table = re.sub(r'(\d+)\s(\d+)', r'\1\2', tests_table)
        # Second number after the canton code.
        number_of_tests = c.txt_to_int(
            c.search(r'(\n\s+)?{}\s+\d+\s+(\d+)'.format(canton),
                     tests_table, index=2))
        # Last decimal number on the canton's row.
        positivity_rate = c.txt_to_float(
            c.search(r'(\n\s+)?{}\s+.*\s([0-9]+\.[0-9]+)\n'.format(canton),
                     tests_table, index=2))

Пример #5
def parse_pcr_tot_tests(txt):
    """Return the total number of PCR tests found in the report text.

    Two layouts are tried in order:

    1. A sentence of the form 'insgesamt auf [qualifier] <number>.'.
    2. If that yields nothing, the line following the first line that
       contains 'Tests', which is expected to be a 'Total' /
       'Totale Anzahl' row or a 'Total durchgef.hrte Tests' row.

    Args:
        txt: the report text.

    Returns:
        The value produced by ``c.txt_to_int`` for the matched number, or
        the result of the first lookup (checked against ``None`` here) when
        neither layout matches.
    """
    # Group 1 is the optional qualifier, group 2 the number itself.
    tot_tests = c.txt_to_int(
        c.search(r'insgesamt auf( .ber| rund| mehr als)? ([\d\s.]+)\.',
                 txt, index=2))
    pcr_pos = txt.find('Tests')
    if tot_tests is None and pcr_pos > 0:
        # extract the line with Total / Totale Anzahl
        pcr_pos = txt.find('\n', pcr_pos) + 1
        pcr_end_pos = txt.find('\n', pcr_pos)
        line = txt[pcr_pos:pcr_end_pos]
        # replace whitespace between numbers '937 488' -> '937488'
        line = re.sub(r'(\d)\s(\d)', r'\1\2', line)
        # match the value at the start of the line
        res = re.match(r'(Totale Anzahl|Total)\s+\+?(\d+)\s', line)
        if res is not None:
            return c.txt_to_int(res[2])
        # alternative row: total followed by a second (optionally signed) number
        res = re.search(r'Total durchgef.hrte Tests\s+(\d+)\s+\+?\d+\s', line)
        if res is not None:
            tot_tests = c.txt_to_int(res[1])
    return tot_tests