def _row_from_marker(txt, marker, start):
    """Return ``(pos, row)`` for the first *marker* found at or after *start*.

    ``row`` is the text from the marker up to (not including) the next
    newline, with every run of whitespace collapsed to a single space.

    Raises:
        AssertionError: If the end of the row is not located after its start.
    """
    pos = txt.find(marker, start)
    end = txt.find('\n', pos)
    assert end > pos
    return pos, re.sub(r'\s+', r' ', txt[pos:end])


def parse_weekly_data(filename):
    """Print one CSV row with the weekly PCR and antigen test figures.

    The text comes from ``c.pdf_to_text(filename)``. The printed columns are
    ``year,week,tot_tests,positivity_rate,tot_antigen_tests,
    antigen_positivity_rate,filename``. The test counts stay empty strings
    when 'Gemeldete Tests' is not found, and the rates stay empty strings
    when neither positivity heading is found.

    Args:
        filename: Passed to ``c.pdf_to_text`` and echoed as the last column.

    Raises:
        AssertionError: If the end of an expected row cannot be located.
    """
    txt = c.pdf_to_text(filename)
    # Drop a single whitespace character between two digits
    # ('937 488' -> '937488') so each number becomes one token.
    txt = re.sub(r'(\d)\s(\d)', r'\1\2', txt)

    year = c.search(r'Stand:\s\d+\.\d+\.(\d{4})', txt)
    # NOTE(review): int() fails here if c.search finds no week number.
    week = int(c.search(r'Liechtenstein - Woche (\d+) ', txt))

    tot_tests = ''
    tot_antigen_tests = ''
    pcr_pos = txt.find('Gemeldete Tests')
    if pcr_pos > 0:
        # In both rows the value used is the second-to-last column.
        pcr_pos, line = _row_from_marker(txt, 'PCR', pcr_pos)
        tot_tests = c.txt_to_int(line.split(' ')[-2])

        # Antigen tests: searched from the PCR row onwards.
        pcr_pos, line = _row_from_marker(txt, 'Antigen-Schnelltests', pcr_pos)
        tot_antigen_tests = c.txt_to_int(line.split(' ')[-2])

    positivity_rate = ''
    antigen_positivity_rate = ''
    positivity_pos = txt.find('\nPositivit')
    if positivity_pos == -1:
        positivity_pos = txt.find('\nAnteil positiver Tests')
    if positivity_pos > 0:
        # In both rows the rate is the last column, possibly with a '%'.
        positivity_pos, line = _row_from_marker(txt, 'PCR', positivity_pos)
        positivity_rate = line.split(' ')[-1]
        positivity_rate = c.txt_to_float(positivity_rate.replace('%', ''))

        # Antigen tests: searched from the PCR row onwards.
        positivity_pos, line = _row_from_marker(
            txt, 'Antigen-Schnelltest', positivity_pos)
        antigen_positivity_rate = line.split(' ')[-1]
        try:
            # Convert the antigen value itself. The previous code converted
            # the already-parsed PCR rate, which always failed and left the
            # raw text in place.
            antigen_positivity_rate = c.txt_to_float(
                antigen_positivity_rate.replace('%', ''))
        except Exception:
            # Best effort: keep the raw column text if it cannot be converted.
            pass

    print(
        f'{year},{week},{tot_tests},{positivity_rate},{tot_antigen_tests},{antigen_positivity_rate},{filename}'
    )
def parse_data(filename):
    """Print one CSV row with the test and contact-tracing figures of a report.

    The text comes from ``c.pdf_to_text(filename)``. The printed columns are
    ``date,tot_tests,positivity_rate,isolated,quarantined,
    quarantined_travel,filename``. Falsy values (``None``, ``0``) are printed
    as empty fields.

    Args:
        filename: Passed to ``c.pdf_to_text`` and echoed as the last column.
    """
    txt = c.pdf_to_text(filename)

    # The timestamp is either one "Stand <date time> Uhr" phrase or separate
    # "Stand: dd.mm.20yy" and "Zeit: h:mm" fields.
    date_time = c.search(r'Stand (\d.*) Uhr', txt)
    if date_time is None:
        date = c.search(r'Stand\: (\d{2}\.\d{2}\.20\d{2})', txt)
        time = c.search(r'Zeit: (\d+:\d{2})', txt)
        if date is not None and time is not None:
            date_time = '{} {}'.format(date, time)
    # NOTE(review): date_time may still be None here; behaviour then depends
    # on c.parse_date.
    date = c.parse_date(date_time)

    tot_tests = parse_pcr_tot_tests(txt)

    # Try the known wordings of the positivity rate in turn.
    positivity_rate = c.txt_to_float(
        c.search(r'Bei (\d+)% dieser Tests fiel das Resultat positiv aus', txt))
    if positivity_rate is None:
        positivity_rate = c.txt_to_float(
            c.search(r'Positivit.tsrate( \*+| \(%\)|\*+)?\s+(\d\.?\d?)[%\s]', txt, index=2))
    if positivity_rate is None:
        positivity_rate = c.txt_to_float(
            c.search(r'Anteil positive Tests \(%\)(\d)?\s+(\d\.?\d?)[%\s]', txt, index=2))

    isolated = c.txt_to_int(
        c.search(
            r'(\d+)\s+(F.lle|Personen aufgrund einer laborbest.tigten COVID-19 Erkrankung)? in\sIsolation',
            txt, index=1))
    quarantined = c.txt_to_int(
        c.search(
            r'(\d+)\s?(in|Kontaktpersonen\sin\s.rztlich\sverordneter)? Quarant.ne',
            txt))
    quarantined_travel = None

    if isolated is None or quarantined is None:
        # Fall back to the "Total" row that follows 'Contact Tracing'.
        pos = txt.find('Contact Tracing')
        if pos > 0:
            pcr = re.compile(
                r'Total\s?(\*+|\(%\))?\s+(\d+\s?\d+|\d+)\s+(\d+\s?\d+|\d+)\s+(\d+ ?\d+|\d+)?\n'
            )
            res = pcr.search(txt, pos)
            if res is not None:
                isolated = c.txt_to_int(res[2])
                quarantined = c.txt_to_int(res[3])
                # The last column is optional in the pattern, so its group is
                # None when the row has only two numbers. Calling .strip() on
                # it unguarded raised AttributeError.
                if res[4] is not None:
                    quarantined_travel = c.txt_to_int(res[4].strip())

    print('{},{},{},{},{},{},{}'.format(date, tot_tests or '', positivity_rate or '', isolated or '', quarantined or '', quarantined_travel or '', filename))
def parse_weekly_data(filename):
    """Print a ``week,tot_tests,positivity_rate,filename`` row for one report.

    The text comes from ``c.pdf_to_text(filename)``. ``tot_tests`` and
    ``positivity_rate`` are printed as empty fields when they are falsy,
    which includes the case where their heading is not found.

    Args:
        filename: Passed to ``c.pdf_to_text`` and echoed as the last column.

    Raises:
        AssertionError: If the end of an expected row cannot be located.
    """
    text = c.pdf_to_text(filename)
    week = c.search(r'Liechtenstein - Woche (\d+) ', text)

    tot_tests = None
    heading_at = text.find('PCR-Tests')
    if heading_at > 0:
        # The figures are read from the line after the 'PCR-Tests' heading.
        row_start = text.find('\n', heading_at) + 1
        row_end = text.find('\n', row_start)
        assert row_end > row_start
        # Join digit groups split by one whitespace ('937 488' -> '937488'),
        # then collapse the remaining whitespace runs to single spaces.
        row = re.sub(r'(\d)\s(\d)', r'\1\2', text[row_start:row_end])
        columns = re.sub(r'\s+', r' ', row).split(' ')
        tot_tests = c.txt_to_int(columns[-2])

    positivity_rate = None
    rate_at = text.find('\nPositivit')
    if rate_at == -1:
        rate_at = text.find('\nAnteil positive Tests')
    if rate_at > 0:
        rate_start = rate_at + 1  # step over the newline matched above
        rate_end = text.find('\n', rate_start)
        assert rate_end > rate_start
        columns = re.sub(r'\s+', r' ', text[rate_start:rate_end]).split(' ')
        # The rate is the last column; drop '%' before conversion.
        positivity_rate = c.txt_to_float(columns[-1].replace('%', ''))

    print('{},{},{},{}'.format(week, tot_tests or '', positivity_rate or '', filename))
def parse_canton_data(canton, filename): txt = c.pdf_to_text(filename) # pylint: disable=W0105 """ Coronavirus-Krankheit-2019 (COVID-19) Eidgen<C3><B6>ssisches Departement des Innern EDI Bundesamt f<C3><BC>r Gesundheit BAG Direktionsbereich <C3><96>ffentliche Gesundheit Situationsbericht zur epidemiologischen Lage in der Schweiz und im F<C3><BC>rstentum Liechtenstein - Woche 28 (06.-12.07.2020) """ year = c.search(r'Stand:\s\d+\.\d+\.(\d{4})', txt) week = int(c.search(r'Liechtenstein - Woche (\d+)', txt)) """ Canton, tests of previous-week then current-week AG 5478 3588 808 529 1.3 1.8 AI 96 55 595 341 0.0 0.0 AR 391 249 708 451 0.5 1.2 BE 6924 4652 669 449 0.4 0.9 ... """ start = txt.find('Anzahl PCR-Tests in der Schweiz') if start == -1: start = txt.find('Anzahl durchgeführte PCR-Tests in der Schweiz') if start == -1: start = txt.find('Anzahl durchgeführte Tests in der Schweiz') if start == -1: start = txt.find('Anzahl gemeldeter Tests, Anzahl Tests pro') if start > 0: start = txt.find(r' AG ', start) else: start = 0 end = txt.find('Tabelle 4. Durchgeführte Tests nach Kalenderwoche', start) if end == -1: end = txt.find('Die Altersverteilung der', start) if end == -1: end = txt.find('Die Anzahl durchgeführter Tests', start) if end >= 0: end -= 1 if end == -1: end = txt.find('Gemeldete Tests nach Alter und Geschlecht', start) if end > start > 0 and end > start: tests_table = txt[start:end] # the numbers are sometimes separated with spaces for >1k values pcr = re.compile(r'(\d+)\s(\d+)') tests_table = pcr.sub(r'\1\2', tests_table) number_of_tests = c.txt_to_int(c.search(r'(\n\s+)?{}\s+\d+\s+(\d+)'.format(canton), tests_table, index=2)) positivity_rate = c.txt_to_float(c.search(r'(\n\s+)?{}\s+.*\s([0-9]+\.[0-9]+)\n'.format(canton), tests_table, index=2)) print(f'{year},{week},{number_of_tests},{positivity_rate},{filename}')
def parse_pcr_tot_tests(txt):
    """Return the total number of tests found in *txt*, or ``None``.

    A sentence of the form "insgesamt auf [...] <number>." is tried first.
    If that gives nothing, the line after the first line containing 'Tests'
    is checked for a "Total" / "Totale Anzahl" /
    "Total durchgeführte Tests" row.

    Args:
        txt: Report text to search.

    Returns:
        The value produced by ``c.txt_to_int`` for the matched number, or
        ``None`` when no pattern matches.
    """
    sentence_total = c.txt_to_int(
        c.search(r'insgesamt auf( .ber| rund| mehr als)? ([\d\s.]+)\.', txt, index=2))
    marker_at = txt.find('Tests')
    if sentence_total is not None or marker_at <= 0:
        return sentence_total

    # Take the line after the one containing 'Tests'.
    row_start = txt.find('\n', marker_at) + 1
    row = txt[row_start:txt.find('\n', row_start)]
    # Join digit groups split by one whitespace: '937 488' -> '937488'.
    row = re.sub(r'(\d)\s(\d)', r'\1\2', row)

    # "Total" / "Totale Anzahl" must be at the start of the row.
    found = re.match(r'(Totale Anzahl|Total)\s+\+?(\d+)\s', row)
    if found is not None:
        return c.txt_to_int(found[2])

    # Otherwise the first number after "Total durchgeführte Tests" is used.
    found = re.search(r'Total durchgef.hrte Tests\s+(\d+)\s+\+?\d+\s', row)
    if found is not None:
        return c.txt_to_int(found[1])
    return None