def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Extract per-area daily case counts from a published PDF.

    Supports Northern Ireland (Local Government District table) and Wales
    (local-authority table, converted to health boards).

    Returns a list of rows, the first being the header
    ["Date", "Country", "AreaCode", "Area", "TotalCases"], or None when no
    parseable table is found or the country is unsupported.
    """
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            table = page.extract_table()
            # BUG FIX: extract_table() returns None when a page has no table;
            # indexing None raised TypeError, which "except IndexError" missed.
            if table is None:
                continue
            try:
                if table[0][0] == "Local Government District":
                    output_rows = [[
                        "Date", "Country", "AreaCode", "Area", "TotalCases"
                    ]]
                    for table_row in table[1:]:
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        # PDF uses the reversed form of the council name.
                        area = area.replace("Ards and North Down",
                                            "North Down and Ards")
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_rows.append([date, country, area_code, area, cases])
                    return output_rows
            except IndexError:
                pass  # malformed table on this page — try the next one
    elif country == "Wales":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            table = page.extract_table(
                table_settings={
                    # use text alignment since the table doesn't have lines
                    "vertical_strategy": "text",
                    "horizontal_strategy": "text"
                })
            # BUG FIX: guard against pages with no detectable table (None).
            if table is None:
                continue
            try:
                found_start = False
                output_rows = [[
                    "Date", "Country", "AreaCode", "Area", "TotalCases"
                ]]
                for table_row in table:
                    # Data rows start at the first health board ("Aneurin …").
                    if table_row[0] is not None and table_row[0].startswith("Aneurin"):
                        found_start = True
                    if found_start:
                        area = (
                            normalize_whitespace(table_row[2]).replace(
                                "Anglesey", "Isle of Anglesey").replace(
                                    "ff", "ff")  # fix ligatures
                            .replace("fi", "fi"))
                        if area.startswith("Wales total"):
                            continue
                        area_code = lookup_local_authority_code(area)
                        cases = normalize_int(table_row[4])
                        output_rows.append([date, country, area_code, area, cases])
                        # Last data row of the table — stop scanning.
                        if table_row[2] is not None and normalize_whitespace(
                                table_row[2]) == 'Resident outside Wales':
                            break
                return convert_wales_la_to_hb(date, country, output_rows)
            except IndexError:
                pass  # malformed table on this page — try the next one
    return None
def crawl_arcgis(date, country, check_only):
    """Download the ArcGIS data item for *country* on *date* and persist it.

    UK: reads the national indicators spreadsheet and upserts ConfirmedCases /
    Deaths rows into the `indicators` table of data/covid-19-uk.db.
    England: reads the per-county CSV and stores daily area case counts via
    save_daily_areas_to_sqlite.

    If check_only is truthy, only the availability check performed by
    download_arcgis_item runs, and its result is returned.
    """
    if country == "UK":
        item_id = "bc8ee90225644ef7a6f4dd1b13ea1d67"
        # NOTE(review): ".xslx" looks like a typo for ".xlsx", but it is only
        # a local raw-file name (written and read here) — kept for
        # backward compatibility with previously archived files.
        local_data_file = "data/raw/DailyIndicators-{}.xslx".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret
        df = pd.read_excel(local_data_file)
        print(df)
        d = df.to_dict("records")[0]
        date = d["DateVal"].strftime("%Y-%m-%d")
        # int() casts pandas/numpy scalars to plain ints so sqlite3 can bind them.
        indicator_rows = [
            (date, "UK", "ConfirmedCases", int(d["TotalUKCases"])),
            (date, "UK", "Deaths", int(d["TotalUKDeaths"])),
            (date, "England", "ConfirmedCases", int(d["EnglandCases"])),
            (date, "England", "Deaths", int(d["EnglandDeaths"])),
            # Scotland / Wales / Northern Ireland indicator inserts were
            # previously here as well but are sourced elsewhere now.
        ]
        with sqlite3.connect('data/covid-19-uk.db') as conn:
            c = conn.cursor()
            # SECURITY/robustness fix: parameterized statements instead of
            # f-string-built SQL — avoids quoting and injection problems.
            c.executemany(
                "INSERT OR REPLACE INTO indicators VALUES (?, ?, ?, ?)",
                indicator_rows,
            )
    elif country == "England":
        item_id = "b684319181f94875a6879bbc833ca3a6"
        local_data_file = "data/raw/CountyUAs_cases_table-{}.csv".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret
        df = pd.read_csv(local_data_file)
        df["Date"] = date
        df["Country"] = "England"
        df = df.rename(columns={"GSS_CD": "AreaCode", "GSS_NM": "Area"})
        df = df[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
        daily_areas = df.to_dict("split")["data"]
        # Case counts arrive as strings (possibly with odd whitespace/commas).
        for row in daily_areas:
            row[4] = normalize_int(normalize_whitespace(row[4]))
        daily_areas = [["Date", "Country", "AreaCode", "Area", "TotalCases"]
                       ] + daily_areas
        #save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)
def parse_tests(country, html):
    """Scrape the testing summary table from an HTML page.

    Returns a dict of daily/total testing indicators (merged onto the result
    of parse_totals_general), or None when the page layout is unexpected.
    """
    def _has_tests_header(tbl):
        return "Tests" in [th.text for th in tbl.findAll("th")]

    soup = BeautifulSoup(html, features="html.parser")
    candidates = [t for t in soup.find_all("table") if _has_tests_header(t)]
    if not candidates:
        print("Testing table not found")
        return None
    if len(candidates) > 1:
        print("More than one testing table found")
        return None

    rows = candidates[0].findAll("tr")
    if len(rows) != 3:
        print("Expecting 3 table rows")
        return None
    daily_cells = [td.text for td in rows[1].findAll("td")]
    total_cells = [td.text for td in rows[2].findAll("td")]

    # The report date comes from the surrounding prose, not the table.
    text = get_text_from_html(html)
    pattern_dict = {
        "Date": (r"As of (?P<Time>\d+\s*(am|pm)?) (on )?(?P<Date>.+?),",
                 date_value_parser_fn)
    }
    result = parse_totals_general(pattern_dict, country, text)
    result["DailyTestsPerformed"] = normalize_int(daily_cells[1])
    result["DailyPeopleTested"] = normalize_int(daily_cells[2])
    result["DailyPositive"] = normalize_int(daily_cells[3])
    result["TotalTestsPerformed"] = normalize_int(total_cells[1])
    result["TotalPeopleTested"] = normalize_int(total_cells[2])
    result["TotalPositive"] = normalize_int(total_cells[3])
    return result
def parse_daily_areas_json(date, country, json_data):
    """Convert the England UTLA JSON payload into daily-area rows.

    Returns [["Date","Country","AreaCode","Area","TotalCases"], ...] or None
    for unsupported countries or when an area code fails cross-validation.
    """
    if country != "England":
        return None
    rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    for area_code, entry in json_data["utlas"].items():
        area = entry["name"]["value"]
        cases = normalize_int(entry["totalCases"]["value"])
        # Cross-check the JSON's area code against our own lookup table.
        expected_code = lookup_local_authority_code(area)
        if area_code != expected_code:
            print("Area code mismatch for {}, JSON file gave {}, but lookup was {}".format(area, area_code, expected_code))
            return None
        rows.append([date, country, area_code, area, cases])
    return rows
def parse_totals_pdf(date, country, local_pdf_file):
    """Extract country-level totals (tests, cases, deaths) from a daily PDF.

    Northern Ireland: regex extraction over the PDF's raw text.
    Wales: table extraction of the cumulative-figures table.

    Returns a dict of indicators, or None when nothing could be parsed or the
    country is unsupported.
    """
    if country == "Northern Ireland":
        text = get_text_from_pdf(local_pdf_file)
        # Patterns cover the several wordings NI used over time.
        pattern_dict = {
            "Date": (r"Date generated: (?P<Date>[\d,]+/[\d,]+/[\d,]+)", date_value_parser_fn),
            "Tests": (r"Number of Individuals tested( for COVID-19| for SARS-COV2 Virus)?:? (?P<Tests>[\d,]+)", int_value_parser_fn),
            "ConfirmedCases": (r"(Number of Individuals (with confirmed|testing positive for) (COVID-19|SARS-COV2 Virus)|Cumulative number of laboratory confirmed COVID-19 cases):? (?P<ConfirmedCases>[\d,]+)", int_value_parser_fn),
            "Deaths": (r"(Total|Cumulative) number of (Trust |reported )?deaths( associated with COVID-19)?: (?P<Deaths>[\d,]+)", int_value_parser_fn),
        }
        result = parse_totals_general(pattern_dict, country, text)
        return result
    elif country == "Wales":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            table = page.extract_table(
                table_settings={
                    # use text alignment since the table doesn't have lines
                    "horizontal_strategy": "text"
                })
            # BUG FIX: extract_table() returns None when a page has no table;
            # iterating None raised TypeError, which "except IndexError" missed.
            if table is None:
                continue
            try:
                result = {"Date": date, "Country": country}
                for table_row in table:
                    # BUG FIX: also skip None label cells — calling .replace on
                    # None raised AttributeError (previously only "" was skipped).
                    if not table_row[0]:
                        continue
                    label = table_row[0].replace("\t", " ")
                    value = normalize_int(table_row[1])
                    if label == "Cumulative individuals tested":
                        result["Tests"] = value
                    elif label == "Cumulative confirmed COVID-19 case total":
                        result["ConfirmedCases"] = value
                    elif label == "Cumulative number of suspected COVID-19 deaths* reported to PHW":
                        # Get deaths from XLSX after this date
                        if date < "2020-04-29":
                            result["Deaths"] = value
                return result
            except IndexError:
                pass  # malformed table on this page — try the next one
    return None
def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Extract Northern Ireland per-district daily case counts from a PDF.

    NOTE(review): this duplicates an earlier parse_daily_areas_pdf in this
    file (the later definition wins at import time) — confirm which is live.

    Returns [["Date","Country","AreaCode","Area","TotalCases"], ...] or None.
    """
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            table = page.extract_table()
            # BUG FIX: extract_table() returns None when a page has no table;
            # indexing None raised TypeError, which "except IndexError" missed.
            if table is None:
                continue
            try:
                if table[0][0] == "Local Government District":
                    output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
                    for table_row in table[1:]:
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_rows.append([date, country, area_code, area, cases])
                    return output_rows
            except IndexError:
                pass  # malformed table on this page — try the next one
    return None
def int_value_parser_fn(value):
    """Value-parser callback for pattern_dict entries: normalize a captured
    numeric string (e.g. "1,001") to an int via normalize_int."""
    return normalize_int(value)
def test_normalize_int():
    """Unit tests for normalize_int's digit-string parsing."""
    assert normalize_int("1") == 1
    assert normalize_int("1,001") == 1001
    # BUG FIX: normalize_int parses digit strings (optionally with thousands
    # separators, as used throughout this file); it does not parse English
    # number words, so the old assertion normalize_int("seven") == 7 failed.
    assert normalize_int("7") == 7
def parse_tests(country, html):
    """Scrape testing tables from an HTML page into a flat indicator dict.

    Handles, in order: the summary testing table, the optional per-pillar
    daily/cumulative tables, and the optional pillar-2 route breakdown table.
    Returns None when any present table has an unexpected layout.
    """
    def _is_testing_table(tbl):
        return "Tests" in [th.text for th in tbl.findAll("th")]

    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    testing_tables = [t for t in tables if _is_testing_table(t)]
    if not testing_tables:
        print("Testing table not found")
        return None
    if len(testing_tables) > 1:
        print("More than one testing table found")
        return None
    summary_rows = testing_tables[0].findAll("tr")
    if len(summary_rows) != 3:
        print("Expecting 3 table rows")
        return None
    daily_cells = [td.text for td in summary_rows[1].findAll("td")]
    total_cells = [td.text for td in summary_rows[2].findAll("td")]

    # Report date comes from the surrounding prose, not the table.
    text = get_text_from_html(html)
    pattern_dict = {
        "Date": (r"As of (?P<Time>\d+\s*(am|pm)?) (on )?(?P<Date>.+?),",
                 date_value_parser_fn)
    }
    result = parse_totals_general(pattern_dict, country, text)
    result["DailyTestsPerformed"] = normalize_int(daily_cells[1])
    result["DailyPeopleTested"] = normalize_int_with_unavailable(daily_cells[2])
    result["DailyPositive"] = normalize_int(daily_cells[3])
    result["TotalTestsPerformed"] = normalize_int(total_cells[1])
    result["TotalPeopleTested"] = normalize_int_with_unavailable(total_cells[2])
    result["TotalPositive"] = normalize_int(total_cells[3])

    def _is_pillar_table(tbl):
        return "Pillar 1" in [th.text for th in tbl.findAll("th")]

    pillar_tables = [t for t in tables if _is_pillar_table(t)]
    if not pillar_tables:
        return result  # older pages have no pillar tables
    if len(pillar_tables) != 2:
        print("Expecting two pillar tables (daily and cumulative)")
        return None
    for table_num, pillar_table in enumerate(pillar_tables):
        daily_or_total = "Daily" if table_num == 0 else "Total"
        pillar_rows = pillar_table.findAll("tr")
        if len(pillar_rows) != 4:
            print("Expecting 4 table rows")
            return None
        # Header cells may be th or td; each "Pillar …" column yields one
        # indicator per data row, keyed Daily/Total + pillar + statistic.
        for i, col in enumerate(pillar_rows[0].findAll(re.compile("th|td"))):
            if not col.text.startswith("Pillar"):
                continue
            pillar = remove_whitespace(col.text)
            for row in pillar_rows[1:]:
                test_stat = normalize_whitespace(row.findAll("td")[0].text)
                if test_stat == "Tests":
                    test_stat = "TestsPerformed"
                elif test_stat == "People tested":
                    test_stat = "PeopleTested"
                indicator = "{}{}{}".format(daily_or_total, pillar, test_stat)
                str_val = row.findAll("td")[i].text
                val = "" if str_val == "-" else normalize_int_with_unavailable(str_val)
                result[indicator] = val

    def _is_pillar2_breakdown_table(tbl):
        headers = [th.text for th in tbl.findAll("th")]
        return any([header.startswith("In-person") for header in headers])

    breakdown_tables = [t for t in tables if _is_pillar2_breakdown_table(t)]
    if not breakdown_tables:
        return result  # no pillar 2 breakdown table
    if len(breakdown_tables) > 1:
        print("More than one pillar 2 breakdown table found")
        return None
    breakdown_rows = breakdown_tables[0].findAll("tr")
    if len(breakdown_rows) not in (3, 4):
        print("Expecting 3 (or 4) table rows in pillar 2 breakdown table")
        return None
    daily_cells = [td.text for td in breakdown_rows[1].findAll("td")]
    total_cells = [td.text for td in breakdown_rows[2].findAll("td")]
    result["DailyPillar2InPersonRoutes"] = normalize_int(daily_cells[1])
    result["DailyPillar2DeliveryRoutes"] = normalize_int(daily_cells[2])
    result["TotalPillar2InPersonRoutes"] = normalize_int(total_cells[1])
    result["TotalPillar2DeliveryRoutes"] = normalize_int(total_cells[2])
    return result
def normalize_int_with_unavailable(num):
    """Like normalize_int, but maps the literal string "unavailable"
    (case-insensitive) to the empty string instead of failing."""
    return "" if num.lower() == "unavailable" else normalize_int(num)
soup = BeautifulSoup(html, features="html.parser") text = soup.get_text() text = text.replace(u"\xa0", u" ") # replace non-breaking spaces with regular spaces patterns = [uk_pattern, wales_pattern, scotland_pattern, ni_pattern] for pattern in patterns: m = re.search(pattern, text) if m is not None: groups = m.groupdict() date = dateparser.parse(groups["date"]).strftime("%Y-%m-%d") country = normalize_whitespace(groups.get("country")).replace( "Scottish", "Scotland") tests = normalize_int(groups.get("tests", float("nan"))) positive_tests = normalize_int(groups["positive_tests"]) negative_tests = normalize_int( groups.get("negative_tests", float("nan"))) deaths = normalize_int(groups.get("deaths", float("nan"))) if not math.isnan(tests): print("{},{},{},{}".format(date, country, "Tests", tests)) # with open( # "data/daily/indicators/covid-19-{}-{}-tests.csv".format( # date, format_country(country) # ), # "w", # ) as f: # f.write("{},{},{},{}\n".format(date, country, "Tests", tests)) if not math.isnan(positive_tests): print("{},{},{},{}".format(date, country, "ConfirmedCases",