def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Extract per-area cumulative case counts from a daily PDF report.

    Args:
        date: ISO date string stamped onto every output row.
        country: "Northern Ireland" or "Wales"; any other value returns None.
        local_pdf_file: path to the downloaded PDF.

    Returns:
        A list of rows, header row first, with columns
        [Date, Country, AreaCode, Area, TotalCases] — or None when no
        parsable table was found on any page.
    """
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table()
                if table[0][0] == "Local Government District":
                    output_rows = [[
                        "Date", "Country", "AreaCode", "Area", "TotalCases"
                    ]]
                    for table_row in table[1:]:
                        # Skip the summary row.
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        # The PDF uses a non-standard ordering of this
                        # district's name; normalize to the official one so
                        # the code lookup succeeds.
                        area = area.replace("Ards and North Down", "North Down and Ards")
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    return output_rows
            except IndexError:
                pass  # no table on page
    elif country == "Wales":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table(
                    table_settings={
                        # use text alignment since the table doesn't have lines
                        "vertical_strategy": "text",
                        "horizontal_strategy": "text"
                    })
                found_start = False
                output_rows = [[
                    "Date", "Country", "AreaCode", "Area", "TotalCases"
                ]]
                for table_row in table:
                    # The first data row is the Aneurin Bevan health board.
                    if table_row[0] is not None and table_row[0].startswith(
                            "Aneurin"):
                        found_start = True
                    if found_start:
                        area = (
                            normalize_whitespace(table_row[2])
                            .replace("Anglesey", "Isle of Anglesey")
                            # BUG FIX: these replacements previously mapped
                            # "ff"->"ff" and "fi"->"fi" (no-ops). The PDF text
                            # actually contains the typographic ligature
                            # characters, which must be expanded to ASCII.
                            .replace("\ufb00", "ff")   # ﬀ ligature
                            .replace("\ufb01", "fi"))  # ﬁ ligature
                        if area.startswith("Wales total"):
                            continue
                        area_code = lookup_local_authority_code(area)
                        cases = normalize_int(table_row[4])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                        # Last row of interest; stop scanning this table.
                        if table_row[2] is not None and normalize_whitespace(
                                table_row[2]) == 'Resident outside Wales':
                            break
                return convert_wales_la_to_hb(date, country, output_rows)
            except IndexError:
                pass  # no table on page
    return None
# Single PowerBI endpoint used by all Northern Ireland queries.
_NI_QUERY_URL = "https://wabi-north-europe-api.analysis.windows.net/public/reports/querydata?synchronous=true"


def _ni_query(name, headers, use_local):
    """POST the saved PowerBI request `name`, or read the cached response when use_local."""
    request_json = read_json("data/raw/ni/request-{}.json".format(name))
    if use_local:
        file = "data/raw/ni/response-{}.json".format(name)
    else:
        file = _NI_QUERY_URL
    return read_json_post(file, headers, request_json)


def _ni_timestamp_to_date(millis):
    """Convert a PowerBI epoch-milliseconds timestamp to a YYYY-MM-DD string."""
    return datetime.datetime.fromtimestamp(millis / 1000).strftime('%Y-%m-%d')


def _crawl_ni_tests(headers, use_local):
    """Fetch cumulative tests / confirmed cases and save them as indicators."""
    json_data = _ni_query("cumulative-tests", headers, use_local)
    tests = json_data["results"][0]["result"]["data"]["dsr"]["DS"][0]["PH"][0]["DM0"]
    tests = {_ni_timestamp_to_date(elt["C"][0]): elt["C"][1:] for elt in tests}
    df = pd.DataFrame.from_dict(tests, orient='index', columns=["Tests", "ConfirmedCases"])
    df["Date"] = df.index
    # Forward-fill missing values from the previous day
    # (equivalent to the deprecated fillna(method="ffill")).
    df = df.ffill()
    save_indicators_df_to_sqlite(df, "Northern Ireland", "Tests")
    save_indicators_df_to_sqlite(df, "Northern Ireland", "ConfirmedCases")


def _crawl_ni_deaths(headers, use_local):
    """Fetch cumulative deaths and save them as an indicator."""
    json_data = _ni_query("cumulative-deaths", headers, use_local)
    deaths = json_data["results"][0]["result"]["data"]["dsr"]["DS"][0]["PH"][0]["DM0"]
    deaths_dict = {}
    for idx, elt in enumerate(deaths):
        date = _ni_timestamp_to_date(elt["C"][0])
        if len(elt["C"]) == 1 and elt.get("R", None) == 2:
            # "R": 2 appears to mean "repeat previous value" — TODO confirm
            # against the PowerBI response format; reuse the prior count.
            value = [deaths[idx - 1]["C"][1]]
        else:
            value = [elt["C"][1]]
        deaths_dict[date] = value
    df = pd.DataFrame.from_dict(deaths_dict, orient='index', columns=["Deaths"])
    df["Date"] = df.index
    save_indicators_df_to_sqlite(df, "Northern Ireland", "Deaths")


def _crawl_ni_area_cases(headers, use_local):
    """Fetch per-district case totals and save them as a cases table."""
    json_data = _ni_query("area-cases", headers, use_local)
    area_cases = json_data["results"][0]["result"]["data"]["dsr"]["DS"][0]["PH"][1]["DM1"]
    area_cases = {elt["C"][0]: [elt["C"][2]] for elt in area_cases}
    df = pd.DataFrame.from_dict(area_cases, orient='index', columns=["TotalCases"])
    df["Area"] = df.index
    df["AreaCode"] = df["Area"].apply(lookup_local_government_district_code)
    df["Country"] = "Northern Ireland"
    # Report date comes from the response's ISO timestamp, e.g. "2020-04-01T...".
    df["Date"] = json_data["results"][0]["result"]["data"]["timestamp"].split("T")[0]
    df = df[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    save_cases_df_to_sqlite(df, "Northern Ireland", delete_old=False)


def crawl_ni(use_local=False):
    """Crawl Northern Ireland COVID-19 data from the public PowerBI API.

    Fetches cumulative tests/confirmed cases, cumulative deaths, and
    per-local-government-district case counts, persisting each to sqlite.

    Args:
        use_local: when True, read previously saved JSON responses from
            data/raw/ni/ instead of hitting the network.
    """
    headers = {"X-PowerBI-ResourceKey": "df16636e-99fe-4801-a5a1-20466a39f7bf"}
    _crawl_ni_tests(headers, use_local)
    _crawl_ni_deaths(headers, use_local)
    _crawl_ni_area_cases(headers, use_local)
def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Extract per-district cumulative case counts from a daily NI PDF report.

    Args:
        date: ISO date string stamped onto every output row.
        country: only "Northern Ireland" is handled; other values return None.
        local_pdf_file: path to the downloaded PDF.

    Returns:
        A list of rows, header row first, with columns
        [Date, Country, AreaCode, Area, TotalCases] — or None when no
        parsable table was found on any page.
    """
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table()
                if table[0][0] == "Local Government District":
                    output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
                    for table_row in table[1:]:
                        # Skip the summary row.
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        # Consistency fix: the other parse_daily_areas_pdf in
                        # this file normalizes this district's non-standard
                        # name ordering before the code lookup; do the same
                        # here so the lookup succeeds.
                        area = area.replace("Ards and North Down", "North Down and Ards")
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    return output_rows
            except IndexError:
                pass  # no table on page
    return None
def test_lookup_local_government_district_code():
    """A known district resolves to its ONS code; unknown names yield ""."""
    expected_codes = {
        "Antrim and Newtownabbey": "N09000001",
        "Bogus": "",
    }
    for district, code in expected_codes.items():
        assert lookup_local_government_district_code(district) == code