import os
import sys

import dateparser
import requests

# get_text_from_pdf, parse_totals_pdf_text, print_totals, parse_daily_areas_pdf and
# the save_* helpers are defined elsewhere in this project.


def crawl_pdf(date, country):
    if country == "Northern Ireland":
        # Bulletins are filed under a year-month directory and named with a d.m.y date.
        ym = dateparser.parse(date).strftime('%Y-%m')
        dmy = dateparser.parse(date).strftime('%d.%m.%y')
        pdf_url = "https://www.publichealth.hscni.net/sites/default/files/{}/COVID-19 Surveillance Bulletin {}.pdf".format(ym, dmy)
        local_pdf_file = "data/raw/Daily_bulletin_DoH_{}.pdf".format(date)
        # Download the bulletin only if we don't already have a local copy.
        if not os.path.exists(local_pdf_file):
            r = requests.get(pdf_url)
            with open(local_pdf_file, "wb") as f:
                f.write(r.content)
        text = get_text_from_pdf(local_pdf_file)
        results = parse_totals_pdf_text(country, text)
        if results is None:
            sys.stderr.write("Can't find numbers. Perhaps the page format has changed?\n")
            sys.exit(1)
        elif results["Date"] != date:
            sys.stderr.write("Page is dated {}, but want {}\n".format(results["Date"], date))
            sys.exit(1)
        print_totals(results)
        #save_indicators(results)
        save_indicators_to_sqlite(results)
        daily_areas = parse_daily_areas_pdf(date, country, local_pdf_file)
        if daily_areas is not None:
            save_daily_areas(date, country, daily_areas)
            save_daily_areas_to_sqlite(date, country, daily_areas)
import glob
import re

# Regression test: run the parser over every archived Northern Ireland bulletin PDF.


def test_parse_totals_pdf_text_ni():
    for file in sorted(glob.glob("data/raw/Daily_bulletin_DoH_*.pdf")):
        m = re.match(r".+(\d{4}-\d{2}-\d{2})\.pdf", file)
        date = m.group(1)
        if date <= "2020-03-24":
            # older pages cannot be parsed with current parser
            continue
        text = get_text_from_pdf(file)
        result = parse_totals_pdf_text("Northern Ireland", text)
        assert result["Country"] == "Northern Ireland"
        assert result["Date"] == date
        assert result["Tests"] >= 0
        assert result["ConfirmedCases"] >= 0
        assert result["Deaths"] >= 0
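# To exercise the parser against all archived bulletins (assuming pytest is the
# project's test runner, which is an assumption here), run:
#
#     pytest -k test_parse_totals_pdf_text_ni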
def crawl_pdf(date, country, check_only):
    # With check_only set, report whether a new bulletin is available instead of
    # downloading and parsing it.
    if country == "Northern Ireland":
        dt = dateparser.parse(date, date_formats=['%Y-%m-%d'], locales=["en-GB"])
        ym = dt.strftime('%Y-%m')
        dmy = dt.strftime('%d.%m.%y')
        # the top-level page containing links to PDFs
        html_url = "https://www.publichealth.hscni.net/publications/covid-19-surveillance-reports"
        # the PDF itself
        pdf_url = "https://www.publichealth.hscni.net/sites/default/files/{}/COVID-19 Surveillance Bulletin {}.pdf".format(ym, dmy)
        local_pdf_file = "data/raw/Daily_bulletin_DoH_{}.pdf".format(date)
        if not os.path.exists(local_pdf_file):
            # Check the publications page first: if today's PDF isn't linked yet,
            # report (or fail) rather than downloading a 404 page.
            r = requests.get(html_url)
            if "{}.pdf".format(dmy) not in r.text:
                if check_only:
                    return DatasetUpdate.UPDATE_NOT_AVAILABLE
                sys.stderr.write("Page is dated ?, but want {}\n".format(date))
                sys.exit(1)
            if check_only:
                return DatasetUpdate.UPDATE_AVAILABLE
            r = requests.get(pdf_url)
            with open(local_pdf_file, "wb") as f:
                f.write(r.content)
        if check_only:
            return DatasetUpdate.ALREADY_UPDATED
        text = get_text_from_pdf(local_pdf_file)
        results = parse_totals_pdf_text(country, text)
        if results is None:
            sys.stderr.write("Can't find numbers. Perhaps the page format has changed?\n")
            sys.exit(1)
        elif results["Date"] != date:
            sys.stderr.write("Page is dated {}, but want {}\n".format(results["Date"], date))
            sys.exit(1)
        print_totals(results)
        #save_indicators(results)
        save_indicators_to_sqlite(results)
        daily_areas = parse_daily_areas_pdf(date, country, local_pdf_file)
        if daily_areas is not None:
            save_daily_areas(date, country, daily_areas)
            save_daily_areas_to_sqlite(date, country, daily_areas)
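# crawl_pdf returns members of a DatasetUpdate enum when check_only is set. That enum
# is not shown above; the following is a minimal sketch of what it could look like,
# using only the three member names the function actually returns (the Enum base class
# and the member values are assumptions, not taken from the project):

from enum import Enum


class DatasetUpdate(Enum):
    ALREADY_UPDATED = "already_updated"            # local PDF already downloaded
    UPDATE_AVAILABLE = "update_available"          # bulletin linked on the page but not yet fetched
    UPDATE_NOT_AVAILABLE = "update_not_available"  # publications page doesn't list today's PDF yet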