示例#1
0
def crawl_pdf(date, country):
    """Fetch, parse and store the Northern Ireland surveillance bulletin PDF.

    Only ``country == "Northern Ireland"`` is handled; for any other value the
    function returns ``None`` without doing anything.

    The PDF is cached as ``data/raw/Daily_bulletin_DoH_<date>.pdf`` and is only
    downloaded when that file does not exist yet.

    Args:
        date: Date string. It is parsed with ``dateparser`` to build the
            remote URL, used verbatim in the local file name, and compared
            with the ``"Date"`` entry of the parsed results.
        country: Country name.

    Exits the process with status 1 (after writing to stderr) when the PDF
    cannot be downloaded, when no totals can be parsed from it, or when the
    parsed date differs from ``date``.
    """
    if country != "Northern Ireland":
        return

    # Parse once and derive both URL components from the same datetime.
    dt = dateparser.parse(date)
    ym = dt.strftime('%Y-%m')
    dmy = dt.strftime('%d.%m.%y')
    pdf_url = "https://www.publichealth.hscni.net/sites/default/files/{}/COVID-19 Surveillance Bulletin {}.pdf".format(
        ym, dmy)
    local_pdf_file = "data/raw/Daily_bulletin_DoH_{}.pdf".format(date)

    if not os.path.exists(local_pdf_file):
        r = requests.get(pdf_url)
        if not r.ok:
            # Never cache an HTTP error page under the PDF's name: the
            # existence check above would then reuse it on every later run.
            sys.stderr.write("Can't download {} (HTTP status {})\n".format(
                pdf_url, r.status_code))
            sys.exit(1)
        with open(local_pdf_file, "wb") as f:
            f.write(r.content)

    text = get_text_from_pdf(local_pdf_file)
    results = parse_totals_pdf_text(country, text)

    if results is None:
        sys.stderr.write(
            "Can't find numbers. Perhaps the page format has changed?\n")
        sys.exit(1)
    elif results["Date"] != date:
        sys.stderr.write("Page is dated {}, but want {}\n".format(
            results["Date"], date))
        sys.exit(1)

    print_totals(results)
    #save_indicators(results)
    save_indicators_to_sqlite(results)

    # Per-area figures are optional: the parser returns None when absent.
    daily_areas = parse_daily_areas_pdf(date, country, local_pdf_file)
    if daily_areas is not None:
        save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)
示例#2
0
def test_parse_daily_areas_wales_pdf():
    """Check the per-area table parsed from every stored Wales PDF.

    Every file matching ``data/raw/phw/LAs-*.pdf`` is parsed using the date
    embedded in its file name. The result must start with the expected header
    row and be followed by at least one well-formed data row.
    """
    expected_header = ['Date', 'Country', 'AreaCode', 'Area', 'TotalCases']
    dated_pdf = re.compile(r".+(\d{4}-\d{2}-\d{2})\.pdf")

    pdf_paths = glob.glob("data/raw/phw/LAs-*.pdf")
    pdf_paths.sort()
    for pdf_path in pdf_paths:
        # The file name carries the date the bulletin refers to.
        date = dated_pdf.match(pdf_path).group(1)
        table = parse_daily_areas_pdf(date, "Wales", pdf_path)

        assert len(table) > 1
        assert table[0] == expected_header

        for record in table[1:]:
            assert record[0] == date
            assert record[1] == "Wales"
            # Area code can be blank (e.g. 'To be confirmed'), but not missing
            assert record[2] is not None
            assert len(record[3]) > 0
            assert int(record[4]) >= 0
示例#3
0
def crawl_pdf(date, country, check_only):
    """Fetch, parse and store the Northern Ireland surveillance bulletin PDF.

    Only ``country == "Northern Ireland"`` is handled; for any other value the
    function returns ``None`` without doing anything.

    The PDF is cached as ``data/raw/Daily_bulletin_DoH_<date>.pdf``. When that
    file is missing, the publications index page is fetched first to see
    whether a PDF for the requested date is linked from it.

    Args:
        date: Date string, parsed with the ``%Y-%m-%d`` format hint. It is
            also used verbatim in the local file name and compared with the
            ``"Date"`` entry of the parsed results.
        country: Country name.
        check_only: When true, only report availability; nothing is
            downloaded, parsed or saved.

    Returns:
        With ``check_only`` set: ``DatasetUpdate.ALREADY_UPDATED`` if the PDF
        is already cached, ``DatasetUpdate.UPDATE_AVAILABLE`` if the index
        page links to it, ``DatasetUpdate.UPDATE_NOT_AVAILABLE`` otherwise.
        Without ``check_only``: ``None``.

    Exits the process with status 1 (after writing to stderr) when the PDF is
    not linked from the index page, cannot be downloaded, yields no totals,
    or is dated differently from ``date``.
    """
    if country != "Northern Ireland":
        return

    dt = dateparser.parse(date, date_formats=['%Y-%m-%d'], locales=["en-GB"])
    ym = dt.strftime('%Y-%m')
    dmy = dt.strftime('%d.%m.%y')
    # the top-level page containing links to PDFs
    html_url = "https://www.publichealth.hscni.net/publications/covid-19-surveillance-reports"
    # the PDF itself
    pdf_url = "https://www.publichealth.hscni.net/sites/default/files/{}/COVID-19 Surveillance Bulletin {}.pdf".format(ym, dmy)
    local_pdf_file = "data/raw/Daily_bulletin_DoH_{}.pdf".format(date)

    if not os.path.exists(local_pdf_file):
        index_response = requests.get(html_url)
        if "{}.pdf".format(dmy) not in index_response.text:
            if check_only:
                return DatasetUpdate.UPDATE_NOT_AVAILABLE
            sys.stderr.write("Page is dated ?, but want {}\n".format(date))
            sys.exit(1)

        if check_only:
            return DatasetUpdate.UPDATE_AVAILABLE

        pdf_response = requests.get(pdf_url)
        if not pdf_response.ok:
            # Never cache an HTTP error page under the PDF's name: the
            # existence check above would then treat it as already fetched.
            sys.stderr.write("Can't download {} (HTTP status {})\n".format(
                pdf_url, pdf_response.status_code))
            sys.exit(1)
        with open(local_pdf_file, "wb") as f:
            f.write(pdf_response.content)

    if check_only:
        return DatasetUpdate.ALREADY_UPDATED

    text = get_text_from_pdf(local_pdf_file)
    results = parse_totals_pdf_text(country, text)

    if results is None:
        sys.stderr.write("Can't find numbers. Perhaps the page format has changed?\n")
        sys.exit(1)
    elif results["Date"] != date:
        sys.stderr.write("Page is dated {}, but want {}\n".format(results["Date"], date))
        sys.exit(1)

    print_totals(results)
    #save_indicators(results)
    save_indicators_to_sqlite(results)

    # Per-area figures are optional: the parser returns None when absent.
    daily_areas = parse_daily_areas_pdf(date, country, local_pdf_file)
    if daily_areas is not None:
        save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)
示例#4
0
def test_parse_daily_areas_ni():
    """Check the per-area table parsed from every stored Northern Ireland PDF.

    Every file matching ``data/raw/Daily_bulletin_DoH_*.pdf`` is parsed using
    the date embedded in its file name. Bulletins dated on or before
    2020-03-25, and bulletins for which the parser returns ``None``, are
    skipped. Any other result must start with the expected header row and be
    followed by at least one well-formed data row.
    """
    expected_header = ['Date', 'Country', 'AreaCode', 'Area', 'TotalCases']
    dated_pdf = re.compile(r".+(\d{4}-\d{2}-\d{2})\.pdf")

    pdf_paths = glob.glob("data/raw/Daily_bulletin_DoH_*.pdf")
    pdf_paths.sort()
    for pdf_path in pdf_paths:
        date = dated_pdf.match(pdf_path).group(1)
        # older pages don't have case numbers
        if date <= "2020-03-25":
            continue

        table = parse_daily_areas_pdf(date, "Northern Ireland", pdf_path)
        # usually (but not always) because weekends don't have case numbers
        if table is None:
            continue

        assert len(table) > 1
        assert table[0] == expected_header

        for record in table[1:]:
            assert record[0] == date
            assert record[1] == "Northern Ireland"
            # Area code can be blank (e.g. 'Unknown'), but not missing
            assert record[2] is not None
            assert len(record[3]) > 0
            assert int(record[4]) >= 0