예제 #1
0
def test_parse_daily_areas_wales():
    """Run parse_daily_areas over the saved Wales pages it should handle.

    Every Wales page under ``data/raw`` dated after 2020-03-18 and before
    2020-04-08 must produce the standard header row followed by at least
    one area row carrying that page's date and the country "Wales".
    """
    expected_header = ['Date', 'Country', 'AreaCode', 'Area', 'TotalCases']
    page_paths = glob.glob(
        "data/raw/coronavirus-covid-19-number-of-cases-in-wales-*.html")

    for page_path in sorted(page_paths):
        date = re.match(r".+(\d{4}-\d{2}-\d{2})\.html", page_path).group(1)

        # ISO dates order correctly as plain strings. Pages up to 2020-03-18
        # cannot be parsed with the current parser, and from 2020-04-08 the
        # daily areas are no longer published on the HTML page.
        if not ("2020-03-18" < date < "2020-04-08"):
            continue

        with open(page_path) as page:
            html = page.read()

        result = parse_daily_areas(date, "Wales", html)
        assert len(result) > 1
        assert result[0] == expected_header

        for row in result[1:]:
            assert row[0] == date
            assert row[1] == "Wales"
            # Area code can be blank (e.g. 'To be confirmed'), but not None.
            assert row[2] is not None
            assert len(row[3]) > 0
            assert int(row[4]) >= 0
예제 #2
0
def test_parse_daily_areas_scotland():
    """Run parse_daily_areas over the saved Scotland pages it should handle.

    Every Scotland page under ``data/raw`` dated after 2020-03-18 must
    produce the standard header row followed by at least one area row
    carrying that page's date and the country "Scotland".
    """
    expected_header = ['Date', 'Country', 'AreaCode', 'Area', 'TotalCases']
    page_paths = glob.glob(
        "data/raw/coronavirus-covid-19-number-of-cases-in-scotland-*.html")

    for page_path in sorted(page_paths):
        date = re.match(r".+(\d{4}-\d{2}-\d{2})\.html", page_path).group(1)

        # ISO dates order correctly as plain strings; pages up to 2020-03-18
        # cannot be parsed with the current parser.
        if not (date > "2020-03-18"):
            continue

        with open(page_path) as page:
            html = page.read()

        result = parse_daily_areas(date, "Scotland", html)
        assert len(result) > 1
        assert result[0] == expected_header

        for row in result[1:]:
            assert row[0] == date
            assert row[1] == "Scotland"

            # The Golden Jubilee National Hospital row is exempt from the
            # area-code and case-count checks below.
            is_golden_jubilee = row[3] == "Golden Jubilee National Hospital"

            assert is_golden_jubilee or len(row[2]) > 0
            assert len(row[3]) > 0
            # A count may also be the literal string "NaN".
            assert is_golden_jubilee or row[4] == "NaN" or int(row[4]) >= 0
예제 #3
0
def crawl_html(date, country, check_only):
    """Load, parse and store the daily figures page for one country.

    The page is read from the cache file under ``data/raw`` when it exists;
    otherwise it is downloaded from the URL returned by ``get_html_url``.
    A freshly downloaded page is written to the cache only after it has
    been parsed and its figures saved.

    Args:
        date: Date the page is expected to carry. It is compared with the
            "Date" entry returned by ``parse_totals`` and is part of the
            cache file name.
        country: Country whose page is processed.
        check_only: When true, only report the dataset status; nothing is
            printed or saved.

    Returns:
        A ``DatasetUpdate`` member when ``check_only`` is true:
        ``ALREADY_UPDATED`` if the cache file exists,
        ``UPDATE_NOT_AVAILABLE`` if the page is dated differently from
        ``date``, and ``UPDATE_AVAILABLE`` otherwise (including when no
        totals could be parsed). ``None`` when ``check_only`` is false.

    Raises:
        SystemExit: With status 1 when ``check_only`` is false and either
            no totals could be parsed or the page date differs from
            ``date``.
    """
    html_url = get_html_url(date, country)
    local_html_file = "data/raw/coronavirus-covid-19-number-of-cases-in-{}-{}.html".format(
        format_country(country), date)
    save_html_file = False

    try:
        # Explicit UTF-8 so the cache is read the same way on every
        # platform instead of depending on the locale's default encoding.
        with open(local_html_file, encoding="utf-8") as f:
            html = f.read()
    except FileNotFoundError:
        # Without a timeout a stalled server would block the crawler
        # forever; fail after 30 seconds instead.
        r = requests.get(html_url, timeout=30)
        html = r.text
        save_html_file = True
    else:
        if check_only:
            return DatasetUpdate.ALREADY_UPDATED

    results = parse_totals(country, html)

    if results is None:
        if check_only:
            return DatasetUpdate.UPDATE_AVAILABLE
        sys.stderr.write(
            "Can't find numbers. Perhaps the page format has changed?\n")
        sys.exit(1)
    elif results["Date"] != date:
        if check_only:
            return DatasetUpdate.UPDATE_NOT_AVAILABLE
        sys.stderr.write("Page is dated {}, but want {}\n".format(
            results["Date"], date))
        sys.exit(1)

    if check_only:
        return DatasetUpdate.UPDATE_AVAILABLE

    daily_areas = parse_daily_areas(date, country, html)

    print_totals(results)
    #save_indicators(results)
    save_indicators_to_sqlite(results)

    if daily_areas is not None:
        #save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)

    if save_html_file:
        # Cache the page only once its figures have been stored, using the
        # same explicit encoding as the read above so non-ASCII characters
        # round-trip unchanged.
        with open(local_html_file, "w", encoding="utf-8") as f:
            f.write(html)