Example #1
# Assumes parse_totals and parse_tests are provided by the surrounding project module.
import glob
import re


def generate_csv():
    """Print a CSV of UK testing figures parsed from the saved HTML pages in data/raw/."""
    print(
        "Date,Country,DailyTestsPerformed,TotalTestsPerformed,DailyPeopleTested,TotalPeopleTested"
    )
    for file in sorted(
            glob.glob(
                "data/raw/coronavirus-covid-19-number-of-cases-in-uk-*.html")):
        m = re.match(r".+(\d{4}-\d{2}-\d{2})\.html", file)
        date = m.group(1)
        with open(file) as f:
            html = f.read()

        if date <= "2020-03-22":
            # older pages cannot be parsed with current parser
            continue

        if date <= "2020-04-07":
            result = parse_totals("UK", html)
            print("{},UK,,,,{}".format(date, result["Tests"]))
            continue

        result = parse_tests("UK", html)
        output_row = [
            date, "UK", result["DailyTestsPerformed"],
            result["TotalTestsPerformed"], result["DailyPeopleTested"],
            result["TotalPeopleTested"]
        ]
        print(",".join([str(val) for val in output_row]))
Example #2
# Assumes parse_totals and parse_tests are provided by the surrounding project module.
import glob
import itertools
import re


def generate_csv():
    """Print a CSV of UK testing indicators (overall and per pillar) parsed from the saved HTML pages in data/raw/."""
    indicator_tuples = list(itertools.product(
        ["", "Pillar1", "Pillar2", "Pillar4"],
        ["TestsPerformed", "PeopleTested", "Positive"],
        ["Daily", "Total"],
    ))
    indicators = [
        "{}{}{}".format(t[2], t[0], t[1]) for t in indicator_tuples
    ] + [
        "DailyPillar2InPersonRoutes",
        "DailyPillar2DeliveryRoutes",
        "TotalPillar2InPersonRoutes",
        "TotalPillar2DeliveryRoutes",
    ]
    columns = ["Date", "Country"] + indicators
    print(",".join(columns))
    for file in sorted(glob.glob("data/raw/coronavirus-covid-19-number-of-cases-in-uk-*.html")):
        m = re.match(r".+(\d{4}-\d{2}-\d{2})\.html", file)
        date = m.group(1)
        with open(file) as f:
            html = f.read()

        if date <= "2020-03-22":
            # older pages cannot be parsed with current parser
            continue

        # if date != "2020-04-28":
        #     continue

        if date <= "2020-04-07":
            totals_result = parse_totals("UK", html)
            result = { "TotalPeopleTested": totals_result["Tests"] }        
        else:
            result = parse_tests("UK", html)

        indicator_values = [result.get(indicator, "") for indicator in indicators]
        output_row = [date, "UK"] + indicator_values
        print(",".join([str(val) for val in output_row]))
Example #3
# Assumes parse_totals is provided by the surrounding project module; runs under a test runner such as pytest.
import glob
import re


def test_parse_totals_uk():
    """Check that parse_totals extracts the expected fields from every parseable saved UK page."""
    for file in sorted(glob.glob("data/raw/coronavirus-covid-19-number-of-cases-in-uk-*.html")):
        m = re.match(r".+(\d{4}-\d{2}-\d{2})\.html", file)
        date = m.group(1)
        if date <= "2020-03-22":
            # older pages cannot be parsed with current parser
            continue
        with open(file) as f:
            html = f.read()
            result = parse_totals("UK", html)
            assert result["Country"] == "UK"
            assert result["Date"] == date
            assert result["Tests"] >= 0
            assert result["ConfirmedCases"] >= 0
            assert result["Deaths"] >= 0
Example #4
# Assumes the project's helpers (get_html_url, format_country, parse_totals, parse_daily_areas,
# print_totals, save_indicators_to_sqlite, save_daily_areas_to_sqlite) and the DatasetUpdate
# enum are provided by the surrounding module.
import sys

import requests


def crawl_html(date, country, check_only):
    """Fetch (or reuse a locally cached copy of) the case-numbers page for a date and country,
    parse it, and save the results; with check_only, only report whether an update is available."""
    html_url = get_html_url(date, country)
    local_html_file = "data/raw/coronavirus-covid-19-number-of-cases-in-{}-{}.html".format(
        format_country(country), date)
    save_html_file = False
    save_html_file = False

    try:
        with open(local_html_file) as f:
            html = f.read()
        if check_only:
            return DatasetUpdate.ALREADY_UPDATED
    except FileNotFoundError:
        r = requests.get(html_url)
        html = r.text
        save_html_file = True

    results = parse_totals(country, html)

    if results is None:
        if check_only:
            return DatasetUpdate.UPDATE_AVAILABLE
        sys.stderr.write(
            "Can't find numbers. Perhaps the page format has changed?\n")
        sys.exit(1)
    elif results["Date"] != date:
        if check_only:
            return DatasetUpdate.UPDATE_NOT_AVAILABLE
        sys.stderr.write("Page is dated {}, but want {}\n".format(
            results["Date"], date))
        sys.exit(1)

    if check_only:
        return DatasetUpdate.UPDATE_AVAILABLE

    daily_areas = parse_daily_areas(date, country, html)

    print_totals(results)
    #save_indicators(results)
    save_indicators_to_sqlite(results)

    if daily_areas is not None:
        #save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)

    if save_html_file:
        with open(local_html_file, "w") as f:
            f.write(html)
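With check_only=True the function returns members of a DatasetUpdate enum rather than writing anything. The three members used above suggest a definition along these lines; this is a sketch, and the project's actual enum may differ:

from enum import Enum, auto

# Sketch of the DatasetUpdate enum implied by the return values in crawl_html.
class DatasetUpdate(Enum):
    ALREADY_UPDATED = auto()
    UPDATE_AVAILABLE = auto()
    UPDATE_NOT_AVAILABLE = auto()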