Exemplo n.º 1
0
def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Parse a per-area case-count table from a daily PDF report.

    Args:
        date: ISO date string (YYYY-MM-DD) of the report.
        country: "Northern Ireland" or "Wales"; other values return None.
        local_pdf_file: path to the downloaded PDF.

    Returns:
        A list of rows (header row first) of
        [Date, Country, AreaCode, Area, TotalCases], or None when no
        matching table is found or the country is unsupported.
    """
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table()
                if table[0][0] == "Local Government District":
                    output_rows = [[
                        "Date", "Country", "AreaCode", "Area", "TotalCases"
                    ]]
                    for table_row in table[1:]:
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        # The PDF spells it "Ards and North Down"; the code
                        # lookup expects the canonical "North Down and Ards".
                        area = area.replace("Ards and North Down",
                                            "North Down and Ards")
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    return output_rows
            except IndexError:
                pass  # no table on page
    elif country == "Wales":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table(
                    table_settings={
                        # use text alignment since the table doesn't have lines
                        "vertical_strategy": "text",
                        "horizontal_strategy": "text"
                    })
                found_start = False
                output_rows = [[
                    "Date", "Country", "AreaCode", "Area", "TotalCases"
                ]]
                for table_row in table:
                    if table_row[0] is not None and table_row[0].startswith(
                            "Aneurin"):
                        found_start = True
                    if found_start:
                        # BUG FIX: the ligature replacements were no-ops
                        # ("ff" -> "ff", "fi" -> "fi") because the Unicode
                        # ligature characters had been lost. Use the actual
                        # code points U+FB00 (ff) and U+FB01 (fi) so names
                        # extracted from the PDF normalize correctly.
                        area = (
                            normalize_whitespace(table_row[2]).replace(
                                "Anglesey", "Isle of Anglesey").replace(
                                    "\ufb00", "ff")  # fix ligatures
                            .replace("\ufb01", "fi"))
                        if area.startswith("Wales total"):
                            continue
                        area_code = lookup_local_authority_code(area)
                        cases = normalize_int(table_row[4])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    if table_row[2] is not None and normalize_whitespace(
                            table_row[2]) == 'Resident outside Wales':
                        break
                return convert_wales_la_to_hb(date, country, output_rows)
            except IndexError:
                pass  # no table on page
    return None
Exemplo n.º 2
0
def parse_daily_areas(date, country, html):
    """Parse per-area case counts from a country's daily HTML report page.

    Args:
        date: ISO date string (YYYY-MM-DD).
        country: country name; only "Scotland" and "Wales" are parsed here.
        html: raw HTML of the daily report page.

    Returns:
        A list of rows (header row first) of
        [Date, Country, AreaCode, Area, TotalCases], or None for
        unsupported countries or dates.
    """
    if country in ("Northern Ireland", "UK"):
        return None
    soup = BeautifulSoup(html, features="html.parser")
    output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    if country == "Scotland":
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text) for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            if columns[0].lower() in ("", "health board"):
                continue
            # BUG FIX: the second replace previously restarted from
            # columns[0], silently discarding the "Ayrshire & Arran"
            # normalization. Chain both replacements on the same string.
            area = (columns[0]
                    .replace("Ayrshire & Arran", "Ayrshire and Arran")
                    .replace("Eileanan Siar (Western Isles)", "Western Isles"))
            area_code = lookup_health_board_code(area)
            cases = columns[1]
            if cases == "*":  # means 5 or fewer cases
                cases = "NaN"
            else:
                cases = cases.replace("*", "").replace(",", "")
            output_row = [date, country, area_code, area, cases]
            output_rows.append(output_row)
        return output_rows
    elif country == "Wales":
        if date >= "2020-04-08":
            # daily areas no longer published on the HTML page (now published on the dashboard)
            return None
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text) for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            if columns[0].lower() in ("", "health board", "wales", "total", "wales total"):
                continue
            if is_blank(columns[-1]):
                continue
            # Strip council suffixes / normalize names to the form used by
            # the health-board code lookup table.
            area = (
                columns[0]
                .replace("City and County of Swansea", "Swansea")
                .replace("City of Cardiff", "Cardiff")
                .replace("Newport City", "Newport")
                .replace("County Borough Council", "")
                .replace("County Council", "")
                .replace("Council", "")
                .replace("Cardiff & Vale", "Cardiff and Vale")
                .replace("Cwm Taf Morgannwg", "Cwm Taf")
                .strip()
            )
            if is_blank(area):
                # Replacements emptied the cell; fall back to the raw name.
                area = columns[0]
            cases = columns[-1].replace("*", "").replace(",", "")
            output_row = [date, country, lookup_health_board_code(area), area, cases]
            output_rows.append(output_row)
        return output_rows
    return None
Exemplo n.º 3
0
def parse_daily_areas(date, country, html):
    """Extract [Date, Country, AreaCode, Area, TotalCases] rows from a
    daily HTML report for Scotland or Wales.

    Returns the list of rows (header row first), or None for countries
    whose area breakdown is not published on an HTML page.
    """
    if country in ("Northern Ireland", "UK"):
        return None
    soup = BeautifulSoup(html, features="html.parser")
    rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]

    if country == "Scotland":
        # The per-health-board table is the last one on the page.
        table = soup.find_all("table")[-1]
        for tr in table.findAll("tr"):
            cells = [normalize_whitespace(td.text) for td in tr.findAll("td")]
            if not cells:
                continue
            if cells[0].lower() in ("", "health board"):
                continue
            area = cells[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
            count = cells[1].replace("*", "")
            rows.append(
                [date, country, lookup_health_board_code(area), area, count])
        return rows

    elif country == "Wales":
        table = soup.find_all("table")[0]
        skip_labels = ("", "health board", "wales", "total", "wales total")
        # (old_text, new_text) pairs normalizing council names to the bare
        # local-authority names used by the code lookup.
        renames = (
            ("City and County of Swansea", "Swansea"),
            ("City of Cardiff", "Cardiff"),
            ("Newport City", "Newport"),
            ("County Borough Council", ""),
            ("County Council", ""),
            ("Council", ""),
        )
        for tr in table.findAll("tr"):
            cells = [normalize_whitespace(td.text) for td in tr.findAll("td")]
            if not cells or cells[0].lower() in skip_labels:
                continue
            if is_blank(cells[-1]):
                continue
            area = cells[0]
            for old, new in renames:
                area = area.replace(old, new)
            area = area.strip()
            if is_blank(area):
                # Renaming emptied the cell; keep the original label.
                area = cells[0]
            rows.append(
                [date, country, lookup_health_board_code(area), area, cells[-1]])
        return rows

    return None
Exemplo n.º 4
0
def crawl_arcgis(date, country, check_only):
    """Download ArcGIS data for a country and store it.

    For "UK": fetches the daily-indicators spreadsheet and upserts the
    UK/England totals into the `indicators` SQLite table. For "England":
    fetches the per-county CSV and saves the daily area rows to SQLite.

    Args:
        date: ISO date string used in file names (may be overridden by the
            DateVal column in the UK spreadsheet).
        country: "UK" or "England".
        check_only: if True, only checks/downloads and returns the result
            of download_arcgis_item without ingesting anything.
    """
    if country == "UK":
        item_id = "bc8ee90225644ef7a6f4dd1b13ea1d67"
        # NOTE(review): extension is ".xslx" (sic, not ".xlsx"); kept as-is
        # since other code may look the file up under this exact name.
        local_data_file = "data/raw/DailyIndicators-{}.xslx".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret

        df = pd.read_excel(local_data_file)
        print(df)

        d = df.to_dict("records")[0]
        # Prefer the date embedded in the spreadsheet over the caller's.
        date = d["DateVal"].strftime("%Y-%m-%d")

        with sqlite3.connect('data/covid-19-uk.db') as conn:
            c = conn.cursor()
            # Use parameterized queries instead of f-string interpolation so
            # spreadsheet values can never break the SQL (quoting/injection).
            insert = "INSERT OR REPLACE INTO indicators VALUES (?, ?, ?, ?)"
            c.execute(insert, (date, "UK", "ConfirmedCases", d["TotalUKCases"]))
            c.execute(insert, (date, "UK", "Deaths", d["TotalUKDeaths"]))
            c.execute(insert, (date, "England", "ConfirmedCases", d["EnglandCases"]))
            c.execute(insert, (date, "England", "Deaths", d["EnglandDeaths"]))
            # Per-nation rows intentionally disabled (kept for reference):
            # c.execute(insert, (date, "Scotland", "ConfirmedCases", d["ScotlandCases"]))
            # c.execute(insert, (date, "Scotland", "Deaths", d["ScotlandDeaths"]))
            # c.execute(insert, (date, "Wales", "ConfirmedCases", d["WalesCases"]))
            # c.execute(insert, (date, "Wales", "Deaths", d["WalesDeaths"]))
            # c.execute(insert, (date, "Northern Ireland", "ConfirmedCases", d["NICases"]))
            # c.execute(insert, (date, "Northern Ireland", "Deaths", d["NIDeaths"]))

    elif country == "England":
        item_id = "b684319181f94875a6879bbc833ca3a6"
        local_data_file = "data/raw/CountyUAs_cases_table-{}.csv".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret

        df = pd.read_csv(local_data_file)
        df["Date"] = date
        df["Country"] = "England"
        df = df.rename(columns={"GSS_CD": "AreaCode", "GSS_NM": "Area"})
        df = df[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
        daily_areas = df.to_dict("split")["data"]
        for row in daily_areas:
            row[4] = normalize_int(normalize_whitespace(row[4]))
        daily_areas = [["Date", "Country", "AreaCode", "Area", "TotalCases"]] + daily_areas

        #save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)
Exemplo n.º 5
0
def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Parse the Local Government District case table from a Northern
    Ireland daily PDF.

    Returns a list of rows (header row first) of
    [Date, Country, AreaCode, Area, TotalCases], or None when the country
    is not Northern Ireland or no matching table is found.
    """
    if country != "Northern Ireland":
        return None
    pdf = pdfplumber.open(local_pdf_file)
    for page in pdf.pages:
        try:
            table = page.extract_table()
            if table[0][0] != "Local Government District":
                continue
            rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
            for raw in table[1:]:
                if raw[0].lower() == "total":
                    continue  # skip the grand-total row
                area = normalize_whitespace(titlecase(raw[0]))
                code = lookup_local_government_district_code(area)
                rows.append([date, country, code, area, normalize_int(raw[1])])
            return rows
        except IndexError:
            pass  # no table on page
    return None
Exemplo n.º 6
0
def get_text_from_html(html):
    """Return the visible text of an HTML document, whitespace-normalized."""
    parsed = BeautifulSoup(html, features="html.parser")
    return normalize_whitespace(parsed.get_text(separator=" "))
Exemplo n.º 7
0
def get_text_from_pdf(local_pdf_file):
    """Return whitespace-normalized text of a PDF's first page only."""
    first_page = pdfplumber.open(local_pdf_file).pages[0]
    return normalize_whitespace(first_page.extract_text())
Exemplo n.º 8
0
def test_normalize_whitespace():
    # Runs of whitespace collapse to single spaces; leading/trailing stripped.
    assert normalize_whitespace("  a  b ") == "a b"
Exemplo n.º 9
0
def parse_tests(country, html):
    """Parse UK testing statistics from the daily testing HTML page.

    Locates the main daily/total testing table, then (when present) the two
    per-pillar breakdown tables and the pillar 2 in-person/delivery
    breakdown table. Returns a dict mapping indicator name -> value, or
    None when the expected tables cannot be identified unambiguously.
    """
    def is_testing_table(table):
        # The main testing table is recognized by a "Tests" column header.
        headers = [th.text for th in table.findAll("th")]
        return "Tests" in headers

    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    testing_tables = [table for table in tables if is_testing_table(table)]
    if len(testing_tables) == 0:
        print("Testing table not found")
        return None
    elif len(testing_tables) > 1:
        print("More than one testing table found")
        return None
    testing_table = testing_tables[0]
    table_rows = testing_table.findAll("tr")
    if len(table_rows) != 3:
        print("Expecting 3 table rows")
        return None
    # Row 1 holds the daily figures, row 2 the cumulative totals.
    daily_row = [td.text for td in table_rows[1].findAll("td")]
    total_row = [td.text for td in table_rows[2].findAll("td")]

    # The report date/time comes from the page prose, not from the table.
    text = get_text_from_html(html)
    pattern_dict = {
        "Date": (r"As of (?P<Time>\d+\s*(am|pm)?) (on )?(?P<Date>.+?),",
                 date_value_parser_fn)
    }
    result = parse_totals_general(pattern_dict, country, text)
    result["DailyTestsPerformed"] = normalize_int(daily_row[1])
    result["DailyPeopleTested"] = normalize_int_with_unavailable(daily_row[2])
    result["DailyPositive"] = normalize_int(daily_row[3])
    result["TotalTestsPerformed"] = normalize_int(total_row[1])
    result["TotalPeopleTested"] = normalize_int_with_unavailable(total_row[2])
    result["TotalPositive"] = normalize_int(total_row[3])

    def is_pillar_table(table):
        # Pillar breakdown tables carry a "Pillar 1" column header.
        headers = [th.text for th in table.findAll("th")]
        return "Pillar 1" in headers

    pillar_tables = [table for table in tables if is_pillar_table(table)]

    if len(pillar_tables) == 0:
        # no pillar tables
        return result
    elif len(pillar_tables) != 2:
        print("Expecting two pillar tables (daily and cumulative)")
        return None

    # First pillar table holds daily figures, the second cumulative totals.
    for table_num, pillar_table in enumerate(pillar_tables):
        daily_or_total = "Daily" if table_num == 0 else "Total"
        table_rows = pillar_table.findAll("tr")
        if len(table_rows) != 4:
            print("Expecting 4 table rows")
            return None
        # Walk the header row to find each pillar's column index, then read
        # the value at that index in every data row.
        for i, col in enumerate(table_rows[0].findAll(re.compile("th|td"))):
            if col.text.startswith("Pillar"):
                pillar = remove_whitespace(col.text)
                for row in table_rows[1:]:
                    test_stat = normalize_whitespace(row.findAll("td")[0].text)
                    if test_stat == "Tests":
                        test_stat = "TestsPerformed"
                    elif test_stat == "People tested":
                        test_stat = "PeopleTested"
                    # e.g. "DailyPillar1TestsPerformed"
                    indicator = "{}{}{}".format(daily_or_total, pillar,
                                                test_stat)
                    str_val = row.findAll("td")[i].text
                    # "-" marks a missing value; stored as empty string.
                    val = "" if str_val == "-" else normalize_int_with_unavailable(
                        str_val)
                    result[indicator] = val

    def is_pillar2_breakdown_table(table):
        # Recognized by a column header starting with "In-person".
        headers = [th.text for th in table.findAll("th")]
        return any([header.startswith("In-person") for header in headers])

    pillar2_breakdown_tables = [
        table for table in tables if is_pillar2_breakdown_table(table)
    ]
    if len(pillar2_breakdown_tables) == 0:
        # no pillar 2 breakdown table
        return result
    elif len(pillar2_breakdown_tables) > 1:
        print("More than one pillar 2 breakdown table found")
        return None
    pillar2_breakdown_table = pillar2_breakdown_tables[0]
    table_rows = pillar2_breakdown_table.findAll("tr")
    if len(table_rows) not in (3, 4):
        print("Expecting 3 (or 4) table rows in pillar 2 breakdown table")
        return None
    # Row 1 = daily, row 2 = cumulative (a possible 4th row is ignored).
    daily_row = [td.text for td in table_rows[1].findAll("td")]
    total_row = [td.text for td in table_rows[2].findAll("td")]
    result["DailyPillar2InPersonRoutes"] = normalize_int(daily_row[1])
    result["DailyPillar2DeliveryRoutes"] = normalize_int(daily_row[2])
    result["TotalPillar2InPersonRoutes"] = normalize_int(total_row[1])
    result["TotalPillar2DeliveryRoutes"] = normalize_int(total_row[2])

    return result
Exemplo n.º 10
0
from util import normalize_whitespace, lookup_health_board_code

html_file = sys.argv[1]
csv_file = sys.argv[2]

# Raw string avoids invalid-escape-sequence warnings for \d in the pattern.
# Filename shape: <prefix>-<country>-<YYYY-MM-DD>.html
m = re.match(r".+-(.+)-(\d{4}-\d{2}-\d{2})\.html", html_file)
country = m.group(1).title()
date = m.group(2)

# Read the page; the last table on it holds the per-area counts.
with open(html_file) as f:
    html = f.read()
soup = BeautifulSoup(html, features="html.parser")
table = soup.find_all("table")[-1]

output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
for table_row in table.findAll("tr"):
    columns = [
        normalize_whitespace(col.text) for col in table_row.findAll("td")
    ]
    if len(columns) == 0:
        continue
    area = columns[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
    area_code = lookup_health_board_code(area)
    cases = columns[1]
    output_row = [date, country, area_code, area, cases]
    output_rows.append(output_row)

# newline="" is required by the csv module to avoid blank rows on Windows.
with open(csv_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_rows)
Exemplo n.º 11
0
html = open(html_file).read()
soup = BeautifulSoup(html, features="html.parser")

text = soup.get_text()
text = text.replace(u"\xa0",
                    u" ")  # replace non-breaking spaces with regular spaces

patterns = [uk_pattern, wales_pattern, scotland_pattern, ni_pattern]

for pattern in patterns:
    m = re.search(pattern, text)
    if m is not None:
        groups = m.groupdict()
        date = dateparser.parse(groups["date"]).strftime("%Y-%m-%d")
        country = normalize_whitespace(groups.get("country")).replace(
            "Scottish", "Scotland")
        tests = normalize_int(groups.get("tests", float("nan")))
        positive_tests = normalize_int(groups["positive_tests"])
        negative_tests = normalize_int(
            groups.get("negative_tests", float("nan")))
        deaths = normalize_int(groups.get("deaths", float("nan")))
        if not math.isnan(tests):
            print("{},{},{},{}".format(date, country, "Tests", tests))
            # with open(
            #     "data/daily/indicators/covid-19-{}-{}-tests.csv".format(
            #         date, format_country(country)
            #     ),
            #     "w",
            # ) as f:
            #     f.write("{},{},{},{}\n".format(date, country, "Tests", tests))
        if not math.isnan(positive_tests):