Example #1
def parse_daily_areas_pdf(date, country, local_pdf_file):
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table()
                if table[0][0] == "Local Government District":
                    output_rows = [[
                        "Date", "Country", "AreaCode", "Area", "TotalCases"
                    ]]
                    for table_row in table[1:]:
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        area = area.replace("Ards and North Down",
                                            "North Down and Ards")
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    return output_rows
            except (IndexError, TypeError):
                pass  # no table on page (extract_table can return None)
    elif country == "Wales":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table(
                    table_settings={
                        # use text alignment since the table doesn't have lines
                        "vertical_strategy": "text",
                        "horizontal_strategy": "text"
                    })
                found_start = False
                output_rows = [[
                    "Date", "Country", "AreaCode", "Area", "TotalCases"
                ]]
                for table_row in table:
                    if table_row[0] is not None and table_row[0].startswith(
                            "Aneurin"):
                        found_start = True
                    if found_start:
                        area = (
                            normalize_whitespace(table_row[2]).replace(
                                "Anglesey", "Isle of Anglesey").replace(
                                    "ﬀ", "ff")  # fix ligatures
                            .replace("ﬁ", "fi"))
                        if area.startswith("Wales total"):
                            continue
                        area_code = lookup_local_authority_code(area)
                        cases = normalize_int(table_row[4])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    if table_row[2] is not None and normalize_whitespace(
                            table_row[2]) == 'Resident outside Wales':
                        break
                return convert_wales_la_to_hb(date, country, output_rows)
            except (IndexError, TypeError):
                pass  # no table on page (extract_table can return None)
    return None
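Both branches follow the same pdfplumber idiom: open the file, walk pdf.pages, call extract_table(), and skip pages that lack the expected table. A minimal self-contained sketch of that idiom (the function name and header check are illustrative, not from the repository):

import pdfplumber

def first_matching_table(pdf_path, header_cell):
    # Walk every page and return the first extracted table whose
    # top-left cell equals `header_cell`; None if no page matches.
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if table and table[0] and table[0][0] == header_cell:
                return table
    return None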
Example #2
def crawl_arcgis(date, country, check_only):
    if country == "UK":
        item_id = "bc8ee90225644ef7a6f4dd1b13ea1d67"
        local_data_file = "data/raw/DailyIndicators-{}.xslx".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret

        df = pd.read_excel(local_data_file)
        print(df)

        d = df.to_dict("records")[0]
        date = d["DateVal"].strftime("%Y-%m-%d")

        with sqlite3.connect('data/covid-19-uk.db') as conn:
            c = conn.cursor()
            c.execute(
                f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'UK', 'ConfirmedCases', {d['TotalUKCases']})"
            )
            c.execute(
                f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'UK', 'Deaths', {d['TotalUKDeaths']})"
            )
            c.execute(
                f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'England', 'ConfirmedCases', {d['EnglandCases']})"
            )
            c.execute(
                f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'England', 'Deaths', {d['EnglandDeaths']})"
            )
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Scotland', 'ConfirmedCases', {d['ScotlandCases']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Scotland', 'Deaths', {d['ScotlandDeaths']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Wales', 'ConfirmedCases', {d['WalesCases']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Wales', 'Deaths', {d['WalesDeaths']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Northern Ireland', 'ConfirmedCases', {d['NICases']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Northern Ireland', 'Deaths', {d['NIDeaths']})")

    elif country == "England":
        item_id = "b684319181f94875a6879bbc833ca3a6"
        local_data_file = "data/raw/CountyUAs_cases_table-{}.csv".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret

        df = pd.read_csv(local_data_file)
        df["Date"] = date
        df["Country"] = "England"
        df = df.rename(columns={"GSS_CD": "AreaCode", "GSS_NM": "Area"})
        df = df[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
        daily_areas = df.to_dict("split")["data"]
        for row in daily_areas:
            row[4] = normalize_int(normalize_whitespace(row[4]))
        daily_areas = [["Date", "Country", "AreaCode", "Area", "TotalCases"]
                       ] + daily_areas

        #save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)
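The INSERT OR REPLACE statements interpolate values with f-strings, which is workable here because everything comes from a spreadsheet the crawler itself just downloaded; with any untrusted input, sqlite3 parameter binding is the safer equivalent. A sketch with hypothetical values:

import sqlite3

with sqlite3.connect("data/covid-19-uk.db") as conn:
    conn.execute(
        "INSERT OR REPLACE INTO indicators VALUES (?, ?, ?, ?)",
        ("2020-04-01", "UK", "ConfirmedCases", 29474),  # hypothetical row
    )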
Example #3
def parse_tests(country, html):

    def is_testing_table(table):
        headers = [th.text for th in table.findAll("th")]
        return "Tests" in headers

    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    testing_tables = [table for table in tables if is_testing_table(table)]
    if len(testing_tables) == 0:
        print("Testing table not found")
        return None
    elif len(testing_tables) > 1:
        print("More than one testing table found")
        return None
    testing_table = testing_tables[0]
    table_rows = testing_table.findAll("tr")
    if len(table_rows) != 3:
        print("Expecting 3 table rows")
        return None
    daily_row = [td.text for td in table_rows[1].findAll("td")]
    total_row = [td.text for td in table_rows[2].findAll("td")]

    text = get_text_from_html(html)
    pattern_dict = {
        "Date": (r"As of (?P<Time>\d+\s*(am|pm)?) (on )?(?P<Date>.+?),", date_value_parser_fn)
    }
    result = parse_totals_general(pattern_dict, country, text)
    result["DailyTestsPerformed"] = normalize_int(daily_row[1])
    result["DailyPeopleTested"] = normalize_int(daily_row[2])
    result["DailyPositive"] = normalize_int(daily_row[3])
    result["TotalTestsPerformed"] = normalize_int(total_row[1])
    result["TotalPeopleTested"] = normalize_int(total_row[2])
    result["TotalPositive"] = normalize_int(total_row[3])
    return result
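The table-selection idiom used here (find every table, keep those whose th texts satisfy a predicate, insist on exactly one match) reappears in the longer variant in Example #9; a compact sketch assuming only BeautifulSoup:

from bs4 import BeautifulSoup

def select_single_table(html, predicate):
    # Return the unique <table> whose <th> texts satisfy `predicate`,
    # or None when zero or several tables qualify.
    soup = BeautifulSoup(html, features="html.parser")
    matches = [
        t for t in soup.find_all("table")
        if predicate([th.text for th in t.find_all("th")])
    ]
    return matches[0] if len(matches) == 1 else None

# e.g. select_single_table(html, lambda headers: "Tests" in headers)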
Example #4
def parse_daily_areas_json(date, country, json_data):
    if country == "England":
        output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
        for area_code, o in json_data["utlas"].items():
            area = o["name"]["value"]
            cases = normalize_int(o["totalCases"]["value"])
            if area_code != lookup_local_authority_code(area):
                print("Area code mismatch for {}, JSON file gave {}, but lookup was {}".format(area, area_code, lookup_local_authority_code(area)))
                return None
            output_row = [date, country, area_code, area, cases]
            output_rows.append(output_row)
        return output_rows

    return None
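The function assumes the PHE JSON nests each upper-tier local authority under "utlas", keyed by GSS area code, with each value wrapped one level down. A minimal fragment of that assumed shape (the code and count here are illustrative):

json_data = {
    "utlas": {
        "E09000002": {  # hypothetical entry
            "name": {"value": "Barking and Dagenham"},
            "totalCases": {"value": "45"},
        }
    }
}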
Example #5
def parse_totals_pdf(date, country, local_pdf_file):
    if country == "Northern Ireland":
        text = get_text_from_pdf(local_pdf_file)
        pattern_dict = {
            "Date": (r"Date generated: (?P<Date>[\d,]+/[\d,]+/[\d,]+)",
                     date_value_parser_fn),
            "Tests":
            (r"Number of Individuals tested( for COVID-19| for SARS-COV2 Virus)?:? (?P<Tests>[\d,]+)",
             int_value_parser_fn),
            "ConfirmedCases":
            (r"(Number of Individuals (with confirmed|testing positive for) (COVID-19|SARS-COV2 Virus)|Cumulative number of laboratory confirmed COVID-19 cases):? (?P<ConfirmedCases>[\d,]+)",
             int_value_parser_fn),
            "Deaths":
            (r"(Total|Cumulative) number of (Trust |reported )?deaths( associated with COVID-19)?: (?P<Deaths>[\d,]+)",
             int_value_parser_fn),
        }
        result = parse_totals_general(pattern_dict, country, text)
        return result
    elif country == "Wales":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table(
                    table_settings={
                        # use text alignment since the table doesn't have lines
                        "horizontal_strategy": "text"
                    })
                result = {"Date": date, "Country": country}
                for table_row in table:
                    if table_row[0] == "":
                        continue
                    label = table_row[0].replace("\t", " ")
                    value = normalize_int(table_row[1])
                    if label == "Cumulative individuals tested":
                        result["Tests"] = value
                    elif label == "Cumulative confirmed COVID-19 case total":
                        result["ConfirmedCases"] = value
                    elif label == "Cumulative number of suspected COVID-19 deaths* reported to PHW":
                        # Get deaths from XLSX after this date
                        if date < "2020-04-29":
                            result["Deaths"] = value
                return result
            except (IndexError, TypeError):
                pass  # no table on page (extract_table can return None)
    return None
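The Northern Ireland patterns absorb changing bulletin wording by stacking alternations and optional groups. A quick self-check of the Deaths pattern against a made-up bulletin line (the sample text is hypothetical):

import re

deaths_re = (r"(Total|Cumulative) number of (Trust |reported )?deaths"
             r"( associated with COVID-19)?: (?P<Deaths>[\d,]+)")
sample = "Cumulative number of reported deaths associated with COVID-19: 1,234"
m = re.search(deaths_re, sample)
assert m is not None and m.group("Deaths") == "1,234"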
Example #6
def parse_daily_areas_pdf(date, country, local_pdf_file):
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table()
                if table[0][0] == "Local Government District":
                    output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
                    for table_row in table[1:]:
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    return output_rows
            except (IndexError, TypeError):
                pass  # no table on page (extract_table can return None)
    return None
Example #7
def int_value_parser_fn(value):
    return normalize_int(value)
Example #8
def test_normalize_int():
    assert normalize_int("1") == 1
    assert normalize_int("1,001") == 1001
    assert normalize_int("seven") == 7
Example #9
def parse_tests(country, html):
    def is_testing_table(table):
        headers = [th.text for th in table.findAll("th")]
        return "Tests" in headers

    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    testing_tables = [table for table in tables if is_testing_table(table)]
    if len(testing_tables) == 0:
        print("Testing table not found")
        return None
    elif len(testing_tables) > 1:
        print("More than one testing table found")
        return None
    testing_table = testing_tables[0]
    table_rows = testing_table.findAll("tr")
    if len(table_rows) != 3:
        print("Expecting 3 table rows")
        return None
    daily_row = [td.text for td in table_rows[1].findAll("td")]
    total_row = [td.text for td in table_rows[2].findAll("td")]

    text = get_text_from_html(html)
    pattern_dict = {
        "Date": (r"As of (?P<Time>\d+\s*(am|pm)?) (on )?(?P<Date>.+?),",
                 date_value_parser_fn)
    }
    result = parse_totals_general(pattern_dict, country, text)
    result["DailyTestsPerformed"] = normalize_int(daily_row[1])
    result["DailyPeopleTested"] = normalize_int_with_unavailable(daily_row[2])
    result["DailyPositive"] = normalize_int(daily_row[3])
    result["TotalTestsPerformed"] = normalize_int(total_row[1])
    result["TotalPeopleTested"] = normalize_int_with_unavailable(total_row[2])
    result["TotalPositive"] = normalize_int(total_row[3])

    def is_pillar_table(table):
        headers = [th.text for th in table.findAll("th")]
        return "Pillar 1" in headers

    pillar_tables = [table for table in tables if is_pillar_table(table)]

    if len(pillar_tables) == 0:
        # no pillar tables
        return result
    elif len(pillar_tables) != 2:
        print("Expecting two pillar tables (daily and cumulative)")
        return None

    for table_num, pillar_table in enumerate(pillar_tables):
        daily_or_total = "Daily" if table_num == 0 else "Total"
        table_rows = pillar_table.findAll("tr")
        if len(table_rows) != 4:
            print("Expecting 4 table rows")
            return None
        for i, col in enumerate(table_rows[0].findAll(re.compile("th|td"))):
            if col.text.startswith("Pillar"):
                pillar = remove_whitespace(col.text)
                for row in table_rows[1:]:
                    test_stat = normalize_whitespace(row.findAll("td")[0].text)
                    if test_stat == "Tests":
                        test_stat = "TestsPerformed"
                    elif test_stat == "People tested":
                        test_stat = "PeopleTested"
                    indicator = "{}{}{}".format(daily_or_total, pillar,
                                                test_stat)
                    str_val = row.findAll("td")[i].text
                    val = "" if str_val == "-" else normalize_int_with_unavailable(
                        str_val)
                    result[indicator] = val

    def is_pillar2_breakdown_table(table):
        headers = [th.text for th in table.findAll("th")]
        return any([header.startswith("In-person") for header in headers])

    pillar2_breakdown_tables = [
        table for table in tables if is_pillar2_breakdown_table(table)
    ]
    if len(pillar2_breakdown_tables) == 0:
        # no pillar 2 breakdown table
        return result
    elif len(pillar2_breakdown_tables) > 1:
        print("More than one pillar 2 breakdown table found")
        return None
    pillar2_breakdown_table = pillar2_breakdown_tables[0]
    table_rows = pillar2_breakdown_table.findAll("tr")
    if len(table_rows) not in (3, 4):
        print("Expecting 3 (or 4) table rows in pillar 2 breakdown table")
        return None
    daily_row = [td.text for td in table_rows[1].findAll("td")]
    total_row = [td.text for td in table_rows[2].findAll("td")]
    result["DailyPillar2InPersonRoutes"] = normalize_int(daily_row[1])
    result["DailyPillar2DeliveryRoutes"] = normalize_int(daily_row[2])
    result["TotalPillar2InPersonRoutes"] = normalize_int(total_row[1])
    result["TotalPillar2DeliveryRoutes"] = normalize_int(total_row[2])

    return result
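The pillar loop composes each result key from three fragments: the table position (Daily or Total), the compacted column header, and the normalized row label; for instance the "Tests" row under the "Pillar 1" column of the first table is stored as:

"{}{}{}".format("Daily", "Pillar1", "TestsPerformed")  # -> "DailyPillar1TestsPerformed"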
Example #10
def normalize_int_with_unavailable(num):
    if num.lower() == "unavailable":
        return ""
    return normalize_int(num)
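In Example #9 this keeps the "people tested" fields blank instead of raising when PHE reports a figure as unavailable; for instance (assuming normalize_int strips comma separators):

assert normalize_int_with_unavailable("1,204") == 1204
assert normalize_int_with_unavailable("Unavailable") == ""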
Example #11
soup = BeautifulSoup(html, features="html.parser")

text = soup.get_text()
text = text.replace(u"\xa0",
                    u" ")  # replace non-breaking spaces with regular spaces

patterns = [uk_pattern, wales_pattern, scotland_pattern, ni_pattern]

for pattern in patterns:
    m = re.search(pattern, text)
    if m is not None:
        groups = m.groupdict()
        date = dateparser.parse(groups["date"]).strftime("%Y-%m-%d")
        country = normalize_whitespace(groups.get("country")).replace(
            "Scottish", "Scotland")
        tests = normalize_int(groups.get("tests", float("nan")))
        positive_tests = normalize_int(groups["positive_tests"])
        negative_tests = normalize_int(
            groups.get("negative_tests", float("nan")))
        deaths = normalize_int(groups.get("deaths", float("nan")))
        if not math.isnan(tests):
            print("{},{},{},{}".format(date, country, "Tests", tests))
            # with open(
            #     "data/daily/indicators/covid-19-{}-{}-tests.csv".format(
            #         date, format_country(country)
            #     ),
            #     "w",
            # ) as f:
            #     f.write("{},{},{},{}\n".format(date, country, "Tests", tests))
        if not math.isnan(positive_tests):
            print("{},{},{},{}".format(date, country, "ConfirmedCases",