Example #1
        def parse_age_group(age_group: str):
            new_age_group = ""
            age_bins = age_group.strip().replace("+", "-").split("-", 1)
            age_lo = safe_int_cast(age_bins[0])
            new_age_group += f"{age_lo:02d}-"

            if len(age_bins) > 1 and age_bins[1]:
                age_hi = safe_int_cast(age_bins[1])
                new_age_group += f"{age_hi:02d}"

            return new_age_group
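
A quick trace of how this helper normalizes a few representative inputs (assuming safe_int_cast parses plain integer strings):

    assert parse_age_group("0-17") == "00-17"   # lower and upper bounds are zero-padded
    assert parse_age_group(" 5-9 ") == "05-09"  # surrounding whitespace is stripped
    assert parse_age_group("80+") == "80-"      # "+" becomes an open-ended upper bound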
Example #2
def _get_province_records(key: str) -> List[Dict[str, Any]]:
    records = []
    url = _api_url_tpl.format(key.replace("_", "-"))
    for record in requests.get(url, timeout=60).json():
        records.append({
            "key": key,
            "date": datetime_isoformat(record["date"], "%d.%m.%Y"),
            "total_confirmed": safe_int_cast(record["sick"]),
            "total_deceased": safe_int_cast(record["died"]),
            "total_recovered": safe_int_cast(record["healed"]),
        })
    return records
Example #3
def update_table(table_name: str = None,
                 job_group: str = None,
                 parallel_jobs: int = 8) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)
    process_count = _get_request_param("parallel_jobs", parallel_jobs)
    # Default to 1 if invalid process count is given
    process_count = safe_int_cast(process_count) or 1

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") ==
                job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [
            src.config.get("name") for src in data_pipeline.data_sources
        ]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # When running the data pipeline, use as many parallel processes as allowed and avoid
        # downloading files multiple times.
        run_options = dict(process_count=process_count, skip_existing=True)

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, **run_options)
        data_pipeline._save_intermediate_results(workdir / "intermediate",
                                                 intermediate_results)
        intermediate_files = list(
            map(str, (workdir / "intermediate").glob("*.csv")))
        logger.log_info(f"Created intermediate tables: {intermediate_files}")

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate",
                      workdir / "intermediate")

    return Response("OK", status=200)
Example #4
    def parse(self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = [record for _, record in fr_codes.iterrows()]

        column_adapter = {
            "key": "key",
            "date": "date",
            "testsRealisesDetails": "_breakdown_tested",
            "testsPositifsDetails": "_breakdown_confirmed",
        }

        # Get country level data
        country = _get_country(url_tpl, column_adapter)

        # Get region level data
        get_region_func = partial(_get_region, url_tpl, column_adapter, fr_iso_map)
        regions = concat(list(thread_map(get_region_func, regions_iter)))

        # Get department level data
        get_department_func = partial(_get_department, url_tpl, column_adapter)
        departments = concat(list(thread_map(get_department_func, deps_iter)))

        data = concat([country, regions, departments])
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

        data["_breakdown_tested"].fillna("", inplace=True)
        data["_breakdown_confirmed"].fillna("", inplace=True)

        records: Dict[str, List] = {"confirmed": [], "tested": []}
        for key, row in data.set_index("key").iterrows():
            for statistic in records.keys():
                if row[f"_breakdown_{statistic}"] != "":
                    for item in row[f"_breakdown_{statistic}"]:
                        records[statistic].append(
                            {
                                "key": key,
                                "date": row["date"],
                                "age": item["age"],
                                "sex": item.get("sexe"),
                                f"new_{statistic}": item["value"],
                            }
                        )

        df1 = DataFrame.from_records(records["tested"])
        df2 = DataFrame.from_records(records["confirmed"])
        data = df1.merge(df2, how="outer")

        data = data[~data["age"].isin(["0", "A", "B", "C", "D", "E"])]
        data["age"] = data["age"].apply(lambda x: age_group(safe_int_cast(x)))

        sex_adapter = lambda x: {"h": "male", "f": "female"}.get(x, "sex_unknown")
        data["sex"] = data["sex"].apply(sex_adapter)
        return data
Example #5
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        # Rename the appropriate columns
        data = (
            dataframes[0]
            .rename(
                columns={
                    "Location": "match_string",
                    "Confirmed": "total_confirmed",
                    "Deaths": "total_deceased",
                    "Recoveries": "total_recovered",
                    "Date": "date",
                }
            )
            .drop(columns=["Active"])
        )

        # Convert date to ISO format
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

        # The first row is metadata info about column names - discard it
        data = data[data.match_string != "#loc+name"]

        # Convert string numbers to int
        # Parse integers
        for column in ("total_confirmed", "total_deceased", "total_recovered"):
            data[column] = data[column].apply(lambda x: safe_int_cast(str(x).replace(",", "")))

        # Make sure all records have the country code
        data["country_code"] = "LY"

        # Output the results
        return data
Example #6
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename the appropriate columns
        data = (dataframes[0].rename(
            columns={
                "Date": "date",
                "Province": "match_string",
                "Cases": "total_confirmed",
                "Deaths": "total_deceased",
                "Active Cases": "current_confirmed",
                "Recoveries": "total_recovered",
            }).drop([0]).drop(axis=1, columns="current_confirmed"))

        # Parse integers
        for column in ("total_confirmed", "total_deceased", "total_recovered"):
            data[column] = data[column].apply(
                lambda x: safe_int_cast(str(x).replace(",", "")))

        # Make sure all records have the country code
        data["country_code"] = "AF"

        # Remove redundant info from names
        data.match_string = data.match_string.apply(
            lambda x: x.replace(" Province", ""))

        # Output the results
        return data
Example #7
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)
        covid_mask = cases["_classification"] == 5
        valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
        cases = cases[covid_mask & valid_mask]

        # Record the date of death
        cases["date_new_deceased"] = None
        deceased_mask = cases["_prognosis"] == 2
        cases.loc[deceased_mask,
                  "date_new_deceased"] = cases.loc[deceased_mask,
                                                   "_date_prognosis"]

        # Convert ages to int, and translate sex (no "other" sex/gender reported)
        cases["age"] = cases["age"].apply(safe_int_cast)
        cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

        # Convert all dates to ISO format
        for col in filter(lambda x: x.startswith("date"), cases.columns):
            cases[col] = cases[col].apply(
                lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Parse subregion codes
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 5))

        # Convert to time series format
        data = convert_cases_to_time_series(cases,
                                            index_columns=["subregion2_code"])
        data["country_code"] = "BR"

        # Get rid of bogus records
        data = data.dropna(subset=["date"])
        data = data[data["date"] >= "2020-01-01"]
        data = data[data["date"] < date_today(offset=1)]

        # Aggregate by country level
        country = (data.drop(columns=["subregion2_code"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())
        country["key"] = "BR"

        # Aggregate by state level
        data["subregion1_code"] = data["subregion2_code"].apply(
            lambda x: _IBGE_STATES.get(safe_int_cast(x[:2])))
        state = (data.drop(columns=["subregion2_code"]).dropna(
            subset=["subregion1_code"]).groupby(
                ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
        state["key"] = "BR_" + state["subregion1_code"]

        # Derive the key from subregion codes
        data = data[data["subregion2_code"].notna()]
        data["key"] = "BR_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        return concat([country, state, data])
Example #8
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename the appropriate columns
        data = dataframes[0].rename(
            columns={
                # Sometimes the column names are in the local language
                "Dátum": "date",
                "Mintavételek száma (összesen)": "total_tested",
                "mintavételek száma": "new_tested",
                "pozitív esetek száma (összesen)": "total_confirmed",
                "napi pozitív esetszám": "new_confirmed",
                "hospitalizált": "current_hospitalized",
                "intenzív ellátásra szoruló": "current_intensive_care",
                "a kórházból elbocsátottak napi száma": "new_recovered",
                "elhunytak száma összesen": "total_deceased",
                "elhunytak": "new_deceased",
                # Sometimes the column names are in English
                "Date": "date",
                "Tested (all)": "total_tested",
                "Tested (daily)": "new_tested",
                "Positive (all)": "total_confirmed",
                "Positive (daily)": "new_confirmed",
                "All hospitalized on certain day": "current_hospitalized",
                "All persons in intensive care on certain day":
                "current_intensive_care",
                "Discharged": "new_recovered",
                "Deaths (all)": "total_deceased",
                "Deaths (daily)": "new_deceased",
            })

        # It's only country-level data so we can compute the key directly
        data["key"] = "SI"

        # Make sure that the date column is a string
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(str(x)[:10], "%Y-%m-%d"))

        # Drop bogus values
        data.dropna(subset=["date"], inplace=True)

        # Remove non-numeric markers from data fields
        value_columns = [
            col for col in data.columns if any(
                col.startswith(token) for token in ("new", "total", "current"))
        ]
        for col in value_columns:
            data[col] = data[col].apply(
                lambda x: safe_int_cast(str(x).replace("*", "")))

        # Output the results
        return data
Example #9
def _default_age_adapter(value: Any) -> str:
    if isna(value):
        return "age_unknown"

    try:
        value_int = safe_int_cast(value)
        if value_int is not None:
            return age_group(value_int)
        if re.match(r"\d+\-\d*", value):
            return value
    except ValueError:
        pass

    return "age_unknown"
Example #10
def _default_age_adapter(value: Any) -> str:
    if isna(value):
        return "age_unknown"

    try:
        value = str(value)
        if re.match(r"\d+(\.\d*)?", value):
            return age_group(safe_int_cast(value))
        if re.match(r"\d\d?\-\d*", value):
            return value
    except ValueError:
        pass

    return "age_unknown"
Example #11
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Read data from GitHub repo
        data = {}
        for df, name in zip(dataframes,
                            ("confirmed", "deceased", "recovered")):
            df.rename(columns={"ISO 3166-2 Code": "key"}, inplace=True)
            df.key = df.key.str.replace("-", "_")
            data[name] = df.set_index("key")

        # Transform the data from non-tabulated format to record format
        records = []
        for key in data["confirmed"].index.unique():
            date_columns = [
                col for col in data["confirmed"].columns
                if re.match(r"\d\d\d\d-\d\d-\d\d", col)
            ]
            for col in date_columns:
                records.append({
                    **{
                        "date": col,
                        "key": key
                    },
                    **{
                        f"total_{var}": safe_int_cast(df.loc[key, col])
                        for var, df in data.items()
                    },
                })

        # Zeroes can be considered NaN in this data
        data = DataFrame.from_records(records).replace(0, None)

        # Since it's cumsum data, we can forward fill relatively safely
        data = data.sort_values(["key", "date"
                                 ]).groupby("key").apply(lambda x: x.ffill())

        # We already have CO data directly from the authoritative source, ideally we would match
        # the keys from here to the source which uses DIVIPOLA codes. That's left as a TODO for now.
        data = data[~data.key.str.startswith("CO_")]

        # Correct the French colonies since we are using something different to the ISO 3166-2 code
        data.loc[data.key == "FR_GP", "key"] = "FR_GUA"

        # VE_W are "external territories" of Venezuela, which can't be mapped to any particular
        # geographical region and therefore we exclude them
        data = data[data.key != "VE_W"]

        return data
Example #12
def _process_html_file(file_map: Dict[str, str], date: str) -> List[Dict[str, Any]]:
    records = []

    fname = file_map[date]
    with open(fname, "r") as fd:
        html_content = fd.read()

    # Make sure the tables are exactly what we expect
    age_groups = ["0-17", "18-34", "35-49", "50-64", "65-79", "80-"]
    tables = list(_extract_tables(html_content))[-6:]
    if len(tables) == len(age_groups):
        for age_group, table in zip(age_groups, tables):
            total = table.iloc[-1]
            records.append(
                {
                    "key": "US_CA",
                    "date": date,
                    "age": age_group,
                    "total_confirmed": safe_int_cast(total["No. Cases"]),
                    "total_deceased": safe_int_cast(total["No. Deaths"]),
                }
            )

    return records
Example #13
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Partition dataframes based on the state the data is for
        partitions = {code: [] for code in _IBGE_STATES.values()}
        for df in dataframes.values():
            df = table_rename(df, _open_data_portal_column_adapter, drop=True)
            apply_func = lambda x: _IBGE_STATES.get(safe_int_cast(x))
            df["subregion1_code"] = df["_state_code"].apply(apply_func)
            for code, group in df.groupby("subregion1_code"):
                partitions[code].append(group)

        # Process each partition in separate threads
        map_opts = dict(desc="Processing Partitions", total=len(partitions))
        map_iter = (concat(chunks) for chunks in partitions.values())
        return concat(process_map(_process_partition, map_iter, **map_opts))
Example #14
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        data = read_file(sources[0],
                         error_bad_lines=False,
                         encoding="ISO-8859-1")
        data = table_rename(
            data,
            {
                "Date":
                "date",
                "Nombre de personnes en soins normaux":
                "current_hospitalized",
                "Nombre de personnes en soins intensifs (sans patients du Grand Est)":
                "current_intensive_care",
                "Nombre de décès - cumulé (sans patients du Grand Est)":
                "total_deceased",
                "Total patients COVID ayant quitté l'hôpital (hospitalisations stationnaires, données brutes)":
                "new_recovered",
                "Nombre de nouvelles personnes testées COVID+ par jour ":
                "new_tested",
            },
        )

        # Get date in ISO format
        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Keep only columns we can process
        data = data[[
            "date",
            "current_hospitalized",
            "current_intensive_care",
            "total_deceased",
            "new_recovered",
            "new_tested",
        ]]

        # Convert recovered into a number
        data.new_recovered = data.new_recovered.apply(
            lambda x: safe_int_cast(x.replace("-", "0")))

        # Only country-level data is provided
        data["key"] = "LU"

        # Output the results
        return data
Example #15
def _default_age_adapter(value: Any) -> str:
    if isna(value):
        return "age_unknown"

    # If the value is already in the form of an age group, return as-is
    if isinstance(value, str) and re.match(r"^\d+\-\d*$", value):
        return value

    # Otherwise assume it's a number and return the corresponding age group
    try:
        value_int = safe_int_cast(value)
        if value_int is not None:
            return age_group(value_int)
    except ValueError:
        pass

    return "age_unknown"
Example #16
    def test_cast_int(self):

        test_data = {
            "1": 1,
            "1.0": 1,
            "-1": -1,
            "-1.0": -1,
            "+1": 1,
            "1e3": 1000,
            "−1": -1,  # Special character dash.
            **TEST_DATA_NULL,
        }

        for value, expected in test_data.items():
            result = safe_int_cast(value)
            self.assertEqual(
                result, expected,
                f"[{value}] Expected: {expected}. Found: {result}")
Example #17
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        # Parse the data as a list of records
        records = []
        for _, row in dataframes[0].iterrows():
            row = row.to_dict()

            # Parse the start and end dates
            date_start = row["Effective Date"][:10]
            date_end = row["Valid Through Date"][:10]

            # Convert column name and delete unnecessary columns
            row["subregion1_name"] = row["Jurisdictions"]
            del row["Jurisdictions"]
            del row["Effective Date"]
            del row["Valid Through Date"]

            # Insert a record for each date in the range
            for date in date_range(date_start, date_end):
                record = {}
                record["date"] = date
                non_numeric_columns = ("date", "subregion1_name")
                for col, val in row.items():
                    if col in non_numeric_columns:
                        record[col] = val
                    else:
                        record[_convert_column_name(col)] = safe_int_cast(val)
                records.append(record)

        # Convert to DataFrame and add metadata for matching
        data = DataFrame.from_records(records)
        data["country_code"] = "US"
        data["subregion2_code"] = None
        data["locality_code"] = None

        return data
Example #18
def _parse_value(val: Any) -> int:
    if isinstance(val, dict):
        return _parse_value(val["value"])
    else:
        return safe_int_cast(val)
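
A brief usage note: the helper unwraps arbitrarily nested {"value": ...} dictionaries before casting, for example (assuming safe_int_cast parses plain integer inputs):

    assert _parse_value(7) == 7
    assert _parse_value({"value": "42"}) == 42
    assert _parse_value({"value": {"value": "3"}}) == 3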
Example #19
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        # Rename the appropriate columns
        data = table_rename(
            dataframes[0],
            {
                "date": "date",
                "age": "age",
                "cum.deaths": "total_deceased",
                "daily.deaths": "new_deceased",
                "code": "subregion1_code",
            },
            drop=True,
        )

        # Convert date to ISO format
        data["date"] = data["date"].apply(lambda x: str(x)[:10])

        # Correct an error in the age bins from data source
        data.loc[data["age"] == "-1-9", "age"] = "1-9"

        # Parse age to match our group names
        def parse_age_group(age_group: str):
            new_age_group = ""
            age_bins = age_group.strip().replace("+", "-").split("-", 1)
            age_lo = safe_int_cast(age_bins[0])
            new_age_group += f"{age_lo:02d}-"

            if len(age_bins) > 1 and age_bins[1]:
                age_hi = safe_int_cast(age_bins[1])
                new_age_group += f"{age_hi:02d}"

            return new_age_group

        data["age"] = data["age"].apply(parse_age_group)

        # Derive key from the subregion code
        data["key"] = "US_" + data["subregion1_code"]

        # Some of the places are not US states
        data.loc[data["subregion1_code"] == "NYC", "key"] = "US_NY_NYC"

        # Compute our own age groups since they are not uniform across states
        for idx in range(10):
            data[f"age_bin_{idx:02d}"] = None
            data[f"new_deceased_age_{idx:02d}"] = None
            data[f"total_deceased_age_{idx:02d}"] = None
        for key in data["key"].unique():
            mask = data["key"] == key
            age_bins = data.loc[mask, "age"].unique()
            sorted_age_bins = sorted(age_bins, key=lambda x: safe_int_cast(x.split("-")[0]))
            for idx, age_bin_val in enumerate(sorted_age_bins):
                data.loc[mask, f"age_bin_{idx:02d}"] = age_bin_val
                age_bin_mask = mask & (data["age"] == age_bin_val)
                data.loc[age_bin_mask, f"new_deceased_age_{idx:02d}"] = data.loc[
                    age_bin_mask, "new_deceased"
                ]
                data.loc[age_bin_mask, f"total_deceased_age_{idx:02d}"] = data.loc[
                    age_bin_mask, "total_deceased"
                ]

        # Output the results
        return data.drop(columns=["age", "subregion1_code"])
Example #20
def fix_temp(value: int):
    value = safe_int_cast(value)
    return None if value is None else "%.1f" % (value / 10.0)
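
For instance, a raw reading in tenths of a degree is rendered as a one-decimal string, and unparseable input propagates as None (assuming safe_int_cast returns None on failure):

    assert fix_temp("215") == "21.5"
    assert fix_temp("not a number") is None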
Example #21
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        cases = table_rename(
            dataframes[0],
            {
                "sexoPaciente": "sex",
                "idadePaciente": "age",
                "codigoMunicipioPaciente": "subregion2_code",
                "dataResultadoExame": "date_new_tested",
                "dataObito": "date_new_deceased",
                "dataEntradaUtisSvep": "date_new_intensive_care",
                "evolucaoCasoSivep": "_prognosis",
                "dataInicioSintomas": "_date_onset",
                "dataEvolucaoCasoSivep": "_date_update",
                "resultadoFinalExame": "_test_result",
            },
            drop=True,
        )

        # Follow the procedure described in the data documentation to compute the confirmed cases:
        # https://drive.google.com/file/d/1DUwST2zcXUnCJmJauiM5zmpSVWqLiAYI/view
        cases["date_new_confirmed"] = None
        confirmed_mask = cases["_test_result"] == "Positivo"
        cases.loc[confirmed_mask,
                  "date_new_confirmed"] = cases.loc[confirmed_mask,
                                                    "date_new_tested"]

        # Only count intensive care patients if they had a positive test result
        cases.loc[~confirmed_mask, "date_new_intensive_care"] = None

        # Drop columns which we have no use for
        cases = cases[[
            col for col in cases.columns if not col.startswith("_")
        ]]

        # Make sure our region code is of type str
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: None if isna(x) else str(safe_int_cast(x)))

        # Convert ages to int, and translate sex (no "other" sex/gender reported)
        cases["age"] = cases["age"].apply(safe_int_cast)
        cases["sex"] = cases["sex"].apply({
            "MASCULINO": "male",
            "FEMENINO": "female"
        }.get)

        # Convert to time series format
        data = convert_cases_to_time_series(cases,
                                            index_columns=["subregion2_code"])

        # Convert date to ISO format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

        # Aggregate state-level data by adding all municipalities
        state = data.drop(columns=["subregion2_code"]).groupby(
            ["date", "age", "sex"]).sum()
        state.reset_index(inplace=True)
        state["key"] = "BR_CE"

        # Fortaleza is both a subregion of the state and a "locality"
        city = data.loc[data["subregion2_code"] == "230440"].copy()
        city["key"] = "BR_CE_FOR"

        # Drop bogus records from the data
        data = data[~data["subregion2_code"].isna()
                    & (data["subregion2_code"] != "")]

        # We can build the key for the data directly from the subregion code
        data["key"] = "BR_CE_" + data["subregion2_code"]

        return concat([state, data, city])
Example #22
def _age_adapter(age: str) -> str:
    try:
        age = safe_int_cast(str(age).replace("+", "-").split("-")[0])
        return age_group(age, bin_size=10, age_cutoff=70)
    except:
        return "age_unknown"
Example #23
    def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        if parse_opts.get("debug"):
            print("File name:", sources[0])

        # Get the file contents from source
        html_content = open(sources[0]).read()

        # We need to set locale in order to parse dates properly
        locale.setlocale(locale.LC_TIME, parse_opts.get("locale", "en_US") + ".UTF-8")

        # Tables keep changing order, so iterate through all until one looks good
        table_count = count_html_tables(html_content, selector="table.wikitable")

        data: DataFrame = None
        for table_index in range(table_count):
            data = read_html(
                html_content,
                header=True,
                selector="table.wikitable",
                parser=wiki_html_cell_parser,
                table_index=table_index,
                skiprows=parse_opts.get("skiprows", 0),
            )

            if parse_opts.get("debug"):
                print("\n[%d] Data:" % (table_index + 1))
                print(data.columns)
                print(data.head(50))

            # Some of the tables are in Spanish
            data = data.rename(columns={"Fecha": "Date"})

            # Set first date column as index, drop other date columns
            columns_lowercase = [(col or "").lower() for col in data.columns]
            date_index = columns_lowercase.index("date") if "date" in columns_lowercase else 0
            del_index = [i for i, col in enumerate(columns_lowercase) if col == "date"][1:]
            data = data.iloc[:, [i for i, _ in enumerate(data.columns) if i not in del_index]]
            data = data.set_index(data.columns[date_index])
            # data = data.iloc[:, :-parse_opts.get('skipcols', 0)]
            if parse_opts.get("droprows") is not None:
                try:
                    data = data.drop(parse_opts["droprows"].split(","))
                except:
                    pass

            # Pivot the table to fit our preferred format
            data = pivot_table(data, pivot_name="subregion")
            data = data[~data["subregion"].isna()]

            if parse_opts.get("debug"):
                print("\n[%d] Pivoted:" % (table_index + 1))
                print(data.head(50))

            # Make sure all dates include year
            date_format = parse_opts["date_format"]
            if "%Y" not in date_format:
                date_format = date_format + "-%Y"
                data["date"] = data["date"].astype(str) + "-%d" % datetime.datetime.now().year

            # Parse into datetime object, drop if not possible
            data["date"] = data["date"].apply(lambda date: safe_datetime_parse(date, date_format))
            data = data[~data["date"].isna()]

            # If the dataframe is not empty, then we found a good one
            if len(data) > 10 and len(data["subregion"].unique()) > 3:
                break

        # Make sure we have *some* data
        assert data is not None and len(data) > 0

        # Convert all dates to ISO format
        data["date"] = data["date"].apply(lambda date: date.date().isoformat())

        # Get the confirmed and deaths data from the table
        data["confirmed"] = data["value"].apply(lambda x: safe_int_cast(self._parenthesis(x)[0]))
        data["deceased"] = data["value"].apply(lambda x: safe_int_cast(self._parenthesis(x)[1]))

        # Add up all the rows with same Date and subregion
        data = data.sort_values(["date", "subregion"])
        data = (
            data.drop(columns=["value"])
            .groupby(["subregion", "date"])
            .agg(self._aggregate_region_values)
        )
        data = data.reset_index().sort_values(["date", "subregion"])

        # Add the appropriate columns to the data
        value_columns = ["confirmed", "deceased"]
        for col in value_columns:
            data["new_" + col] = None
            data["total_" + col] = None

        # Iterate over the individual subregions to process the values per group
        for region in data["subregion"].unique():
            mask = data["subregion"] == region

            for column in value_columns:

                # We can forward-fill values if data is cumsum
                if parse_opts.get("cumsum"):
                    data.loc[mask, column] = data.loc[mask, column].ffill()

                # Fill NA with zero to allow for column-wide operations
                zero_filled = data.loc[mask, column].fillna(0)

                # Only perform operation if the column is not all NaN
                if sum(zero_filled) > 0:
                    # Compute diff of the values region by region if required
                    if parse_opts.get("cumsum"):
                        data.loc[mask, "new_" + column] = zero_filled.diff()
                        data.loc[mask, "total_" + column] = zero_filled
                    # If values are already new daily counts, then empty most likely means zero
                    else:
                        data.loc[mask, "new_" + column] = zero_filled
                        data.loc[mask, "total_" + column] = zero_filled.cumsum()

        # Get rid of rows which have all null values
        data = data.dropna(how="all", subset=value_columns)

        # Add the country code to all records
        data["country_code"] = parse_opts["country"]

        # Labels can be any arbitrary column name
        data = data.rename(columns={"subregion": "match_string"})

        # Drop columns if requested
        if "drop_column" in parse_opts:
            data = data.drop(columns=[parse_opts["drop_column"]])

        # Filter out the appropriate levels
        aggregation_level = parse_opts.get("aggregation_level", 1)
        if aggregation_level == 1:
            null_column = "subregion2_code"
        elif aggregation_level == 2:
            null_column = "subregion1_code"
        data[null_column] = None

        # Remove known values that are just noise
        data["_match_string"] = data["match_string"].apply(lambda x: x.lower())
        data = data[
            ~data["_match_string"].isin(
                [
                    "cml",
                    "new",
                    "total",
                    "tests",
                    "deaths",
                    "abroad",
                    "airport",
                    "current",
                    "newcases",
                    "acumulado",
                    "repatriated",
                    "totaltested",
                    "confirmed cases",
                    "unassigned\ncases",
                    "airport screening",
                ]
            )
        ]
        data = data[~data["_match_string"].apply(lambda x: "princess" in x or "total" in x)]

        # Output the results
        if parse_opts.get("debug"):
            print("\nOutput:")
            print(data.head(50))

        return data
Example #24
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Read data from GitHub repo
        data = {}
        for name, df in dataframes.items():
            df.rename(columns={"ISO 3166-2 Code": "key"}, inplace=True)
            df.key = df.key.str.replace("-", "_")
            data[name] = df.set_index("key")
        value_columns = [f"total_{statistic}" for statistic in data.keys()]

        # Transform the data from non-tabulated format to record format
        records = []
        for key in data["confirmed"].index.unique():
            date_columns = [
                col for col in data["confirmed"].columns
                if re.match(r"\d\d\d\d-\d\d-\d\d", col)
            ]
            for col in date_columns:
                records.append({
                    **{
                        "date": col,
                        "key": key
                    },
                    **{
                        f"total_{statistic}": safe_int_cast(df.loc[key, col])
                        for statistic, df in data.items()
                    },
                })

        # Zeroes can be considered NaN in this data
        data = DataFrame.from_records(records).replace(0, None)

        # Remove all data without a proper date
        data = data.dropna(subset=["date"])

        # Since it's cumsum data, we can forward fill relatively safely
        data = data.sort_values(["key", "date"
                                 ]).groupby("key").apply(lambda x: x.ffill())

        # We already have AR, BR, CL, CO, MX and PE data directly from the authoritative source
        for country_code in ("AR", "BR", "CL", "CO", "MX", "PE"):
            data = data[~data.key.str.startswith(f"{country_code}_")]

        # Correct the French colonies since we are using something different to the ISO 3166-2 code
        data.loc[data.key == "FR_GP", "key"] = "FR_GUA"

        # VE_W are "external territories" of Venezuela, which can't be mapped to any particular
        # geographical region and therefore we exclude them
        data = data[data.key != "VE_W"]

        # The data appears to be very unreliable prior to March 12
        data.loc[data["date"] <= "2020-03-12", value_columns] = None

        # In many cases, total counts go down dramatically; filter out all dates prior to that
        # We lose a lot of data by setting such a low threshold, but this data source is unreliable
        skip_threshold = 0
        for key in data["key"].unique():
            rm_date = "2020-01-01"
            subset = data[data["key"] == key]
            diffs = subset.set_index("date")[value_columns].diff()
            for col in value_columns:
                max_date = diffs.loc[diffs[col] < skip_threshold].index.max()
                if not isna(max_date) and max_date > rm_date:
                    rm_date = max_date
            data.loc[(data["key"] == key) & (data["date"] < rm_date),
                     value_columns] = None

        return data