Example #1
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = dataframes[0]

        data["date"] = data.REPORT_DATE.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))
        # Add level1 keys
        subregion1s = country_subregion1s(aux["metadata"], "AU")
        data = table_merge([data, subregion1s], left_on="CODE", right_on="subregion1_code", how="left")
        # Country-level record has CODE AUS
        country_mask = data["CODE"] == "AUS"
        data.loc[country_mask, "key"] = "AU"
        # Only keep country and subregion1 rows
        data = data[data.key.notna()]
        data = table_rename(
            data,
            {
                "date": "date",
                "key": "key",
                "VACC_DOSE_CNT": "total_vaccine_doses_administered",
                "VACC_PEOPLE_CNT": "total_persons_fully_vaccinated",
            },
            drop=True)
        # remove rows without vaccination data
        data.dropna(subset=["total_vaccine_doses_administered", "total_persons_fully_vaccinated"], how="all", inplace=True)
        # Based on the assumption that two doses = fully vaccinated (Australia uses Pfizer and AZ)
        data["total_persons_vaccinated"] = estimate_total_persons_vaccinated(data)

        return data
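A note on the helper: estimate_total_persons_vaccinated is not defined in this example. A minimal sketch of what it plausibly computes, assuming the two-dose reasoning from the comment above (Example #17 below inlines the same arithmetic):

from pandas import DataFrame, Series

def estimate_total_persons_vaccinated(data: DataFrame) -> Series:
    # Hypothetical sketch: if every fully vaccinated person received two
    # doses, each remaining dose belongs to a partially vaccinated person,
    # so persons with at least one dose = doses - fully vaccinated.
    return (
        data["total_vaccine_doses_administered"]
        - data["total_persons_fully_vaccinated"]
    )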
Example #2
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        with open(sources[0], "r") as fd:
            features = json.load(fd)["features"]

        records = {"hospitalized": [], "intensive_care": [], "ventilator": []}
        for record in features:

            if record["SERIE"] == "HPT":
                statistic = "hospitalized"
            elif record["SERIE"] == "CSR":
                statistic = "intensive_care"
            elif record["SERIE"] == "CCR":
                statistic = "ventilator"
            else:
                self.log_error(f"Unknown statistic type: {statistic}")
                continue
            records[statistic].append(
                {
                    "date": datetime.fromtimestamp(record["FECHA"] / 1000).date().isoformat(),
                    f"current_{statistic}": record["CV19"],
                }
            )

        dataframes = []
        for df in records.values():
            dataframes.append(DataFrame.from_records(df).groupby("date").sum().reset_index())

        data = table_merge(dataframes, how="outer")
        data["key"] = "ES_CN"
        return data
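table_merge itself is not shown in any of these examples. Its call sites take a list of DataFrames plus pandas merge keywords (on, how, left_on, right_on), which suggests a fold over DataFrame.merge; a minimal sketch under that assumption:

from functools import reduce
from typing import List
from pandas import DataFrame

def table_merge(dataframes: List[DataFrame], **merge_opts) -> DataFrame:
    # Hypothetical sketch: merge the tables pairwise, forwarding the keyword
    # arguments (e.g. on=[...], how="outer") to every pandas merge call.
    return reduce(lambda left, right: left.merge(right, **merge_opts), dataframes)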
Example #3
    def parse_dataframes(
        self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        tables = [df for name, df in dataframes.items() if name != "geo"]
        column_adapter = dict(_column_adapter, state="idxs", date="date_new_confirmed")
        data = table_rename(concat(tables), column_adapter=column_adapter, drop=True)

        # Correct data types where necessary
        data["idxs"] = data["idxs"].astype(str)
        data["age"] = data["age"].apply(lambda x: None if x < 0 else x)
        data["sex"] = data["sex"].apply({0: "female", 1: "male"}.get)

        # Convert to our preferred time series format
        data = convert_cases_to_time_series(data, ["idxs"])

        # Geo name lookup
        geo_col_adapter = {"state": "subregion1_name", "district": "subregion2_name"}
        geo = table_rename(dataframes["geo"], geo_col_adapter, drop=False)
        geo["idxs"] = geo["idxs"].astype(str)
        geo["subregion1_name"] = geo["subregion1_name"].str.replace("W.P. ", "")
        geo = geo.groupby(["subregion1_name", "idxs"]).first().reset_index()
        data = table_merge([data, geo], on=["idxs"], how="inner")

        # Since only the cases have district level data, ignore it
        data["country_code"] = "MY"
        data["subregion2_name"] = None
        return data
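convert_cases_to_time_series also appears in Example #26; judging by both call sites, it turns a line list (one row per case, with date_new_<statistic> columns) into daily counts per group while keeping demographic columns. A rough sketch, assuming age and sex always join the grouping:

from typing import List
from pandas import DataFrame

def convert_cases_to_time_series(cases: DataFrame, index_columns: List[str]) -> DataFrame:
    # Hypothetical sketch: for every date_new_<statistic> column, count the
    # cases reported per date and group, then merge the counts back together.
    group_columns = list(index_columns) + ["age", "sex"]
    tables = []
    for col in [c for c in cases.columns if c.startswith("date_new_")]:
        statistic = col[len("date_"):]  # e.g. "new_confirmed"
        counts = (
            cases.rename(columns={col: "date"})
            .groupby(group_columns + ["date"])
            .size()
            .rename(statistic)
            .reset_index()
        )
        tables.append(counts)
    return table_merge(tables, on=group_columns + ["date"], how="outer")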
Example #4
    def parse_dataframes(
        self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        # Convert the raw data into numeric values
        for df in dataframes.values():
            df["entries"] = df["entries"].apply(safe_int_cast)

        data = table_merge(
            [
                table_rename(df, dict(_column_adapter, entries=name), drop=True)
                for name, df in dataframes.items()
            ],
            on=["date", "subregion1_code"],
            how="outer",
        )

        # Make sure all records have the country code and match subregion1 only
        data["key"] = None
        data["country_code"] = "CH"
        data["subregion2_code"] = None
        data["locality_code"] = None

        # Country-level records have a known key
        country_mask = data["subregion1_code"] == "CH"
        data.loc[country_mask, "key"] = "CH"

        # Principality of Liechtenstein is not in CH but is in the data as FL
        country_mask = data["subregion1_code"] == "FL"
        data.loc[country_mask, "key"] = "LI"

        # Output the results
        return data
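safe_int_cast is used here (and its float/str siblings appear in later examples) to coerce raw values without raising; a minimal sketch of the assumed behavior:

def safe_int_cast(value):
    # Hypothetical sketch: best-effort integer conversion, returning None on
    # failure so downstream dropna() calls can discard unparseable entries.
    try:
        return int(value)
    except (TypeError, ValueError):
        return None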
Example #5
    def parse_dataframes(self, dataframes: Dict[Any, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        data = table_rename(
            dataframes["fullyVaccPersons"],
            {
                "date": "date",
                "geoRegion": "subregion1_code",
                "type": "_statistic",
                "entries": "_new_count",
                "sumTotal": "_total_count",
            },
            drop=True,
        )

        # Combine all the different variable indicators
        tables = []
        for col, var in {
                "COVID19AtLeastOneDosePersons": "persons_vaccinated",
                "COVID19FullyVaccPersons": "persons_fully_vaccinated",
        }.items():
            adapter = {
                "_new_count": f"new_{var}",
                "_total_count": f"total_{var}"
            }
            subset = data.loc[data["_statistic"] == col].drop(
                columns=["_statistic"])
            tables.append(subset.rename(columns=adapter))
        data = table_merge(tables, on=["date", "subregion1_code"], how="outer")

        # Output the results
        return _output_ch_data(data)
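table_rename is the other workhorse shared by these parsers. From the call sites (a column adapter dict, an optional drop flag, plus a remove_regex option in Example #20 that this sketch omits), a plausible minimal version:

from typing import Dict
from pandas import DataFrame

def table_rename(data: DataFrame, column_adapter: Dict[str, str], drop: bool = False) -> DataFrame:
    # Hypothetical sketch: rename columns via the adapter and, when drop is
    # set, keep only the columns the adapter maps to.
    data = data.rename(columns=column_adapter)
    if drop:
        data = data[[col for col in column_adapter.values() if col in data.columns]]
    return data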
Example #6
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        data = table_merge([
            melt(dataframes[name],
                 id_vars=["Date"],
                 var_name="match_string",
                 value_name=value)
            for name, value in [(
                "confirmed", "new_confirmed"), ("deceased", "total_deceased")]
        ])

        data["country_code"] = "JP"

        # Get date in ISO format
        data = data.rename(columns={"Date": "date"})
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y/%m/%d"))

        # Country-level uses the label "ALL"
        country_mask = data["match_string"] == "ALL"
        country = data.loc[country_mask].copy()  # copy to avoid writing to a view
        data = data.loc[~country_mask]
        country["key"] = "JP"

        # Output the results
        return concat([country, data])
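Nearly every parser funnels dates through datetime_isoformat. The dropna calls that follow it imply a contract of "ISO date string on success, None on failure"; a minimal sketch:

from datetime import datetime

def datetime_isoformat(value, date_format: str):
    # Hypothetical sketch: parse with strptime and return "YYYY-MM-DD",
    # or None when the value does not match the expected format.
    try:
        return datetime.strptime(str(value), date_format).date().isoformat()
    except ValueError:
        return None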
Example #7
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_merge(
            [
                dataframes["confirmed_deceased_recovered"].rename(
                    columns=COMMON_COLUMNS, ),
                dataframes["tested"].rename(columns={
                    "TestGesamt": "total_tested",
                    "MeldeDatum": "Time"
                })
            ],
            how="outer",
        )

        # Convert date to ISO format
        data["date"] = data["Time"].apply(
            lambda x: datetime_isoformat(x, "%d.%m.%Y %H:%M:%S"))

        # Create the key from the state ID
        data["key"] = data["BundeslandID"].apply(lambda x: f"AT_{x}")

        data.loc[data["key"] == "AT_10", "key"] = "AT"

        # Output the results
        return data
Example #8
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        with open(sources[0], "r") as fd:
            features = json.load(fd)["features"]

        records = {"confirmed": [], "deceased": [], "recovered": []}
        for record in features:
            if record["TIPO"] == "Casos":
                statistic = "confirmed"
            elif record["TIPO"] == "Fallecidos":
                statistic = "deceased"
            elif record["TIPO"] == "Recuperados":
                statistic = "recovered"
            else:
                self.log_error(f"Unknown statistic type: {statistic}")
                continue
            records[statistic].append({
                "date": datetime.fromtimestamp(record["FECHA"] / 1000).date().isoformat(),
                "subregion2_code": record["CODMUN"],
                "subregion2_name": record["MUNICIPIO"],
                f"new_{statistic}": record["CV19_DIA"],
                f"total_{statistic}": record["CV19_AC"],
                "_island": record["ISLA"],
            })

        dataframes = [DataFrame.from_records(df) for df in records.values()]
        data = table_merge(dataframes, how="outer")
        data["key"] = "ES_CN_" + data["subregion2_code"].astype(str)

        # Add the country and region code to all records
        data["country_code"] = "ES"
        data["subregion1_code"] = "CN"

        # Aggregate by island and map to known key
        islands = (data.drop(
            columns=["key", "subregion2_code", "subregion2_name"]).groupby(
                ["date", "_island"]).sum().reset_index())
        islands["key"] = "ES_CN_" + islands["_island"].apply(_island_map.get)

        # Aggregate the entire autonomous community
        l1 = islands.drop(
            columns=["key", "_island"]).groupby("date").sum().reset_index()
        l1["key"] = "ES_CN"

        # Drop bogus values
        data = data[data["subregion2_code"] != 0]
        islands = islands[~islands["key"].isna()]

        return concat([data, islands, l1])
Example #9
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_merge(
            [
                table_rename(
                    dataframes["confirmed"],
                    {
                        "Fecha": "date",
                        "Total": "new_confirmed",
                        "Region": "match_string"
                    },
                    drop=True,
                ),
                table_rename(
                    dataframes["deceased"],
                    # The file name indicates the counts are cumulative, but they are not
                    {
                        "Fecha": "date",
                        "Total": "total_deceased",
                        "Region": "match_string"
                    },
                    drop=True,
                ),
                table_rename(
                    dataframes["tested"],
                    {
                        "Fecha": "date",
                        "numero": "new_tested",
                        "Region": "match_string"
                    },
                    drop=True,
                ),
            ],
            how="outer",
        )

        # Convert date to ISO format
        data["date"] = data["date"].astype(str)

        # Extract cities from the regions
        city = _extract_cities(data)

        # Make sure all records have country code and no subregion code or key
        data["country_code"] = "CL"
        data["key"] = None
        data["subregion2_code"] = None

        # Country is reported as "Total"
        data.loc[data["match_string"] == "Total", "key"] = "CL"

        # Drop bogus records from the data
        data.dropna(subset=["date", "match_string"], inplace=True)

        return concat([data, city])
Example #10
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data_list = []
        for statistic, source_file in sources.items():
            with open(source_file, "r") as fd:
                df = DataFrame.from_records(json.load(fd)["values"])
            data_list.append(table_rename(df, {"value": statistic}))

        data = table_merge(data_list, how="outer")
        data["key"] = "RO"
        return data
Example #11
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = table_merge(
            [
                table_rename(
                    dataframes["confirmed"],
                    {
                        "Fecha": "date",
                        "Casos confirmados": "total_confirmed",
                        "Codigo region": "subregion1_code",
                        "Codigo comuna": "subregion2_code",
                    },
                    drop=True,
                ),
                table_rename(
                    dataframes["deceased"],
                    {
                        "Fecha": "date",
                        "Casos fallecidos": "total_deceased",
                        "Codigo region": "subregion1_code",
                        "Codigo comuna": "subregion2_code",
                    },
                    drop=True,
                ),
            ],
            how="outer",
        )

        # Convert date to ISO format
        data["date"] = data["date"].astype(str)

        # Parse region codes as strings
        data["subregion1_code"] = data["subregion1_code"].apply(
            lambda x: numeric_code_as_string(x, 2)
        )
        data["subregion2_code"] = data["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 5)
        )

        # Use proper ISO codes for the subregion1 level
        data["subregion1_code"] = data["subregion1_code"].apply(_SUBREGION1_CODE_MAP.get)

        # Extract cities from the municipalities
        city = _extract_cities(data)

        # We can build the key for the data directly from the subregion codes
        data["key"] = "CL_" + data["subregion1_code"] + "_" + data["subregion2_code"]

        # Drop bogus records from the data
        data.dropna(subset=["subregion1_code", "subregion2_code"], inplace=True)

        return concat([data, city])
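numeric_code_as_string zero-pads numeric region codes, so that e.g. region 5 becomes "05" and a five-digit comuna code keeps its leading zero; a minimal sketch under that assumption:

def numeric_code_as_string(code, width: int):
    # Hypothetical sketch: cast to int (normalizing floats such as 5.0),
    # zero-pad to the requested width, and return None for missing values.
    try:
        return str(int(code)).zfill(width)
    except (TypeError, ValueError):
        return None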
Example #12
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts):
        # Read all files in the eurostat folder and merge them together
        eurostat_directory = SRC / "data" / "eurostat"
        dataframes = [
            read_file(file_name)
            for file_name in eurostat_directory.glob("*.csv")
        ]
        data = table_merge(dataframes, how="outer").dropna(subset=["key"])

        # Use only keys available in metadata
        return data.merge(aux["metadata"][["key"]], how="inner")
Example #13
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_rename(dataframes[0], _column_adapter, drop=True)
        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))
        # add location keys
        subregion1s = country_subregion1s(aux["metadata"], "IN")
        data = table_merge([data, subregion1s[["key", "subregion1_name"]]],
                           on=["subregion1_name"],
                           how="inner")
        return data
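country_subregion1s pulls a country's level-1 regions from the auxiliary metadata table; judging by the merges here and in Examples #1 and #19 (on key, subregion1_name or subregion1_code), a plausible sketch:

from pandas import DataFrame

def country_subregion1s(metadata: DataFrame, country_code: str) -> DataFrame:
    # Hypothetical sketch: metadata rows for this country that sit exactly at
    # the subregion1 level (a level-1 code present, no level-2 code).
    mask = (
        (metadata["country_code"] == country_code)
        & metadata["subregion1_code"].notna()
        & metadata["subregion2_code"].isna()
    )
    return metadata.loc[mask]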
Example #14
def _get_data(url_tpl: str, subregion_code_col: str, subregion_code_to_api_id_map: Dict[str, int],
              subregions: DataFrame) -> DataFrame:
    subregion_codes = subregions[subregion_code_col].values
    map_func = partial(_get_records, url_tpl, subregion_code_to_api_id_map)
    data = DataFrame.from_records(sum(thread_map(map_func, subregion_codes), []))
    data['date'] = data.apply(lambda r: _indonesian_date_to_isoformat(r.tgl), axis=1)
    # add location keys
    data = table_merge(
        [data, subregions],
        left_on="subregion_code", right_on=subregion_code_col, how="left")
    data = table_rename(data, _col_name_map, drop=True)
    return data
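_get_records is not shown, but the fan-out pattern around it is worth spelling out: partial binds the static arguments, tqdm's thread_map runs the per-subregion fetches across worker threads, and sum(..., []) flattens the resulting list of record lists. A self-contained illustration with a stub worker (the URL is made up):

from functools import partial
from tqdm.contrib.concurrent import thread_map

def _fetch_records(url_tpl: str, code: str) -> list:
    # Stub standing in for the real per-subregion API call.
    return [{"subregion_code": code, "tgl": "01-01-2021"}]

map_func = partial(_fetch_records, "https://example.com/api/{}")
records = sum(thread_map(map_func, ["ID-JK", "ID-JB"]), [])  # flatten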
Example #15
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Keep only columns we can process
        data = table_merge(
            [_parse_pivot(df, name) for name, df in dataframes.items()],
            how="outer")
        data = data[[
            "date", "country_code", "match_string", "new_confirmed",
            "new_deceased"
        ]]
        return data.fillna(0)
Example #16
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        tables = [
            table_rename(table, _column_adapter, drop=True)
            for table in dataframes.values()
        ]
        data = table_merge(tables, on="date", how="outer")

        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y/%m/%d"))
        data["key"] = "US_CA_SFO"
        return data
Example #17
    def parse_dataframes(
        self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        data = table_merge(
            [
                table_rename(
                    dataframes['vaccDosesAdministered'],
                    {
                        "date": "date",
                        "geoRegion": "subregion1_code",
                        "sumTotal": "total_vaccine_doses_administered",
                    },
                    drop=True,
                ),
                table_rename(
                    dataframes['fullyVaccPersons'],
                    {
                        "date": "date",
                        "geoRegion": "subregion1_code",
                        "sumTotal": "total_persons_fully_vaccinated",
                    },
                    drop=True,
                ),
            ],
            on=["date", "subregion1_code"],
            how="outer",
        )

        # Assuming fully and partially vaccinated persons have 2 and 1 doses respectively,
        # total_persons_partially_vaccinated = total_vaccine_doses_administered - 2 * total_persons_fully_vaccinated
        # Therefore, total_persons_vaccinated = total_persons_partially_vaccinated + total_persons_fully_vaccinated
        # = total_vaccine_doses_administered - total_persons_fully_vaccinated
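        # e.g. 150 doses with 50 fully vaccinated persons implies 50 partially
        # vaccinated (150 - 2 * 50) and 100 total vaccinated (150 - 50)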
        data["total_persons_vaccinated"] = data["total_vaccine_doses_administered"] - data["total_persons_fully_vaccinated"]

        # Make sure all records have the country code and match subregion1 only
        data["key"] = None
        data["country_code"] = "CH"
        data["subregion2_code"] = None
        data["locality_code"] = None

        # Country-level records have a known key
        country_mask = data["subregion1_code"] == "CH"
        data.loc[country_mask, "key"] = "CH"

        # Principality of Liechtenstein is not in CH but is in the data as FL
        country_mask = data["subregion1_code"] == "FL"
        data.loc[country_mask, "key"] = "LI"

        # Output the results
        return data
Example #18
def _process_inputs(dataframes: Dict[Any, DataFrame]) -> DataFrame:
    # Combine all tables
    data = table_merge(dataframes.values(), how="outer")
    data = table_rename(data, _column_adapter, drop=True)
    data["country_code"] = "MY"

    # Remove records with no date
    data = data.dropna(subset=["date"])

    # Fix the subregion names to match our index
    if "subregion1_name" in data.columns:
        data["subregion1_name"] = data["subregion1_name"].str.replace("W.P. ", "")

    # Add up different categories
    if "new_tested_1" in data.columns and "new_tested_2" in data.columns:
        data["new_tested"] = data["new_tested_1"] + data["new_tested_2"]

    return data
Example #19
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = dataframes[0]
        # Flatten the table
        data = melt(data,
                    id_vars=["State"],
                    var_name="date",
                    value_name='total_vaccine_doses_administered')
        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))
        # add location keys
        subregion1s = country_subregion1s(aux["metadata"], "IN")
        data = table_merge([data, subregion1s[['key', 'subregion1_name']]],
                           left_on="State",
                           right_on='subregion1_name',
                           how="inner")
        return data
Example #20
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        tables = []
        rename_opts = dict(drop=True, remove_regex=r"[^0-9a-z\s]")
        name_map = {
            "Cases": "confirmed",
            "Deaths": "deceased",
            "Hospitalizations": "hospitalized"
        }
        for sheet_name, stat_name in name_map.items():
            col_name = f"_{stat_name}_"
            col_adapter = {
                k: v.replace("_stat_", col_name)
                for k, v in _col_adapter_base.items()
            }
            table = table_rename(dataframes[0][sheet_name], col_adapter,
                                 **rename_opts)
            table["date"] = table["date"].apply(lambda x: str(x)[:10])
            tables.append(table)

        data = table_merge(tables, how="outer", on=["date", "subregion2_name"])
        state = data.drop(columns=["subregion2_name"]).groupby(
            ["date"]).sum().reset_index()
        state["key"] = "US_WA"

        data = data[data["subregion2_name"] != "Unassigned"]
        data["country_code"] = "US"
        data["subregion1_code"] = "WA"

        for df in (state, data):
            df["age_bin_00"] = "0-11"
            df["age_bin_01"] = "12-19"
            df["age_bin_02"] = "20-34"
            df["age_bin_03"] = "35-49"
            df["age_bin_04"] = "50-64"
            df["age_bin_05"] = "65-79"
            df["age_bin_06"] = "80-"

        return concat([state, data])
Example #21
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_merge(
            [
                table_rename(
                    dataframes["confirmed"], _column_adapter, drop=True),
                table_rename(
                    dataframes["deceased"], _column_adapter, drop=True),
            ],
            how="outer",
        )

        # Province names are sometimes codes (but not always compliant with ISO codes)
        data["subregion1_code"] = data["subregion1_name"].apply(
            _province_map.get)
        data.drop(columns=["subregion1_name"], inplace=True)

        # Convert date to ISO format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%d-%m-%Y"))

        # Aggregate subregion1 level
        l1_index = ["date", "subregion1_code"]
        l1 = data.drop(
            columns=["match_string"]).groupby(l1_index).sum().reset_index()

        # Make sure all records have the country code and subregion2_name
        l1["country_code"] = "CA"
        l1["subregion2_name"] = None
        data["country_code"] = "CA"
        data["subregion2_name"] = ""

        # Remove bogus data
        data = data[data["match_string"] != "Not Reported"]

        # Output the results
        return concat([l1, data])
Example #22
    def parse_dataframes(self, dataframes: Dict[Any, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        data = table_rename(dataframes[0], _column_adapter, drop=True)

        # Convert date to ISO format
        data["date"] = data["year"].apply(
            lambda x: datetime.datetime.strptime(str(x), "%Y"))
        data["date"] = data["date"] + data["week"].apply(
            lambda x: datetime.timedelta(weeks=x))
        data["date"] = data["date"].apply(lambda x: x.date().isoformat())
        data = data.drop(columns=["week", "year"])

        # Process 1-dose and 2-dose separately
        data_1_dose = data[data["_dose_type"].str.slice(-1) == "1"].drop(
            columns=["_dose_type"])
        data_2_dose = data[data["_dose_type"].str.slice(-1) == "2"].drop(
            columns=["_dose_type"])
        data_1_dose = data_1_dose.rename(
            columns={"_total_doses": "total_persons_vaccinated"})
        data_2_dose = data_2_dose.rename(
            columns={"_total_doses": "total_persons_fully_vaccinated"})
        data = table_merge([data_1_dose, data_2_dose], how="outer")

        # Make sure only subregion1 matches
        data["key"] = None
        data["country_code"] = "SE"
        data["subregion2_code"] = None
        data["locality_code"] = None

        # Country totals are reported using a special name
        data.loc[data["match_string"] == "| Sverige |", "key"] = "SE"

        # Estimate the total doses from person counts
        data["total_vaccine_doses_administered"] = (
            data["total_persons_vaccinated"] +
            data["total_persons_fully_vaccinated"])

        return data
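A caveat on the date arithmetic above: strptime(str(year), "%Y") yields January 1, so adding timedelta(weeks=week) only approximates week boundaries. If the source reports ISO week numbers, Python 3.8+ can convert exactly:

from datetime import date

# ISO year 2021, week 1, Monday: the approximation above gives 2021-01-08
# (Jan 1 + 1 week), while the ISO calendar places it on 2021-01-04.
print(date.fromisocalendar(2021, 1, 1).isoformat())  # 2021-01-04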
Example #23
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        column_adapter = {
            "Requiring inpatient care": "new_hospitalized",
            "Discharged from hospital or released from treatment":
            "new_recovered",
        }

        tables = []
        for col_prev, col_value in column_adapter.items():
            keep_cols = [
                col for col in dataframes[0].columns if col_prev in col
            ]
            df = dataframes[0][["Date"] + keep_cols]
            df = melt(df,
                      id_vars=["Date"],
                      var_name="match_string",
                      value_name=col_value)
            df["match_string"] = df["match_string"].apply(
                lambda x: x.split(" ")[0][1:-1])
            tables.append(df)

        data = table_merge(tables)
        data["country_code"] = "JP"

        # Get date in ISO format
        data = data.rename(columns={"Date": "date"})
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y/%m/%d"))

        # Country-level uses the label "ALL"
        country_mask = data["match_string"] == "ALL"
        country = data.loc[country_mask].copy()  # copy to avoid writing to a view
        data = data.loc[~country_mask]
        country["key"] = "JP"

        # Output the results
        return concat([country, data])
Example #24
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_merge(
            [
                Covid19ZaCumulativeDataSource._parse_variable(df, name)
                for name, df in dataframes.items()
            ],
            how="outer",
        )

        # Convert date to ISO format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%d-%m-%Y"))

        # Country-level records should have "total" region name
        country_mask = data["subregion1_code"] == "total"
        data.loc[country_mask, "key"] = "ZA"

        # All other records can provide their own key directly
        data.loc[~country_mask, "key"] = "ZA_" + data.subregion1_code

        # Output the results
        return data
Example #25
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        sheets = []
        sheet_processors = {
            "Trends": TexasDataSource._parse_trends,
            "Tests by Day": TexasDataSource._parse_tests,
            "Hospitalization by Day": TexasDataSource._parse_hospitalized,
        }
        for sheet_name, sheet_processor in sheet_processors.items():
            df = sheet_processor(read_file(sources[0], sheet_name=sheet_name))
            df["date"] = df["date"].apply(safe_str_cast)
            df["date"] = df["date"].apply(
                lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
            df = df.dropna(subset=["date"])
            sheets.append(df)

        data = table_merge(sheets, how="outer")
        for col in data.columns:
            if col != "date":
                data[col] = data[col].apply(safe_float_cast).astype(float)

        data["key"] = "US_TX"
        return data
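safe_float_cast, and the safe_str_cast used above and in Example #26, presumably mirror the safe_int_cast sketched after Example #4; minimal versions:

def safe_float_cast(value):
    # Hypothetical sketch: best-effort float conversion, None on failure.
    try:
        return float(value)
    except (TypeError, ValueError):
        return None

def safe_str_cast(value):
    # Hypothetical sketch: stringify everything except missing values.
    return None if value is None else str(value)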
Example #26
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        cases_confirmed = table_rename(
            dataframes["confirmed"], _column_adapter,
            drop=True).rename(columns={"date": "date_new_confirmed"})
        cases_deceased = table_rename(
            dataframes["deceased"], _column_adapter,
            drop=True).rename(columns={"date": "date_new_deceased"})

        # Translate sex label
        for df in (cases_confirmed, cases_deceased):
            df["sex"] = df["sex"].apply({
                "MASCULINO": "male",
                "FEMENINO": "female"
            }.get)

        # Convert to time series
        index_columns = ["subregion1_name", "province_name", "subregion2_name"]
        data_confirmed = convert_cases_to_time_series(cases_confirmed,
                                                      index_columns)
        data_deceased = convert_cases_to_time_series(cases_deceased,
                                                     index_columns)

        # Join into a single dataset
        data = table_merge([data_confirmed, data_deceased], how="outer")

        # Remove bogus records
        data.dropna(subset=["date"], inplace=True)

        # Set country code and get date in ISO format
        data["country_code"] = "PE"
        data["date"] = data["date"].apply(safe_int_cast)
        data["date"] = data["date"].apply(safe_str_cast)
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y%m%d"))

        # Properly capitalize department to allow for exact matching
        data["subregion1_name"] = data["subregion1_name"].apply(
            lambda x: _department_map.get(x, x.title()))

        # Lima region and lima department are mixed in data, we can distinguish based on province
        # Sometimes region is something different, so for Lima province we only need `province_name`
        lima_region_mask = data["subregion1_name"].str.lower() == "lima"
        lima_province_mask = data["province_name"].str.lower() == "lima"
        data.loc[lima_province_mask,
                 "subregion1_name"] = "Metropolitan Municipality of Lima"
        data.loc[lima_region_mask & ~lima_province_mask,
                 "subregion1_name"] = "Lima Region"

        # Aggregate by admin level 1
        subregion1 = (data.drop(
            columns=["subregion2_name", "province_name"]).groupby(
                ["date", "country_code", "subregion1_name", "age",
                 "sex"]).sum().reset_index())
        subregion1["subregion2_name"] = None

        # Try to match based on subregion2_name using fuzzy matching, and set subregion2_name to
        # an empty string to turn off exact matching
        data = data.rename(columns={"subregion2_name": "match_string"})
        data["subregion2_name"] = ""

        # Convert other text fields to lowercase for consistent processing
        data["match_string"] = data["match_string"].apply(fuzzy_text)
        data["province_name"] = data["province_name"].apply(fuzzy_text)

        # Drop bogus records
        data = data[~data["match_string"].isna()]
        data = data[~data["match_string"].
                    isin(["", "eninvestigacion", "extranjero"])]

        # Because we are skipping provinces and going directly from region to district, there are
        # some name collisions which we have to disambiguate manually
        for province1, province2, district in [
            ("lima", "canete", "sanluis"),
            ("lima", "yauyos", "miraflores"),
            ("ica", "chincha", "pueblonuevo"),
            ("canete", "huarochiri", "sanantonio"),
            ("bolognesi", "huaylas", "huallanca"),
            ("lucanas", "huancasancos", "sancos"),
            ("santacruz", "cutervo", "santacruz"),
            ("yauli", "jauja", "yauli"),
            ("yauli", "jauja", "paccha"),
            ("huarochiri", "yauyos", "laraos"),
            ("elcollao", "melgar", "santarosa"),
        ]:
            for province in (province1, province2):
                mask = (data["province_name"]
                        == province) & (data["match_string"] == district)
                data.loc[mask, "match_string"] = f"{district}, {province}"

        # Output the results
        return concat([subregion1, data])
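The district names above ("sanluis", "eninvestigacion") show what fuzzy_text produces: lowercase, accent-free, non-letters removed. A sketch matching that behavior:

import re
from unicodedata import normalize

def fuzzy_text(text):
    # Hypothetical sketch: strip accents, lowercase, and drop anything that
    # is not a letter, so "San Luis" becomes "sanluis".
    if not isinstance(text, str):
        return None
    text = normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^a-z]", "", text.lower())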