Example #1
    def parse_dataframes(
        self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = dataframes[0].rename(
            columns={
                "日付": "date",
                "都道府県名": "match_string",
                "患者数": "confirmed",
                "入院中": "hospitalized",
                "退院者": "recovered",
                "死亡者": "deceased",
            }
        )

        # Convert date to ISO format
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y%m%d"))

        # Add the country code to all records
        data["country_code"] = "JP"

        # Keep only columns we can process
        data = data[["date", "match_string", "confirmed", "hospitalized", "recovered", "deceased"]]

        # Aggregate the region-level data
        data = grouped_cumsum(data, ["country_code", "match_string", "date"])

        # Aggregate the country-level data
        data_country = data.groupby("date").sum().reset_index()
        data_country["key"] = "JP"

        # Output the results
        return concat([data_country, data])
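Note: grouped_cumsum is a shared helper used by nearly every example in this section but defined in none of them. Below is a minimal sketch of its presumed behavior, inferred purely from the call sites; the sorting, the choice of value columns, and the skip parameter (used in Example #9) are assumptions, not the project's actual implementation.

    from typing import List, Optional
    from pandas import DataFrame

    def grouped_cumsum(
        data: DataFrame, keys: List[str], skip: Optional[List[str]] = None
    ) -> DataFrame:
        # Assumed behavior: cumulative sum of every value column within each
        # group, ordered by date; columns listed in `skip` pass through as-is
        skip = skip or []
        value_columns = [c for c in data.columns if c not in keys and c not in skip]
        data = data.sort_values(keys)
        group_keys = [k for k in keys if k != "date"]
        for col in value_columns:
            data[col] = data.groupby(group_keys)[col].cumsum()
        return data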
Example #2
    def _parse(file_path: str, sheet_name: str, value_name: str):
        data = read_file(file_path, sheet_name=sheet_name)
        data.columns = [
            col.replace("NHS ", "").replace(" total", "")
            for col in data.iloc[1]
        ]
        data = data.iloc[2:].rename(columns={"Date": "date"})

        data = pivot_table(data.set_index("date"), pivot_name="match_string")
        data = data.rename(columns={"value": value_name})
        data[value_name] = data[value_name].replace(
            "*", None).apply(safe_float_cast).astype(float)

        # Get date in ISO format
        data.date = data.date.apply(lambda x: x.date().isoformat())

        # Compute cumsum of values
        data = grouped_cumsum(data, ["match_string", "date"])

        # Add metadata
        data["key"] = None
        data["country_code"] = "GB"
        data["subregion1_code"] = "SCT"
        l2_mask = data.match_string == "Scotland"
        data.loc[l2_mask, "key"] = "GB_SCT"

        return data
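Note: pivot_table and safe_float_cast are also shared helpers. The sketches below are inferred from this example's call sites (a date-indexed wide table unpivoted into match_string/value rows, and a float cast that tolerates bad input such as "*"); anything beyond what the call sites show is an assumption.

    from typing import Any, Optional
    from pandas import DataFrame

    def pivot_table(data: DataFrame, pivot_name: str = "pivot") -> DataFrame:
        # Unpivot a wide table: the index (e.g. "date") becomes an identifier
        # column, column names go into `pivot_name`, cell values into "value"
        index_name = data.index.name or "index"
        return data.reset_index().melt(
            id_vars=[index_name], var_name=pivot_name, value_name="value"
        )

    def safe_float_cast(value: Any) -> Optional[float]:
        # Best-effort float conversion; None signals an unparseable value
        try:
            return float(value)
        except (TypeError, ValueError):
            return None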
Example #3
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        data = dataframes[0].rename(
            columns={
                "Codigo DIVIPOLA": "subregion2_code",
                "Fecha de muerte": "date_deceased",
                "Fecha diagnostico": "date_confirmed",
                "Fecha recuperado": "date_recovered",
            })

        # Clean up the subregion code
        data.subregion2_code = data.subregion2_code.apply(
            lambda x: "{0:05d}".format(int(x)))

        # Compute the key from the DIVIPOLA code
        data["key"] = ("CO_" + data.subregion2_code.apply(lambda x: x[:2]) +
                       "_" + data.subregion2_code)

        # A few cases are at the l2 level
        data.key = data.key.apply(lambda x: "CO_" + x[-2:]
                                  if x.startswith("CO_00_") else x)

        # Go from individual case records to key-grouped records in a flat table
        merged = None
        for value_column in ("confirmed", "deceased", "recovered"):
            subset = data.rename(
                columns={"date_{}".format(value_column): "date"})[["key", "date"]]
            subset = subset[~subset.date.isna()
                            & (subset.date != "-   -")].dropna()
            subset[value_column] = 1
            subset = subset.groupby(["key", "date"]).sum().reset_index()
            if merged is None:
                merged = subset
            else:
                merged = merged.merge(subset, how="outer")

        # Convert date to ISO format
        merged.date = merged.date.apply(safe_datetime_parse)
        merged = merged[~merged.date.isna()]
        merged.date = merged.date.apply(lambda x: x.date().isoformat())
        merged = merged.fillna(0)

        # Compute the daily counts
        data = grouped_cumsum(merged, ["key", "date"])

        # Group by level 2 region, and add the parts
        l2 = data.copy()
        l2["key"] = l2.key.apply(lambda x: "_".join(x.split("_")[:2]))
        l2 = l2.groupby(["key", "date"]).sum().reset_index()

        # Group by country level, and add the parts
        l1 = l2.copy().drop(columns=["key"])
        l1 = l1.groupby("date").sum().reset_index()
        l1["key"] = "CO"

        # Output the results with all aggregation levels
        return concat([data, l2, l1])
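To make the key construction in Example #3 concrete, here is how the two steps behave on illustrative DIVIPOLA codes (the specific codes are examples, not taken from the data):

    # A full 5-digit municipality code, first two digits = department:
    #   "CO_" + "05001"[:2] + "_" + "05001"  ->  "CO_05_05001"
    # A department-level record zero-padded to "00005" first becomes
    # "CO_00_00005", which the fallback rewrites to "CO_" + "00005"[-2:]:
    #   "CO_00_00005"  ->  "CO_05"
    assert "CO_" + "05001"[:2] + "_" + "05001" == "CO_05_05001"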
Example #4
    def parse_dataframes(
        self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        df1 = Jp2019NcovJapanByDate._parse_pivot(dataframes[0], "confirmed")
        df2 = Jp2019NcovJapanByDate._parse_pivot(dataframes[1], "deceased")

        # Keep only columns we can process
        data = merge(df1, df2)
        data = data[["date", "country_code", "match_string", "confirmed", "deceased"]]
        return grouped_cumsum(data, ["country_code", "match_string", "date"])
Example #5
    def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = read_file(sources[0], sheet_name="Antal intensivvårdade per dag").rename(
            columns={"Datum_vårdstart": "date", "Antal_intensivvårdade": "intensive_care"}
        )

        # Get date in ISO format
        data["key"] = "SE"
        data.date = data.date.apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))
        return grouped_cumsum(data, ["key", "date"])
Example #6
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        data = read_file(
            sources[0], error_bad_lines=False, encoding="ISO-8859-1", sep=";"
        ).rename(
            columns={
                "Date": "date",
                "Nombre de personnes en soins normaux": "current_hospitalized",
                "Nombre de personnes en soins intensifs (sans patients du Grand Est)": "current_intensive_care",
                "Nombre de décès - cumulé (sans patients du Grand Est)": "deceased",
                "Total patients COVID ayant quitté l'hôpital (hospitalisations stationnaires, données brutes)": "recovered",
                "Nombre de nouvelles personnes testées COVID+ par jour ": "tested",
            })

        # Get date in ISO format
        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Keep only columns we can process
        data = data[[
            "date",
            "current_hospitalized",
            "current_intensive_care",
            "deceased",
            "recovered",
            "tested",
        ]]

        # Convert recovered into a number
        data.recovered = data.recovered.apply(
            lambda x: int(x.replace("-", "0")))

        # Compute daily deceased from the cumulative count, and cumulative
        # sums for the daily tested and recovered counts
        data["key"] = "LU"
        data_new = grouped_diff(data[["key", "date", "deceased"]],
                                ["key", "date"])
        data_cum = grouped_cumsum(data[["key", "date", "tested", "recovered"]],
                                  ["key", "date"])
        data_cur = data[[
            "key", "date", "current_hospitalized", "current_intensive_care"
        ]]
        data = data_new.merge(data_cum, how="outer").merge(data_cur,
                                                           how="outer")

        # Output the results
        return data
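Note: grouped_diff appears to be the counterpart of grouped_cumsum (sketched after Example #1), turning cumulative series into per-day deltas within each group. As before, this is a sketch inferred from the call site, not the actual implementation:

    from typing import List
    from pandas import DataFrame

    def grouped_diff(data: DataFrame, keys: List[str]) -> DataFrame:
        # Assumed behavior: first difference of each value column within each
        # group; a group's first record keeps its original value unchanged
        value_columns = [c for c in data.columns if c not in keys]
        data = data.sort_values(keys)
        group_keys = [k for k in keys if k != "date"]
        for col in value_columns:
            data[col] = data.groupby(group_keys)[col].diff().fillna(data[col])
        return data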
Example #7
    def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = read_file(sources[0], sheet_name="Antal per dag region").rename(
            columns={"Statistikdatum": "date"}
        )

        # Get the date in ISO format as a string
        data.date = data.date.astype(str)

        # Unpivot the regions, which are columns
        data.columns = [col.replace("_", " ") for col in data.columns]
        data = data.drop(columns=["Totalt antal fall"]).set_index("date")
        data = pivot_table(data, pivot_name="match_string")

        data["country_code"] = "SE"
        data = data.rename(columns={"value": "confirmed"})
        return grouped_cumsum(data, ["country_code", "match_string", "date"])
Example #8
    def parse_dataframes(self, dataframes: List[DataFrame],
                         metadata: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        data = dataframes[0]
        metadata = metadata["metadata"]

        # Ensure date field is used as a string
        data["dateRep"] = data["dateRep"].astype(str)

        # Convert date to ISO format
        data["date"] = data["dateRep"].apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Workaround for https://github.com/open-covid-19/data/issues/8
        # ECDC mistakenly labels Greece country code as EL instead of GR
        data["geoId"] = data["geoId"].apply(lambda code: "GR"
                                            if code == "EL" else code)

        # Workaround for https://github.com/open-covid-19/data/issues/13
        # ECDC mistakenly labels Great Britain country code as UK instead of GB
        data["geoId"] = data["geoId"].apply(lambda code: "GB"
                                            if code == "UK" else code)

        # Remove bogus entries (cruise ships, etc.)
        data = data[~data["geoId"].apply(lambda code: len(code) > 2)]

        data = data.rename(columns={
            "geoId": "key",
            "cases": "confirmed",
            "deaths": "deceased"
        })

        # Adjust the date of the records to match local reporting
        data = self._adjust_date(data, metadata)

        # Keep only the columns we can process
        data = data[["date", "key", "confirmed", "deceased"]]

        return grouped_cumsum(data, ["key", "date"])
Example #9
    def parse_dataframes(
        self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        # Rename the appropriate columns
        data = dataframes[0].rename(
            columns={
                "Date": "date",
                "Tested (all)": "total_tested",
                "Tested (daily)": "new_tested",
                "Positive (all)": "total_confirmed",
                "Positive (daily)": "new_confirmed",
                "All hospitalized on certain day": "current_hospitalized",
                "All persons in intensive care on certain day": "active_intensive_care",
                "Discharged": "recovered",
                "Deaths (all)": "total_deceased",
                "Deaths (daily)": "new_deceased",
            }
        )

        # Make sure all records have the country code
        data["country_code"] = "SI"

        # Make sure that the date column is a string
        data.date = data.date.astype(str)

        # Compute the cumsum counts
        data = grouped_cumsum(
            data,
            ["country_code", "date"],
            skip=[
                col
                for col in data.columns
                if any(kword in col for kword in ("new", "total", "active"))
            ],
        )

        # Output the results
        return data
Example #10
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = None
        ordered_columns = [
            "confirmed", "deceased", "tested", "hospitalized", "intensive_care"
        ]
        for column_name, df in zip(ordered_columns, dataframes):
            df = df.rename(columns={"Fecha": "date"}).set_index("date")
            df = pivot_table(df, pivot_name="match_string").rename(
                columns={"value": column_name})
            if data is None:
                data = df
            else:
                data = data.merge(df, how="left")

        # Compute the cumsum of data
        data = grouped_cumsum(data, ["match_string", "date"])
        data["country_code"] = "MX"

        # Country-level records have a specific label
        data.loc[data.match_string == "Nacional", "key"] = "MX"

        return data
Example #11
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename the appropriate columns
        data = dataframes[0].rename(
            columns={
                "jour": "date",
                "dep": "subregion2_code",
                "p": "confirmed",
                "t": "tested",
                "incid_hosp": "hospitalized",
                "incid_dc": "deceased",
                "incid_rad": "recovered",
            })

        # Add subregion1_code field to all records
        data["subregion1_code"] = ""

        # Adjust for special regions
        region_adjust_map = {
            "971": "GUA",
            "972": "MQ",
            "973": "GF",
            "974": "LRE",
            "976": "MAY"
        }
        for subregion2_code, subregion1_code in region_adjust_map.items():
            mask = data.subregion2_code == subregion2_code
            data.loc[mask, "subregion2_code"] = None
            data.loc[mask, "subregion1_code"] = subregion1_code

        # Get the date in ISO format as a string
        data.date = data.date.astype(str)

        # Get keys from metadata auxiliary table
        data["country_code"] = "FR"
        subregion1_mask = data.subregion2_code.isna()
        data1 = data[subregion1_mask].merge(aux["metadata"],
                                            on=("subregion1_code",
                                                "subregion2_code"))
        data2 = data[~subregion1_mask].merge(aux["metadata"],
                                             on="subregion2_code")
        data = concat([data1, data2])

        # Keep only the key-date identifier pair plus the value columns
        keep_columns = [
            "date", "key", "confirmed", "tested", "deceased", "hospitalized"
        ]
        data = data[[col for col in data.columns if col in keep_columns]]

        # Compute the daily counts
        data = grouped_cumsum(data, ["key", "date"])

        # Group by level 2 region, and add the parts
        l2 = data.copy()
        l2["key"] = l2.key.apply(lambda x: "_".join(x.split("_")[:2]))
        l2 = l2.groupby(["key", "date"]).sum().reset_index()

        # Group by country level, and add the parts
        l1 = l2.copy().drop(columns=["key"])
        l1 = l1.groupby("date").sum().reset_index()
        l1["key"] = "FR"

        # Output the results with all aggregation levels
        return concat([l1, l2, data])