示例#1
0
文件: xx_dxy.py 项目: zhanghegui/data
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        """Build per-province case records for a single country.

        Args:
            dataframes: Source tables; only the first entry is read. It must
                provide the columns ``updateTime``, ``countryEnglishName``,
                ``provinceEnglishName``, ``province_confirmedCount``,
                ``province_deadCount`` and ``province_curedCount``.
            aux: Auxiliary tables; not used by this parser.
            **parse_opts: Must contain ``country_name``, the value that the
                ``country_name`` column is filtered on.

        Returns:
            The result of ``grouped_diff`` applied to the ``date``,
            ``country_name``, ``match_string``, ``confirmed``, ``deceased``
            and ``recovered`` columns of the last snapshot per date and
            province.

        Raises:
            KeyError: If ``country_name`` is missing from ``parse_opts``.
        """
        source = dataframes[0]

        # Adjust 7 hour difference between China's GMT+8 and GMT+1.
        # `assign` returns a new frame, so the caller's DataFrame is left
        # untouched instead of silently gaining a "date" column.
        data = source.assign(
            date=source["updateTime"].apply(
                lambda date: timezone_adjust(date, 7)))

        # Rename the appropriate columns
        data = data.rename(
            columns={
                "countryEnglishName": "country_name",
                "provinceEnglishName": "match_string",
                "province_confirmedCount": "confirmed",
                "province_deadCount": "deceased",
                "province_curedCount": "recovered",
            })

        # Filter specific country data only
        data = data[data["country_name"] == parse_opts["country_name"]]

        # This is time series data, get only the last snapshot of each day.
        # Sorting by "updateTime" first makes `last()` pick the most recent
        # entry within each (date, country, province) group.
        data = (data.sort_values("updateTime").groupby(
            ["date", "country_name", "match_string"]).last().reset_index())

        keep_columns = [
            "date",
            "country_name",
            "match_string",
            "confirmed",
            "deceased",
            "recovered",
        ]
        # NOTE(review): grouped_diff is defined elsewhere; presumably it turns
        # the cumulative counts into differences within each
        # (country_name, match_string) group ordered by date -- confirm.
        return grouped_diff(data[keep_columns],
                            ["country_name", "match_string", "date"])
示例#2
0
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        """Build per-province cumulative case records for a single country.

        Args:
            dataframes: Source tables; only the entry stored under ``0`` is
                read. It must provide the columns ``updateTime``,
                ``countryEnglishName``, ``provinceEnglishName``,
                ``province_confirmedCount``, ``province_deadCount`` and
                ``province_curedCount``.
            aux: Auxiliary tables; not used by this parser.
            **parse_opts: Must contain ``country_name``, the value that the
                ``country_name`` column is filtered on.

        Returns:
            A DataFrame with the columns ``key``, ``date``, ``country_name``,
            ``match_string``, ``total_confirmed``, ``total_deceased`` and
            ``total_recovered``, holding the last snapshot per date and
            province. ``key`` is ``None`` except for the harmonized regions.

        Raises:
            KeyError: If ``country_name`` is missing from ``parse_opts``.
        """
        # NOTE(review): the annotation says Dict[str, DataFrame] but the lookup
        # uses the integer key 0 -- confirm which one the caller provides.
        source = dataframes[0]

        # Adjust 7 hour difference between China's GMT+8 and GMT+1.
        # `assign` returns a new frame, so the caller's DataFrame is left
        # untouched instead of silently gaining a "date" column.
        data = source.assign(
            date=source["updateTime"].apply(lambda date: timezone_adjust(date, 7))
        )

        # Rename the appropriate columns
        data = data.rename(
            columns={
                "countryEnglishName": "country_name",
                "provinceEnglishName": "match_string",
                "province_confirmedCount": "total_confirmed",
                "province_deadCount": "total_deceased",
                "province_curedCount": "total_recovered",
            }
        )

        # Filter specific country data only
        data = data[data["country_name"] == parse_opts["country_name"]]

        # This is time series data, get only the last snapshot of each day.
        # Sorting by "updateTime" first makes `last()` pick the most recent
        # entry within each (date, country, province) group.
        data = (
            data.sort_values("updateTime")
            .groupby(["date", "country_name", "match_string"])
            .last()
            .reset_index()
        )

        # A couple of regions are reported using conflicting country codes, harmonize them here so
        # we avoid repeated regions
        data["key"] = None
        data.loc[data["match_string"] == "Taiwan", "key"] = "TW"
        data.loc[data["match_string"] == "Hong Kong", "key"] = "HK"

        keep_columns = [
            "key",
            "date",
            "country_name",
            "match_string",
            "total_confirmed",
            "total_deceased",
            "total_recovered",
        ]
        return data[keep_columns]