Exemplo n.º 1
0
    def subset(self,
               country,
               province=None,
               start_date=None,
               end_date=None,
               population=None):
        """
        Return the records of the area, limited to rows where Recovered > 0.

        Args:
            country (str): country name or ISO3 code
            province (str or None): province name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020
            population (int or None): population value

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - Confirmed (int): the number of confirmed cases
                    - Infected (int): the number of currently infected cases
                    - Fatal (int): the number of fatal cases
                    - Recovered (int): the number of recovered cases (> 0)
                    - Susceptible (int): the number of susceptible cases, if calculated

        Raises:
            SubsetNotFoundError: no records were found for the area/period,
                or none of the records found has Recovered > 0

        Note:
            If @population (high priority) is not None or population values are registered in subset,
            the number of susceptible cases will be calculated.
        """
        country_alias = self.ensure_country_name(country)
        # Both error reports below describe the same request
        request = {
            "country": country,
            "country_alias": country_alias,
            "province": province,
            "start_date": start_date,
            "end_date": end_date,
        }
        # Narrow down by area and period first
        area_df = self._subset(country=country,
                               province=province,
                               start_date=start_date,
                               end_date=end_date)
        if area_df.empty:
            raise SubsetNotFoundError(**request)
        # Susceptible is added by the helper (when it can be calculated)
        with_susceptible = self._calculate_susceptible(area_df, population)
        # Keep only the rows which have recovered cases
        has_recovered = with_susceptible[self.R] > 0
        df = with_susceptible.loc[has_recovered, :].reset_index(drop=True)
        if df.empty:
            raise SubsetNotFoundError(**request,
                                      message="with 'Recovered > 0'") from None
        return df
Exemplo n.º 2
0
    def _subset_by_area(self, country, province=None):
        """
        Select the records of one country, or of one province in a country.

        Args:
            country (str): country name
            province (str or None): province name or None (country level data)

        Returns:
            pandas.DataFrame: records of the country/province with reset index,
                columns are those returned by self.layer()

        Raises:
            SubsetNotFoundError: no records were found for the province
        """
        is_country_level = province is None or province == self.UNKNOWN
        if not is_country_level:
            # Province level: pick the province from the province-level layer of the country
            provinces = self.layer(country=country)
            selected = provinces.loc[provinces[self.PROVINCE] == province]
            if selected.empty:
                raise SubsetNotFoundError(country=country)
            return selected.reset_index(drop=True)
        # Country level: pick the country from the country-level layer
        countries = self.layer(country=None)
        country_alias = self.ensure_country_name(country)
        selected = countries.loc[countries[self.COUNTRY] == country_alias]
        return selected.reset_index(drop=True)
Exemplo n.º 3
0
    def subset(self, country, **kwargs):
        """
        Return the records of a country, one row per date.

        Args:
            country (str): country name or ISO 3166-1 alpha-3, like JPN
            kwargs: accepted for compatibility and not used

        Raises:
            covsirphy.SubsetNotFoundError: no records were found

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - other column names are defined by OxCGRTData.COL_DICT
        """
        alias = self.ensure_country_name(country)
        try:
            records = super().subset(country=alias)
        except SubsetNotFoundError:
            # Report the name as given by the caller together with the resolved alias
            raise SubsetNotFoundError(country=country,
                                      country_alias=alias) from None
        # When a date has several rows, the last one is kept
        per_date = records.groupby(self.DATE).last().reset_index()
        return per_date.loc[:, self.OXCGRT_COLS_WITHOUT_COUNTRY]
Exemplo n.º 4
0
    def value(self, country, province=None, date=None):
        """
        Return the population value of the place.

        Args:
            country (str): country name or ISO3 code
            province (str): province name
            date (str or None): observation date, like 01Jun2020

        Returns:
            int: population in the place (the value of the latest record selected)

        Raises:
            SubsetNotFoundError: self.subset() raised KeyError for the condition

        Note:
            If @date is None, the created date of the instance will be used
        """
        country_alias = self.ensure_country_name(country)
        place = {"country": country, "province": province}
        try:
            # The same date is used as both ends of the period
            records = self.subset(start_date=date, end_date=date, **place)
        except KeyError:
            raise SubsetNotFoundError(country_alias=country_alias,
                                      date=date,
                                      **place)
        records = records.sort_values(self.DATE)
        latest_label = records.index[-1]
        return int(records.loc[latest_label, [self.N]].values[0])
Exemplo n.º 5
0
    def ensure_country_name(self, country):
        """
        Return the country name as registered in the cleaned dataset.
        A registered name is returned unchanged; otherwise the name is
        converted to a short name and looked up again.

        Args:
            country (str): country name

        Returns:
            str: country name

        Raises:
            SubsetNotFoundError: neither the name nor its converted form is registered
        """
        cleaned = self._ensure_dataframe(self._cleaned_df,
                                         name="the cleaned dataset",
                                         columns=[self.COUNTRY])
        registered = set(cleaned[self.COUNTRY].unique())
        # Nothing to do when the name is registered as-is
        if country in registered:
            return country
        # Convert to the short name, then apply the additional abbreviations
        short_name = coco.convert(country, to="name_short", not_found=None)
        extra_aliases = {
            "Congo Republic": "Republic of the Congo",
            "DR Congo": "Democratic Republic of the Congo",
            "UK": "United Kingdom",
            "Vatican": "Holy See",
        }
        name = extra_aliases.get(short_name, short_name)
        if name not in registered:
            raise SubsetNotFoundError(country=country, country_alias=name)
        return name
Exemplo n.º 6
0
    def retrieve(self, country):
        """
        Return the dataset of the country, using the records already held
        when available and the server otherwise.

        Args:
            country (str): country name

        Returns:
            pandas.DataFrame: retrieved data
                Index
                    reset index
                Columns
                    - Country (pandas.Category): country name
                    - Year (int): year
                    - Sex (str): Female/Male
                    - Age (int): age
                    - Population (int): population value

        Raises:
            SubsetNotFoundError: the server had no records for the country

        Note:
            Records retrieved from the server are appended to the raw dataset
            and the raw dataset is saved to the CSV file.
        """
        already_held = not self._raw.empty and country in self._raw[self.COUNTRY].unique()
        if already_held:
            held = self._raw.copy()
            df = held.loc[held[self.COUNTRY] == country, :].reset_index(drop=True)
        else:
            # Ask World Bank Open Data
            try:
                df = self._retrieve_from_server(country)
            except SubsetNotFoundError:
                raise SubsetNotFoundError(country=country) from None
            # Remember the new records and save them to the file
            self._raw = pd.concat([self._raw, df], ignore_index=True, axis=0)
            self._raw.to_csv(self._filename, index=False)
        # Fix the data types
        category_columns = [self.COUNTRY, self.SEX]
        integer_columns = [self.AGE, self.N]
        df[category_columns] = df[category_columns].astype("category")
        df[integer_columns] = df[integer_columns].astype(np.int64)
        return df
Exemplo n.º 7
0
    def _colored_map_country(self, country, variable, title, date, **kwargs):
        """
        Draw a colored map of one country with the values of its provinces.

        Args:
            country (str): country name
            variable (str): variable name to show
            title (str): title of the figure
            date (str or None): date of the records or None (the last value)
            kwargs: arguments of covsirphy.ColoredMap() and covsirphy.ColoredMap.plot()

        Raises:
            UnExpectedValueError: @variable is not a column of the cleaned dataset
            SubsetNotFoundError: the country has no province level records
        """
        df = self._cleaned_df.copy()
        country_alias = self.ensure_country_name(country)
        # The variable must be one of the columns
        if variable not in df.columns:
            candidates = [col for col in df.columns if col not in self.AREA_ABBR_COLS]
            raise UnExpectedValueError(name="variable", value=variable, candidates=candidates)
        # Province level records of the country
        self._ensure_dataframe(df, name="cleaned dataset", columns=[self.COUNTRY, self.PROVINCE])
        in_country = df[self.COUNTRY] == country_alias
        has_province = df[self.PROVINCE] != self.UNKNOWN
        df = df.loc[in_country & has_province]
        if df.empty:
            raise SubsetNotFoundError(
                country=country, country_alias=country_alias, message="at province level")
        # Records on the date, when specified
        if date is not None:
            self._ensure_dataframe(df, name="cleaned dataset", columns=[self.DATE])
            df = df.loc[df[self.DATE] == pd.to_datetime(date)]
        # One row per province (the last one)
        per_province = df.groupby(self.PROVINCE).last().reset_index()
        per_province[self.COUNTRY] = country_alias
        per_province = per_province.rename(columns={variable: "Value"})
        # Plotting
        self._colored_map(title=title, data=per_province, level=self.PROVINCE, **kwargs)
Exemplo n.º 8
0
    def subset(self, country, province=None, start_date=None, end_date=None):
        """
        Return the records of the country/province between the start/end dates.

        Args:
            country (str): country name or ISO3 code
            province (str or None): province name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    without ISO3, Country, Province column

        Raises:
            SubsetNotFoundError: no records were found for the condition
        """
        country_alias = self.ensure_country_name(country, errors="coerce")
        try:
            area_df = self._subset_by_area(country=country, province=province)
        except SubsetNotFoundError:
            raise SubsetNotFoundError(country=country,
                                      country_alias=country_alias,
                                      province=province) from None
        # Area columns are removed when they exist
        area_columns = [self.COUNTRY, self.ISO3, self.PROVINCE]
        df = area_df.drop(area_columns, axis=1, errors="ignore")
        # Without dates, all records of the area are returned
        dates_omitted = start_date is None and end_date is None
        if dates_omitted:
            return df.reset_index(drop=True)
        df = self._ensure_dataframe(df,
                                    name="the cleaned dataset",
                                    columns=[self.DATE])
        # A missing end of the period falls back to the first/last date registered
        dates = df[self.DATE].copy()
        start_obj = self.date_obj(date_str=start_date, default=dates.min())
        end_obj = self.date_obj(date_str=end_date, default=dates.max())
        in_period = (start_obj <= dates) & (dates <= end_obj)
        df = df.loc[in_period, :]
        if df.empty:
            raise SubsetNotFoundError(country=country,
                                      country_alias=country_alias,
                                      province=province,
                                      start_date=start_date,
                                      end_date=end_date) from None
        return df.reset_index(drop=True)
Exemplo n.º 9
0
    def _retrieve_from_server(self, country):
        """
        Retrieve the dataset of the country from the server.

        Args:
            country (str): country name

        Returns:
            pandas.DataFrame: retrieved data
                Index
                    reset index
                Columns
                    - Country (object): country name
                    - Year (int): year
                    - Sex (object): Female/Male
                    - Age (object): age
                    - Population (object): population value

        Raises:
            SubsetNotFoundError: the request to the server failed with RuntimeError

        Note:
            Each retrieved attribute name is split on "-" into the minimum age,
            the maximum age and the sex of an age group. The population of a group
            is divided equally among the ages of the group.
        """
        if self.verbose:
            print(
                f"Retrieving population pyramid dataset ({country}) from https://data.worldbank.org/"
            )
        # Retrieve from World Bank Open Data
        iso3_code = coco.convert(country, to="ISO3", not_found=None)
        try:
            df = wbdata.get_dataframe(self.INDICATOR_DICT,
                                      country=iso3_code,
                                      convert_date=True)
        except RuntimeError:
            raise SubsetNotFoundError(country=country) from None
        # Preprocessing (-> Country, Population, Min, Max, Sex, Year)
        df = df.stack().reset_index()
        df.insert(0, self.COUNTRY, country)
        df.columns = [self.COUNTRY, "Date", "Attribute", self.N]
        df2 = df["Attribute"].str.split("-", expand=True)
        df2.columns = ["Min", "Max", self.SEX]
        df = pd.concat([df.drop("Attribute", axis=1), df2], axis=1)
        df["Max"] = df["Max"].replace("UP", self.ELDEST)
        for col in [self.N, "Min", "Max"]:
            df[col] = pd.to_numeric(df[col], downcast="integer")
        # Assign the result back: replace(inplace=True) on a selected column works on
        # a temporary object and does not update the frame under pandas Copy-on-Write
        df[self.SEX] = df[self.SEX].replace({"FE": "Female", "MA": "Male"})
        df[self.YEAR] = df["Date"].dt.year
        df = df.drop("Date", axis=1)
        # Preprocessing (-> Country, Year, Sex, Age, Population)
        # Rows are read by column label; integer keys on a labelled row are
        # deprecated as positional access in pandas
        df[self.AGE] = df[["Min", "Max"]].apply(
            lambda row: range(row["Min"], row["Max"] + 1), axis=1)
        df[self.N] = df[["Min", "Max", self.N]].apply(
            lambda row: row[self.N] / (row["Max"] - row["Min"] + 1), axis=1)
        # One row per age
        df = df.explode(self.AGE).reset_index(drop=True)
        df[self.N] = df[self.N].astype(np.int64)
        return df.loc[:, self.PYRAMID_COLS]
Exemplo n.º 10
0
    def records(self,
                country,
                province=None,
                start_date=None,
                end_date=None,
                auto_complement=True,
                **kwargs):
        """
        Return the subset. If necessary, complement will be performed.

        Args:
            country (str): country name or ISO3 code
            province (str or None): province name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020
            auto_complement (bool): if True and necessary, the number of cases will be complemented
            kwargs: the other arguments of complement

        Returns:
            tuple(pandas.DataFrame, object):
                pandas.DataFrame
                    Index
                        reset index
                    Columns
                        without ISO3, Country, Province column
                object: the second value returned by self.subset_complement(),
                    or False when the records were not complemented

        Raises:
            SubsetNotFoundError: no records were found for the condition
        """
        country_alias = self.ensure_country_name(country)
        condition = dict(country=country,
                         province=province,
                         start_date=start_date,
                         end_date=end_date)
        if auto_complement:
            try:
                df, is_complemented = self.subset_complement(**condition, **kwargs)
            except NotImplementedError:
                # Complement is not available: use the plain subset below
                pass
            else:
                if not df.empty:
                    return (df, is_complemented)
        try:
            plain_df = self.subset(**condition)
        except SubsetNotFoundError:
            raise SubsetNotFoundError(country_alias=country_alias,
                                      **condition) from None
        return (plain_df, False)
Exemplo n.º 11
0
    def subset(self, country, product=None, start_date=None, end_date=None):
        """
        Return the daily number of vaccinations of the country.

        Args:
            country (str or None): country name or ISO3 code
            product (str or None): product name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Date (pandas.TimeStamp): observation date
                    - Vaccinations (int): the number of vaccinations

        Raises:
            SubsetNotFoundError: no records were found for the condition

        Note:
            Records are resampled to daily frequency. Days with no value
            (zero after resampling) take the previous value; days before
            the first value are set to 0.
        """
        df = self._cleaned_df.copy()
        # Subset by country
        country_alias = self.ensure_country_name(country)
        df = df.loc[df[self.COUNTRY] == country_alias]
        # Subset by product name
        if product is not None:
            df = df.loc[df[self.PRODUCT] == product]
        # Subset with start date
        if start_date is not None:
            df = df.loc[df[self.DATE] >= self.date_obj(start_date)]
        # Subset with end date
        if end_date is not None:
            df = df.loc[df[self.DATE] <= self.date_obj(end_date)]
        # Check records were found before resampling: an empty selection has nothing to resample
        if df.empty:
            raise SubsetNotFoundError(country=country,
                                      country_alias=country_alias,
                                      province=product,
                                      start_date=start_date,
                                      end_date=end_date)
        # Resampling (only the documented columns, so that text columns are not summed)
        df = df.loc[:, [self.DATE, self.VAC]]
        df = df.set_index(self.DATE).resample("D").sum().reset_index()
        # Fill in the blanks: zero -> missing -> previous value -> 0 when there is no previous value
        counts = df[self.VAC]
        filled = counts.mask(counts == 0).ffill().fillna(0)
        # Masking turns the values into floats; restore the type the sum produced
        df[self.VAC] = filled.astype(counts.dtype)
        return df
Exemplo n.º 12
0
    def subset(self,
               country,
               province=None,
               start_date=None,
               end_date=None,
               dataset="COVID-19 Data Hub"):
        """
        Return the records of the country/province between the start/end dates.

        Args:
            country (str): country name or ISO3 code
            province (str or None): province name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020
            dataset (str): 'COVID-19 Data Hub' or 'Our World In Data'

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Date (pd.TimeStamp): Observation date
                    - Tests (int): the number of total tests performed
                    - Confirmed (int): the number of confirmed cases

        Raises:
            SubsetNotFoundError: no records were found in the period
        """
        country_alias = self.ensure_country_name(country)
        area_df = self._subset_by_area(country=country_alias,
                                       province=province,
                                       dataset=dataset)
        area_columns = [self.COUNTRY, self.ISO3, self.PROVINCE]
        df = area_df.drop(area_columns, axis=1)
        # Without dates, all records of the area are returned
        dates_omitted = start_date is None and end_date is None
        if dates_omitted:
            return df.reset_index(drop=True)
        # A missing end of the period falls back to the first/last date registered
        dates = df[self.DATE].copy()
        start_obj = self.date_obj(date_str=start_date, default=dates.min())
        end_obj = self.date_obj(date_str=end_date, default=dates.max())
        in_period = (start_obj <= dates) & (dates <= end_obj)
        df = df.loc[in_period, :]
        if df.empty:
            raise SubsetNotFoundError(country=country,
                                      country_alias=country_alias,
                                      province=province,
                                      start_date=start_date,
                                      end_date=end_date)
        return df.reset_index(drop=True)
Exemplo n.º 13
0
    def layer(self, country=None):
        """
        Return the cleaned data at the selected layer.

        Args:
            country (str or None): country name or None (country level data or country-specific dataset)

        Returns:
            pandas.DataFrame:
                Index
                    reset index
                Columns
                - Country (str): country names
                - Province (str): province names (or removed when country level data)
                - any other columns of the cleaned data

        Raises:
            SubsetNotFoundError: no records were found for the country (when @country is not None)
            KeyError: @country was None, but country names were not registered in the dataset

        Note:
            When @country is None, country level data will be returned.
            When @country is a country name, province level data in the selected country will be returned.
        """
        df = self._cleaned_df.copy()
        self._ensure_dataframe(df,
                               name="the cleaned dataset",
                               columns=[self.COUNTRY])
        # A dataset without provinces is treated as country level only
        province_registered = self.PROVINCE in df
        if not province_registered:
            df[self.PROVINCE] = self.UNKNOWN
        df[self.AREA_COLUMNS] = df[self.AREA_COLUMNS].astype(str)
        if country is None:
            # Country level: rows with the unknown province, province column removed
            country_level = df.loc[df[self.PROVINCE] == self.UNKNOWN]
            return country_level.drop(self.PROVINCE, axis=1).reset_index(drop=True)
        # Province level: rows of the country with a known province
        country_alias = self.ensure_country_name(country, errors="coerce")
        in_country = df.loc[df[self.COUNTRY] == country_alias]
        if in_country.empty:
            raise SubsetNotFoundError(country=country,
                                      country_alias=country_alias) from None
        province_level = in_country.loc[in_country[self.PROVINCE] != self.UNKNOWN]
        return province_level.reset_index(drop=True)
Exemplo n.º 14
0
    def specialized(self, model=None, country=None, province=None):
        """
        Return the dimensional records registered for the area.

        Args:
            model (cs.ModelBase or None): the first ODE model
            country (str or None): country name
            province (str or None): province name

        Raises:
            SubsetNotFoundError: no records are registered for the country/province

        Note:
            If country is None, the name of the model will be used.
            If province is None, '-' will be used.
        """
        area = self._model_to_area(
            model=model, country=country, province=province)
        country, province = area
        try:
            records_by_province = self._specialized_dict[country]
            return records_by_province[province]
        except KeyError:
            raise SubsetNotFoundError(country=country, province=province)
Exemplo n.º 15
0
    def non_dim(self, model=None, country=None, province=None):
        """
        Return the non-dimensional data registered for the area.

        Args:
            model (cs.ModelBase or None): the first ODE model
            country (str or None): country name
            province (str or None): province name

        Raises:
            SubsetNotFoundError: no data is registered for the country/province

        Note:
            If country is None, the name of the model will be used.
            If province is None, '-' will be used.
        """
        area = self._model_to_area(
            model=model, country=country, province=province)
        country, province = area
        try:
            data_by_province = self.nondim_dict[country]
            return data_by_province[province]
        except KeyError:
            raise SubsetNotFoundError(country=country, province=province)
Exemplo n.º 16
0
    def subset(self, country, province=None, start_date=None, end_date=None):
        """
        Return the records of the country/province between the start/end dates,
        with the daily number of tests.

        Args:
            country (str): country name or ISO3 code
            province (str or None): province name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - Tests (int): the number of total tests performed
                    - Tests_diff (int): daily number of tests on date
                    - Confirmed (int): the number of confirmed cases

        Raises:
            SubsetNotFoundError: no records were found in the period
        """
        country_alias = self.ensure_country_name(country)
        df = self._subset_select(country=country_alias,
                                 province=province or self.UNKNOWN)
        # Daily tests: difference of the totals, 0 for the first record and for decreases
        daily_tests = df[self.TESTS].diff().fillna(0).clip(lower=0)
        df[self.T_DIFF] = daily_tests.astype(np.int64)
        output_columns = [self.DATE, self.TESTS, self.T_DIFF, self.C]
        df = df.loc[:, output_columns]
        # Without dates, all records of the area are returned
        dates_omitted = start_date is None and end_date is None
        if dates_omitted:
            return df.reset_index(drop=True)
        # A missing end of the period falls back to the first/last date registered
        dates = df[self.DATE].copy()
        start_obj = self._ensure_date(start_date, default=dates.min())
        end_obj = self._ensure_date(end_date, default=dates.max())
        in_period = (start_obj <= dates) & (dates <= end_obj)
        df = df.loc[in_period, :]
        if df.empty:
            raise SubsetNotFoundError(country=country,
                                      country_alias=country_alias,
                                      province=province,
                                      start_date=start_date,
                                      end_date=end_date)
        return df.reset_index(drop=True)
Exemplo n.º 17
0
    def subset(self, country, product=None, start_date=None, end_date=None):
        """
        Return the vaccination records of the country, optionally limited
        to a product and a period.

        Args:
            country (str or None): country name or ISO3 code
            product (str or None): product name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Date (pandas.TimeStamp): observation date
                    - Vaccinations (int): the number of vaccinations
                    - Vaccinated_once (int): cumulative number of people who received at least one vaccine dose
                    - Vaccinated_full (int): cumulative number of people who received all doses prescribed by the protocol

        Raises:
            SubsetNotFoundError: no records were found for the condition
        """
        country_alias = self.ensure_country_name(country)
        selected = self._cleaned_df.copy()
        # Country
        selected = selected.loc[selected[self.COUNTRY] == country_alias]
        # Product name, when specified
        if product is not None:
            selected = selected.loc[selected[self.PRODUCT] == product]
        # Period, each end only when specified
        if start_date is not None:
            first_date = self._ensure_date(start_date)
            selected = selected.loc[selected[self.DATE] >= first_date]
        if end_date is not None:
            last_date = self._ensure_date(end_date)
            selected = selected.loc[selected[self.DATE] <= last_date]
        if selected.empty:
            raise SubsetNotFoundError(country=country,
                                      country_alias=country_alias,
                                      province=product,
                                      start_date=start_date,
                                      end_date=end_date)
        output = selected.loc[:, self.VAC_SUBSET_COLS]
        return output.reset_index(drop=True)
Exemplo n.º 18
0
    def subset(self, country, province=None):
        """
        Return subset of the country/province.

        Args:
            country (str): country name or ISO3 code
            province (str or None): province name

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Hospitalized_date (pandas.TimeStamp or NaT)
                    - Confirmation_date (pandas.TimeStamp or NaT)
                    - Outcome_date (pandas.TimeStamp or NaT)
                    - Confirmed (bool)
                    - Infected (bool)
                    - Recovered (bool)
                    - Fatal (bool)
                    - Symptoms (str)
                    - Chronic_disease (str)
                    - Age (int or None)
                    - Sex (str)

        Raises:
            SubsetNotFoundError: no records were found for the country/province
        """
        # Resolve the name once and keep @country as given, so that the error
        # below reports what the caller actually passed
        country_alias = self.ensure_country_name(country)
        df = self._cleaned_df.copy()
        # Subset by country name
        df = df.loc[df[self.COUNTRY] == country_alias]
        # Subset by province name (None and the unknown province mean "all records of the country")
        if province not in (None, self.UNKNOWN):
            df = df.loc[df[self.PROVINCE] == province]
        # Check records are registered
        if df.empty:
            raise SubsetNotFoundError(country=country,
                                      country_alias=country_alias,
                                      province=province)
        df = df.drop([self.COUNTRY, self.PROVINCE], axis=1)
        return df.reset_index(drop=True)
Exemplo n.º 19
0
    def ensure_country_name(self, country, errors="raise"):
        """
        Ensure that the country name is correct.
        If not, the correct country name will be found.

        Args:
            country (str): country name
            errors (str): 'raise' or 'coerce'

        Returns:
            str or None: country name registered in the cleaned dataset,
                or None when no name was found and @errors is not 'raise'

        Raises:
            SubsetNotFoundError: no records were found for the country and @errors is 'raise'
        """
        # Only the country column is read, so the dataset is not copied
        df = self._cleaned_df
        self._ensure_dataframe(df,
                               name="the cleaned dataset",
                               columns=[self.COUNTRY])
        selectable_set = set(df[self.COUNTRY].unique())
        # return country name as-is if selectable
        if country in selectable_set:
            return country
        # Convert country name. FutureWarning is silenced only for this call:
        # the previous warning filters are restored when the block ends
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", FutureWarning)
            converted = coco.convert(country, to="name_short", not_found=None)
        # Additional abbr
        abbr_dict = {
            "Congo Republic": "Republic of the Congo",
            "DR Congo": "Democratic Republic of the Congo",
            "UK": "United Kingdom",
            "Vatican": "Holy See",
        }
        name = abbr_dict.get(converted, converted)
        # Return the name if registered in the dataset
        if name in selectable_set:
            return name
        if errors == "raise":
            raise SubsetNotFoundError(country=country, country_alias=name)
        # Not found, and the caller asked not to raise
        return None
Exemplo n.º 20
0
    def subset(self, country, **kwargs):
        """
        Return the records of a country, one row per date.

        Args:
            country (str): country name or ISO 3166-1 alpha-3, like JPN
            kwargs: accepted for compatibility and not used

        Raises:
            covsirphy.SubsetNotFoundError: no records were found

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Date (pandas.Timestamp): Observation date
                    - School_closing
                    - Workplace_closing
                    - Cancel_events
                    - Gatherings_restrictions
                    - Transport_closing
                    - Stay_home_restrictions
                    - Internal_movement_restrictions
                    - International_movement_restrictions
                    - Information_campaigns
                    - Testing_policy
                    - Contact_tracing
                    - Stringency_index
        """
        alias = self.ensure_country_name(country)
        try:
            records = super().subset(country=alias)
        except SubsetNotFoundError:
            # Report the name as given by the caller together with the resolved alias
            raise SubsetNotFoundError(country=country,
                                      country_alias=alias) from None
        # When a date has several rows, the last one is kept
        per_date = records.groupby(self.DATE).last().reset_index()
        return per_date.loc[:, self.SUBSET_COLS]
Exemplo n.º 21
0
    def records(self,
                country,
                province=None,
                start_date=None,
                end_date=None,
                population=None,
                auto_complement=True,
                **kwargs):
        """
        JHU-style dataset for the area from the start date to the end date.
        Records with Recovered > 0 will be selected.

        Args:
            country (str): country name or ISO3 code
            province (str or None): province name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020
            population (int or None): population value
            auto_complement (bool): if True, self.subset_complement() is tried first
            kwargs: the other arguments of self.subset_complement(),
                used only when @auto_complement is True

        Raises:
            covsirphy.SubsetNotFoundError: self.subset() raised ValueError,
                i.e. no records "with 'Recovered > 0'" were available

        Returns:
            tuple(pandas.DataFrame, str or bool):
                pandas.DataFrame:

                    Index
                        reset index
                    Columns
                        - Date (pd.Timestamp): Observation date
                        - Confirmed (int): the number of confirmed cases
                        - Infected (int): the number of currently infected cases
                        - Fatal (int): the number of fatal cases
                        - Recovered (int): the number of recovered cases (> 0)
                        - Susceptible (int): the number of susceptible cases, if calculated
                str or bool: the second value returned by self.subset_complement(),
                    or False when the records come from self.subset()

        Note:
            - If @population is not None, the number of susceptible cases will be calculated.
            - When @auto_complement is True but the complemented dataframe is empty,
              the result of self.subset() is returned instead.
        """
        alias = self.ensure_country_name(country)
        area_args = dict(
            country=country,
            province=province,
            start_date=start_date,
            end_date=end_date,
            population=population,
        )
        # Prefer complemented records when requested and available
        if auto_complement:
            complemented_df, kind = self.subset_complement(
                **area_args, **kwargs)
            if not complemented_df.empty:
                return (complemented_df, kind)
        # Fall back to the records without complement
        try:
            raw_df = self.subset(**area_args)
        except ValueError:
            raise SubsetNotFoundError(
                country=country,
                country_alias=alias,
                province=province,
                start_date=start_date,
                end_date=end_date,
                message="with 'Recovered > 0'") from None
        return (raw_df, False)
Exemplo n.º 22
0
    def subset_complement(self,
                          country,
                          province=None,
                          start_date=None,
                          end_date=None,
                          population=None,
                          **kwargs):
        """
        Return the subset of dataset after running the complement handler on it.
        Records with Recovered > 0 will be selected.

        Args:
            country (str): country name or ISO3 code
            province (str or None): province name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020
            population (int or None): population value
            kwargs: keyword arguments of JHUDataComplementHandler(), control factors of complement

        Raises:
            covsirphy.SubsetNotFoundError: no records were found for the area/dates

        Returns:
            tuple(pandas.DataFrame, str or bool):
                pandas.DataFrame:
                    Index
                        reset index
                    Columns
                        - Date (pd.Timestamp): Observation date
                        - Confirmed (int): the number of confirmed cases
                        - Infected (int): the number of currently infected cases
                        - Fatal (int): the number of fatal cases
                        - Recovered (int): the number of recovered cases (> 0)
                        - Susceptible (int): the number of susceptible cases, if calculated
                str or bool: the status reported by the handler, or False when it is falsy

        Note:
            - If @population is not None, the number of susceptible cases will be calculated.
            - The recovery period is calculated on the first use and kept in the instance.
        """
        alias = self.ensure_country_name(country)
        # Records of the area in the date range
        area_df = self._subset(
            country=country,
            province=province,
            start_date=start_date,
            end_date=end_date)
        if area_df.empty:
            raise SubsetNotFoundError(
                country=country,
                country_alias=alias,
                province=province,
                start_date=start_date,
                end_date=end_date) from None
        # Reuse the stored recovery period, or calculate and store it
        period = self._recovery_period or self.calculate_recovery_period()
        self._recovery_period = period
        # Complement, if necessary
        handler = JHUDataComplementHandler(recovery_period=period, **kwargs)
        complemented_df, status, _ = handler.run(area_df)
        # Calculate Susceptible
        with_s_df = self._calculate_susceptible(complemented_df, population)
        # Keep only the records where Recovered > 0
        has_recovered = with_s_df[self.R] > 0
        selected_df = with_s_df.loc[has_recovered, :].reset_index(drop=True)
        return (selected_df, status or False)
Exemplo n.º 23
0
    def show_complement(self,
                        country=None,
                        province=None,
                        start_date=None,
                        end_date=None,
                        **kwargs):
        """
        Show what kind of complement the handler reports for the JHU subset of
        each selected country, to monitor effectivity and safety of complement.

        Args:
            country (str or list[str] or None): country/countries name or None
                (all registered countries except "Others")
            province (str or None): province name
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020
            kwargs: keyword arguments of JHUDataComplementHandler(), control factors of complement

        Raises:
            ValueError: @province was specified when @country is not a string
            covsirphy.SubsetNotFoundError: No records were registered for the area/dates

        Returns:
            pandas.DataFrame

                Index
                    reset index
                Columns (country, province and
                JHUDataComplementHandler.SHOW_COMPLEMENT_FULL_COLS)
                    - country (str): country name
                    - province (str): province name
                    - Monotonic_confirmed (bool): True if applied for confirmed cases or False otherwise
                    - Monotonic_fatal (bool): True if applied for fatal cases or False otherwise
                    - Monotonic_recovered (bool): True if applied for recovered or False otherwise
                    - Full_recovered (bool): True if applied for recovered or False otherwise
                    - Partial_recovered (bool): True if applied for recovered or False otherwise

        Note:
            The recovery period is calculated on the first use and kept in the instance.
        """
        # Reuse the stored recovery period, or calculate and store it
        period = self._recovery_period or self.calculate_recovery_period()
        self._recovery_period = period
        # Area name: all registered countries when not specified
        if country is None:
            registered = self._cleaned_df[self.COUNTRY].unique()
            country = [name for name in registered if name != "Others"]
        province = province or self.UNKNOWN
        if not isinstance(country, str):
            if province != self.UNKNOWN:
                raise ValueError(
                    "@province cannot be specified when @country is not a string.")
        countries = country if isinstance(country, list) else [country]
        # Create complement handler
        handler = JHUDataComplementHandler(recovery_period=period, **kwargs)
        # Empty table indexed by country, one row to be added per country
        columns = [
            self.COUNTRY,
            self.PROVINCE,
            *JHUDataComplementHandler.SHOW_COMPLEMENT_FULL_COLS,
        ]
        complement_df = pd.DataFrame(columns=columns).set_index(self.COUNTRY)
        # Check each country
        for name in countries:
            area_df = self._subset(
                country=name,
                province=province,
                start_date=start_date,
                end_date=end_date)
            if area_df.empty:
                raise SubsetNotFoundError(
                    country=name,
                    province=province,
                    start_date=start_date,
                    end_date=end_date)
            # Only the last item returned by the handler is needed here
            *_, applied = handler.run(area_df)
            flags = pd.Series(applied.values(), dtype=bool).values
            complement_df.loc[name] = [province, *flags]
        return complement_df.reset_index()