    def fit(
        self,
        X: pd.DataFrame,
        Y: pd.DataFrame,
    ):
        """
        Fit a poisson regression model each for the cases using active_cases and percentage_susceptible at time t-1, and another model
        for removed using active_cases at time t-1.

        Args:
            X (pd.DataFrame): Dataframe for given region of predictor variables containing columns date, province, active_cases, percent_susceptible,
                              and all columns for provinces for {province_name}_active_cases_yesterday, {province_name}_percent_susceptible_yesterday,
                              as well as all log features
            Y (pd.DataFrame): Dataframe for given region of response variables containing columns date, province, cases, removed
        """
        self.X_original = X.copy()
        self.Y_original = Y.copy()
        self.provinces = X["province"].unique()

        # Fit model for each province
        self.X_cases = {}
        self.Y_cases = {}
        self.X_removed = {}
        self.Y_removed = {}
        self.poisson_gam_cases = {}
        self.poisson_gam_removed = {}

        for province in self.provinces:
            # Remove extra columns for given province in form {province}_column_name
            cols_drop = X.filter(regex=province, axis=1).columns
            X_province = X.query(f"province == '{province}'").drop(cols_drop,
                                                                   axis=1)
            Y_province = Y.query(f"province == '{province}'")

            # Store case dataframe used to train model for each province
            self.X_cases[province] = X_province.filter(
                regex=
                r"(log_active_cases_yesterday|log_percent_susceptible_yesterday)"
            )
            self.Y_cases[province] = Y_province["cases"]

            # Add terms for each province I_t-1 and Z_t-1. Either splines or linear terms
            if self.use_splines:
                terms = s(0, lam=self.lam_main) + s(1, lam=self.lam_main)
                for i in range(1, len(self.provinces)):
                    terms += s(i * 2, lam=self.lam_other) + s(
                        i * 2 + 1, lam=self.lam_other)
            else:
                terms = l(0, lam=self.lam_main) + l(1, lam=self.lam_main)
                for i in range(1, len(self.provinces)):
                    terms += l(i * 2, lam=self.lam_other) + l(
                        i * 2 + 1, lam=self.lam_other)

            # Fit cases model for province
            cases_model = PoissonGAM(terms, verbose=self.verbose)
            cases_model.fit(self.X_cases[province], self.Y_cases[province])
            self.poisson_gam_cases[province] = cases_model

            # Store remove dataframe used to train model for each province
            self.X_removed[province] = X_province.filter(
                regex=r"log_active_cases_yesterday")
            self.Y_removed[province] = Y_province["removed"]

            # Add terms for each province I_t-1
            terms = l(0, lam=self.lam_main)
            for i in range(1, len(self.provinces)):
                terms += l(i, lam=self.lam_other)

            # Fit removed model for each province
            removed_model = PoissonGAM(terms, verbose=self.verbose)
            removed_model.fit(self.X_removed[province],
                              self.Y_removed[province])
            self.poisson_gam_removed[province] = removed_model

        return
class StemPoissonRegressor:
    """
    Space-Time Epidemic model based on "Spatiotemporal Dynamics, Nowcasting and Forecasting of COVID-19 in the United States" (https://arxiv.org/abs/2004.14103)
    Fits two Poisson regression models to model the new cases and new deaths/recovered at time t.

    The first model for the new cases Y_t is modelled using the active cases I_{t-1} and the number of susceptible people S_{t-1}:
    Y_t \sim Poisson(\mu_t)
    \log(\mu_t) = \beta_{1t} + \beta_{2t} \log(I_{t-1} + 1) + \alpha_t \log(S_{t-1} / N)

    The second model for the new deaths/recovered \Delta D_t is modelled using the active cases I_{t-1}:
    \Delta D_t \sim Poisson(\mu_t^D)
    \log(\mu_t^D) = \beta_{1t}^D + \beta_{2t}^D \log(I_{t-1} + 1)

    Attributes:
        X_original {pandas dataframe} -- Original X dataframe called on fit()
        Y_original {pandas dataframe} -- Original Y dataframe called on fit()
        X_cases {pandas dataframe} -- Transformed X dataframe used for fitting new cases model
        Y_cases {pandas dataframe} -- Y dataframe used for fitting new cases model
        X_removed {pandas dataframe} -- Transformed X dataframe used for fitting new removed model
        Y_removed {pandas dataframe} -- Y dataframe used for fitting new removed model
        poisson_gam_cases {PoissonGAM model} -- Poisson regression model for new cases
        poisson_gam_removed {PoissonGAM model} -- Poisson regression model for new removed 
    """
    def __init__(
        self,
        verbose: bool = False,
        use_spline: bool = False,
        lam: float = 0.6,
        vaccination: bool = False,
        most_recent_days: int = None,
    ) -> None:
        """
        Args:
            verbose (bool, optional): Whether to print messages on fit. Defaults to False.
            use_spline (bool, optional): Whether to use splines in the GAM model, if false then linear terms are used instead. Defaults to False.
            lam (float, optional): Lambda parameter for regularization. Defaults to 0.6
            vaccination (bool, optional): Whether to include vaccination data. Defaults to False.
            most_recent_days (int, optional): If set, fit on only the most recent most_recent_days rows of data. Defaults to None.
        """
        self.verbose = verbose
        self.use_spline = use_spline
        self.lam = lam
        self.vaccination = vaccination
        self.most_recent_days = most_recent_days
        return

    def fit(
        self,
        X: pd.DataFrame,
        Y: pd.DataFrame,
    ):
        """
        Fit a poisson regression model each for the cases using active_cases and percentage_susceptible at time t-1, and another model
        for removed using active_cases at time t-1.
        Args:
            X (pd.DataFrame): Dataframe for given region of predictor variables containing columns date, active_cases, percent_susceptible. If including vaccination data then column percent_cvaccine must also be included.
            Y (pd.DataFrame): Dataframe for given region of response variables containing columns cases, removed
        """
        if self.most_recent_days is not None:
            X = X.tail(self.most_recent_days)
            Y = Y.tail(self.most_recent_days)

        self.X_original = X.copy()
        self.Y_original = Y.copy()

        self.N = self.X_original["population"].iloc[-1]

        # Get average proportion of vaccinated individuals over last 2 weeks
        if self.vaccination:
            self.average_perc_vaccinated = X["percent_cvaccine"].tail(
                14).mean()
            self.average_vaccinated = self.average_perc_vaccinated * self.N
        else:
            self.average_perc_vaccinated = 0
            self.average_vaccinated = 0

        # Separate data for each model
        self.X_cases = X[[
            "log_active_cases_yesterday", "log_percent_susceptible_yesterday"
        ]].copy()
        self.Y_cases = Y["cases"]
        self.X_removed = X[["log_active_cases_yesterday"]].copy()
        self.Y_removed = Y["removed"]

        # Setup terms for covid19 data to use in GLM
        term = s if self.use_spline else l
        terms_cases = term(0, lam=self.lam) + term(1, lam=self.lam)
        terms_removed = term(0, lam=self.lam)

        # Model new cases data using infections and percentage susceptible at time t-1
        self.poisson_gam_cases = PoissonGAM(terms_cases, verbose=self.verbose)
        self.poisson_gam_cases.fit(self.X_cases, self.Y_cases)

        # Model removed cases using infections at time t-1
        self.poisson_gam_removed = PoissonGAM(terms_removed,
                                              verbose=self.verbose)
        self.poisson_gam_removed.fit(self.X_removed, self.Y_removed)

        return

    def forecast(self, h: int = 1) -> pd.DataFrame:
        """
        Gives forecasted new cases, active cases, and cumulative number of removed.
        Args:
            h (int, optional): Number of h step predictions to make. Defaults to 1.
        Returns:
            pd.DataFrame: Dataframe containing 1 step predictions for all data in training set along with h step forecasts
        """
        province = self.X_original["province"].iloc[-1]

        # Get 1 step predictions for all values in training set
        cases_preds = self.poisson_gam_cases.predict(self.X_cases)
        removed_preds = self.poisson_gam_removed.predict(self.X_removed)

        # Create result dataframe for training set data
        forecasts = pd.DataFrame({
            "date": self.X_original["date"],
            "province": province,
            "cases_pred": cases_preds,
            "removed_pred": removed_preds,
            "active_cases_pred": np.nan,
            "is_forecast": False,
        })

        # Get h step predictions iteratively. Start with last actual known values of active cases and percent susceptible
        I = self.X_original["active_cases"].iloc[-1]
        S = self.X_original["susceptible"].iloc[-1]
        Z = np.log(self.X_original["percent_susceptible"].iloc[-1])
        date = forecasts["date"].max()

        # Keep track of current forecast to be used to predict next value
        x_cases = self.X_cases.iloc[0, :].copy()
        x_removed = self.X_removed.iloc[0, :].copy()
        x_cases.loc[:] = 0
        x_removed.loc[:] = 0

        # Column names for variables
        col_log_I = "log_active_cases_yesterday"
        col_Z = "log_percent_susceptible_yesterday"

        for _ in range(h):
            date = date + timedelta(days=1)

            # Set current values to previous forecast values
            log_I = np.log(I + 1)
            x_cases.loc[[col_log_I, col_Z]] = [
                log_I,
                Z,
            ]
            x_removed.loc[[col_log_I]] = [log_I]

            # Get predictions and CI for next step
            Y = self.poisson_gam_cases.predict(x_cases.values.reshape(1,
                                                                      -1))[0]
            R = self.poisson_gam_removed.predict(
                x_removed.values.reshape(1, -1))[0]

            # Update next values of I, Z, C
            S = S - Y - self.average_vaccinated
            # C = C + Y
            I = max(I + Y - R, 1)
            Z = np.log(S / self.N)

            # Append predicted value at time t+h
            forecasts = forecasts.append(
                {
                    "date": date,
                    "province": province,
                    "cases_pred": Y,
                    "removed_pred": R,
                    "active_cases_pred": I,
                    "is_forecast": True,
                },
                ignore_index=True,
            )

        # Add cumulative cases and removed predictions
        forecasts = forecasts.assign(
            cumulative_cases_pred=lambda x: x["cases_pred"].cumsum(),
            cumulative_removed_pred=lambda x: x["removed_pred"].cumsum(),
        )

        return forecasts
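
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original source): fit
# StemPoissonRegressor on a small synthetic region and produce a 7-day
# forecast. The column names are the ones fit()/forecast() above actually
# read; the values themselves are made up for demonstration.
# ---------------------------------------------------------------------------
def example_stem_poisson_regressor() -> pd.DataFrame:
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    n_days, population = 60, 1_000_000

    dates = pd.date_range("2021-01-01", periods=n_days)
    active = rng.integers(200, 400, size=n_days).astype(float)
    cumulative = np.cumsum(rng.integers(20, 60, size=n_days)).astype(float)
    susceptible = population - cumulative

    X_demo = pd.DataFrame({
        "date": dates,
        "province": "DemoProvince",
        "population": population,
        "active_cases": active,
        "susceptible": susceptible,
        "percent_susceptible": susceptible / population,
        # Lag-1 log features expected by fit()
        "log_active_cases_yesterday": np.log(np.roll(active, 1) + 1),
        "log_percent_susceptible_yesterday":
        np.log(np.roll(susceptible, 1) / population),
    })
    Y_demo = pd.DataFrame({
        "cases": rng.poisson(30, size=n_days),
        "removed": rng.poisson(25, size=n_days),
    })

    model = StemPoissonRegressor(use_spline=False, lam=0.6)
    model.fit(X_demo, Y_demo)
    return model.forecast(h=7)
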
class StemPoissonTwitterRegressor:
    """
    Space-Time Epidemic model based on "Spatiotemporal Dynamics, Nowcasting and Forecasting of COVID-19 in the United States" (https://arxiv.org/abs/2004.14103)
    Fits two Poisson regression models to model the new cases and new deaths/recovered at time t.
    Also has the option to take in twitter data as extra features in the regression model.

    The first model for the new cases Y_t is modelled using the active cases I_{t-1} and the number of susceptible people S_{t-1}:
    Y_t \sim Poisson(\mu_t)
    \log(\mu_t) = \beta_{1t} + \beta_{2t} \log(I_{t-1} + 1) + \alpha_t \log(S_{t-1} / N)

    The second model for the new deaths/recovered \Delta D_t is modelled using the active cases I_{t-1}:
    \Delta D_t \sim Poisson(\mu_t^D)
    \log(\mu_t^D) = \beta_{1t}^D + \beta_{2t}^D \log(I_{t-1} + 1)

    Attributes:
        X_original {pandas dataframe} -- Original X dataframe called on fit()
        Y_original {pandas dataframe} -- Original Y dataframe called on fit()
        X_cases {pandas dataframe} -- Transformed X dataframe used for fitting new cases model
        Y_cases {pandas dataframe} -- Y dataframe used for fitting new cases model
        X_removed {pandas dataframe} -- Transformed X dataframe used for fitting new removed model
        Y_removed {pandas dataframe} -- Y dataframe used for fitting new removed model
        poisson_gam_cases {PoissonGAM model} -- Poisson regression model for new cases
        poisson_gam_removed {PoissonGAM model} -- Poisson regression model for new removed 
    """
    def __init__(
        self,
        verbose: bool = False,
        use_spline: bool = False,
        lam: float = 0.6,
        twitter_data: pd.DataFrame = None,
        twitter_offset: int = 14,
    ) -> None:
        """
        Args:
            verbose (bool, optional): Whether to print messages on fit. Defaults to False.
            use_spline (bool, optional): Whether to use splines in the GAM model, if false then linear terms are used instead. Defaults to False.
            lam (float, optional): Lambda parameter for regularization. Defaults to 0.6
            twitter_data (pd.DataFrame, optional): Dataframe of additional twitter features containing columns date and province plus one column per feature. Defaults to None.
            twitter_offset (int, optional): Allow twitter data to have a delayed effect. The offset specifies how many days later the current twitter data takes effect. Defaults to 14.
        """
        self.verbose = verbose
        self.use_spline = use_spline
        self.lam = lam
        self.twitter_data = twitter_data
        self.twitter_offset = twitter_offset
        return

    def fit(
        self,
        X: pd.DataFrame,
        Y: pd.DataFrame,
    ):
        """
        Fit a poisson regression model each for the cases using active_cases and percentage_susceptible at time t-1, and another model
        for removed using active_cases at time t-1.
        Args:
            X (pd.DataFrame): Dataframe for given region of predictor variables containing columns date, active_cases, percent_susceptible
            Y (pd.DataFrame): Dataframe for given region of response variables containing columns cases, removed
        """
        # Remove days in data that are after the latest twitter data given
        if self.twitter_data is not None:
            remove_date = self.twitter_data["date"].max()
        else:
            remove_date = X["date"].max()

        X = X.query("date <= @remove_date")
        Y = Y.query("date <= @remove_date")

        self.X_original = X.copy()
        self.Y_original = Y.copy()

        # Separate data for each model
        self.X_cases = X[[
            "date", "log_active_cases_yesterday",
            "log_percent_susceptible_yesterday"
        ]].copy()
        self.Y_cases = Y["cases"]
        self.X_removed = X[["date", "log_active_cases_yesterday"]].copy()
        self.Y_removed = Y["removed"]

        # Preprocess twitter data by shifting it by twitter_offset days so each row contains the twitter data from twitter_offset days ago
        if self.twitter_data is not None:
            twitter_shifted = self.twitter_data.drop(
                ["date", "province"],
                axis=1).shift(periods=self.twitter_offset, axis=0)
            twitter_shifted.columns = [
                f"{col}_shifted" for col in twitter_shifted.columns
            ]
            twitter_shifted = twitter_shifted.assign(
                date=self.twitter_data["date"])

            # Add twitter data to use in both cases and removed models
            self.X_cases = self.X_cases.merge(twitter_shifted,
                                              how="left",
                                              on=["date"])
            self.X_removed = self.X_removed.merge(twitter_shifted,
                                                  how="left",
                                                  on=["date"])

        # Drop date columns not used anymore
        self.X_cases = self.X_cases.drop("date", axis=1)
        self.X_removed = self.X_removed.drop("date", axis=1)

        # Setup terms for covid19 data to use in GLM
        term = s if self.use_spline else l
        terms_cases = term(0, lam=self.lam) + term(1, lam=self.lam)
        terms_removed = term(0, lam=self.lam)

        # Add terms for twitter data
        if self.twitter_data is not None:
            twitter_cols = self.twitter_data.columns.drop(["date", "province"])
            for i in range(len(twitter_cols)):
                terms_cases = terms_cases + term(i + 2, lam=self.lam)
                terms_removed = terms_removed + term(i + 1, lam=self.lam)

        # Model new cases data using infections and percentage susceptible at time t-1
        self.poisson_gam_cases = PoissonGAM(terms_cases, verbose=self.verbose)
        self.poisson_gam_cases.fit(self.X_cases, self.Y_cases)

        # Model removed cases using infections at time t-1
        self.poisson_gam_removed = PoissonGAM(terms_removed,
                                              verbose=self.verbose)
        self.poisson_gam_removed.fit(self.X_removed, self.Y_removed)

        return

    def forecast(self, h: int = 1) -> pd.DataFrame:
        """
        Gives forecasted new cases, active cases, and cumulative number of removed.
        Args:
            h (int, optional): Number of h step predictions to make. Defaults to 1.
        Returns:
            pd.DataFrame: Dataframe containing 1 step predictions for all data in training set along with h step forecasts
        """
        province = self.X_original["province"].iloc[-1]

        # Get 1 step predictions for all values in training set
        cases_preds = self.poisson_gam_cases.predict(self.X_cases)
        removed_preds = self.poisson_gam_removed.predict(self.X_removed)
        # cases_ci = self.poisson_gam_cases.confidence_intervals(self.X_cases)
        # removed_ci = self.poisson_gam_removed.confidence_intervals(self.X_removed)

        # Create result dataframe for training set data
        forecasts = pd.DataFrame({
            "date": self.X_original["date"],
            "province": province,
            "cases_pred": cases_preds,
            "removed_pred": removed_preds,
            "active_cases_pred": np.nan,
            # "cases_ci_lower": cases_ci[:, 0],
            # "cases_ci_upper": cases_ci[:, 1],
            # "removed_ci_lower": removed_ci[:, 0],
            # "removed_ci_upper": removed_ci[:, 1],
            "is_forecast": False,
        })

        # Get h step predictions iteratively. Start with last actual known values of active cases and percent susceptible
        C = self.X_original["cumulative_cases"].iloc[-1]
        N = self.X_original["population"].iloc[-1]
        I = self.X_original["active_cases"].iloc[-1]
        Z = np.log(self.X_original["percent_susceptible"].iloc[-1])
        date = forecasts["date"].max()

        # Keep track of current forecast to be used to predict next value
        x_cases = self.X_cases.iloc[0, :].copy()
        x_removed = self.X_removed.iloc[0, :].copy()
        x_cases.loc[:] = 0
        x_removed.loc[:] = 0

        # Column names for variables
        col_log_I = "log_active_cases_yesterday"
        col_Z = "log_percent_susceptible_yesterday"
        if self.twitter_data is not None:
            twitter_cols = x_cases.index.drop([col_log_I, col_Z]).to_list()
        else:
            twitter_cols = []

        for _ in range(h):
            date = date + timedelta(days=1)

            # Get twitter data corresponding to twitter_offset days ago. If does not exist then just use the most recent twitter data
            if self.twitter_data is not None:
                twitter_date = date - timedelta(days=self.twitter_offset)
                if twitter_date <= self.twitter_data["date"].max():
                    twitter_row = self.twitter_data.query(
                        "date == @twitter_date").iloc[0]
                else:
                    twitter_row = self.twitter_data.iloc[-1]
                twitter_row = twitter_row.drop(["date", "province"])
                twitter_values = twitter_row.values.tolist()
            else:
                twitter_values = []

            # Set current values to previous forecast values and add twitter data
            log_I = np.log(I + 1)
            x_cases.loc[[col_log_I, col_Z] + twitter_cols] = [
                log_I,
                Z,
            ] + twitter_values
            x_removed.loc[[col_log_I] +
                          twitter_cols] = [log_I] + twitter_values

            # Get predictions and CI for next step
            Y = self.poisson_gam_cases.predict(x_cases.values.reshape(1,
                                                                      -1))[0]
            R = self.poisson_gam_removed.predict(
                x_removed.values.reshape(1, -1))[0]
            # Y_ci = self.poisson_gam_cases.confidence_intervals(x_cases)[0]
            # R_ci = self.poisson_gam_removed.confidence_intervals(log_I)[0]

            # Update next values of I, Z, C
            I = max(I + Y - R, 1)
            C = C + Y
            Z = np.log((N - C) / N)

            # Append predicted value at time t+h
            forecasts = forecasts.append(
                {
                    "date": date,
                    "province": province,
                    "cases_pred": Y,
                    "removed_pred": R,
                    "active_cases_pred": I,
                    # "cases_ci_lower": Y_ci[0],
                    # "cases_ci_upper": Y_ci[1],
                    # "removed_ci_lower": R_ci[0],
                    # "removed_ci_upper": R_ci[1],
                    "is_forecast": True,
                },
                ignore_index=True,
            )

        # Add cumulative cases and removed predictions
        forecasts = forecasts.assign(
            cumulative_cases_pred=lambda x: x["cases_pred"].cumsum(),
            cumulative_removed_pred=lambda x: x["removed_pred"].cumsum(),
        )

        return forecasts
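
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original source): fit
# StemPoissonTwitterRegressor with a made-up tweet-volume feature. The twitter
# frame must contain "date" and "province" plus one column per feature, and it
# should start at least twitter_offset days before the epidemic data so the
# shifted/merged features contain no gaps.
# ---------------------------------------------------------------------------
def example_stem_poisson_twitter_regressor() -> pd.DataFrame:
    from datetime import timedelta

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    offset, n_days, population = 14, 60, 1_000_000

    dates = pd.date_range("2021-02-01", periods=n_days)
    twitter_dates = pd.date_range(dates[0] - timedelta(days=offset), dates[-1])
    twitter_data = pd.DataFrame({
        "date": twitter_dates,
        "province": "DemoProvince",
        "tweet_volume": rng.integers(100, 1000, size=len(twitter_dates)),
    })

    active = rng.integers(200, 400, size=n_days).astype(float)
    cumulative = np.cumsum(rng.integers(20, 60, size=n_days)).astype(float)
    susceptible = population - cumulative
    X_demo = pd.DataFrame({
        "date": dates,
        "province": "DemoProvince",
        "population": population,
        "cumulative_cases": cumulative,
        "active_cases": active,
        "percent_susceptible": susceptible / population,
        "log_active_cases_yesterday": np.log(np.roll(active, 1) + 1),
        "log_percent_susceptible_yesterday":
        np.log(np.roll(susceptible, 1) / population),
    })
    Y_demo = pd.DataFrame({
        "date": dates,
        "cases": rng.poisson(30, size=n_days),
        "removed": rng.poisson(25, size=n_days),
    })

    model = StemPoissonTwitterRegressor(twitter_data=twitter_data,
                                        twitter_offset=offset)
    model.fit(X_demo, Y_demo)
    return model.forecast(h=7)
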
class StemPoissonTimeVaryRegressor:
    """
    Space-Time Epidemic model based on "Spatiotemporal Dynamics, Nowcasting and Forecasting of COVID-19 in the United States" (https://arxiv.org/abs/2004.14103)
    Fits two Poisson regression models to model the new cases and new deaths/recovered at time t.
    Also has option for time varying parameters by date

    The first model for the new cases Y_t is modelled using the active cases I_{t-1} and the number of susceptible people S_{t-1}:
    Y_t \sim Poisson(\mu_t)
    \log(\mu_t) = \beta_{1t} + \beta_{2t} \log(I_{t-1} + 1) + \alpha_t \log(S_{t-1} / N)

    The second model for the new deaths/recovered \Delta D_t is modelled using the active cases I_{t-1}:
    \Delta D_t \sim Poisson(\mu_t^D)
    \log(\mu_t^D) = \beta_{1t}^D + \beta_{2t}^D \log(I_{t-1} + 1)

    Attributes:
        X_original {pandas dataframe} -- Original X dataframe called on fit()
        Y_original {pandas dataframe} -- Original Y dataframe called on fit()
        X_cases {pandas dataframe} -- Transformed X dataframe used for fitting new cases model
        Y_cases {pandas dataframe} -- Y dataframe used for fitting new cases model
        X_removed {pandas dataframe} -- Transformed X dataframe used for fitting new removed model
        Y_removed {pandas dataframe} -- Y dataframe used for fitting new removed model
        poisson_gam_cases {PoissonGAM model} -- Poisson regression model for new cases
        poisson_gam_removed {PoissonGAM model} -- Poisson regression model for new removed 
    """
    def __init__(
        self,
        verbose: bool = False,
        date_splits: List[date] = None,
        cols_date_splits: List[str] = None,
        use_spline: bool = False,
        lam: float = 0.6,
    ) -> None:
        """
        Args:
            verbose (bool, optional): Whether to print messages on fit. Defaults to False.
            date_splits (List[date], optional): List of boundary dates to use for time varying parameters. Defaults to None.
            cols_date_splits (List[str], optional): List of columns to allow time varying parameters for. If None then all columns except the intercept are used. Defaults to None.
            use_spline (bool, optional): Whether to use splines in the GAM model; if False then linear terms are used instead. Defaults to False.
            lam (float, optional): Lambda parameter for regularization. Defaults to 0.6
        """
        self.verbose = verbose
        self.date_splits = date_splits
        self.cols_date_splits = (cols_date_splits if cols_date_splits else [
            "log_active_cases_yesterday", "log_percent_susceptible_yesterday"
        ])
        self.use_spline = use_spline
        self.lam = lam
        return

    def fit(
        self,
        X: pd.DataFrame,
        Y: pd.DataFrame,
    ):
        """
        Fit a poisson regression model each for the cases using active_cases and percentage_susceptible at time t-1, and another model
        for removed using active_cases at time t-1.
        Args:
            X (pd.DataFrame): Dataframe for given region of predictor variables containing columns date, active_cases, percent_susceptible
            Y (pd.DataFrame): Dataframe for given region of response variables containing columns cases, removed
        """
        self.X_original = X.copy()
        self.Y_original = Y.copy()

        # Separate data for each model
        self.X_cases = X[[
            "date", "log_active_cases_yesterday",
            "log_percent_susceptible_yesterday"
        ]].copy()
        self.Y_cases = Y["cases"]
        self.X_removed = X[["log_active_cases_yesterday"]].copy()
        self.Y_removed = Y["removed"]

        # Add intercept constant
        self.X_cases["intercept"] = 1

        # If time varying parameters then split each column by the date bounds
        if self.date_splits:
            # Keep only dates that are within the dates of df
            self.date_splits = [
                date for date in self.date_splits
                if date < self.X_cases["date"].max()
            ]

            self.X_cases = split_columns_dates(
                df=self.X_cases,
                date_splits=self.date_splits,
                cols=self.cols_date_splits,
                drop_date=True,
            )
        else:
            self.X_cases = self.X_cases.drop("date", axis=1)

        # Setup terms to use in GLM
        term = s if self.use_spline else l
        terms_removed = term(0, lam=self.lam)
        terms_cases = term(0, lam=self.lam)
        for i in range(1, len(self.X_cases.columns)):
            terms_cases = terms_cases + term(i, lam=self.lam)

        # Model new cases data using infections and percentage susceptible at time t-1
        self.poisson_gam_cases = PoissonGAM(terms_cases,
                                            fit_intercept=False,
                                            verbose=self.verbose)
        self.poisson_gam_cases.fit(self.X_cases, self.Y_cases)

        # Model removed cases using infections at time t-1
        self.poisson_gam_removed = PoissonGAM(terms_removed,
                                              verbose=self.verbose)
        self.poisson_gam_removed.fit(self.X_removed, self.Y_removed)

        return

    def forecast(self, h: int = 1) -> pd.DataFrame:
        """
        Gives forecasted new cases, active cases, and cumulative number of removed.
        Args:
            h (int, optional): Number of h step predictions to make. Defaults to 1.
        Returns:
            pd.DataFrame: Dataframe containing 1 step predictions for all data in training set along with h step forecasts
        """
        province = self.X_original["province"].iloc[-1]

        # Get 1 step predictions for all values in training set
        cases_preds = self.poisson_gam_cases.predict(self.X_cases)
        removed_preds = self.poisson_gam_removed.predict(self.X_removed)
        # cases_ci = self.poisson_gam_cases.confidence_intervals(self.X_cases)
        # removed_ci = self.poisson_gam_removed.confidence_intervals(self.X_removed)

        # Create result dataframe for training set data
        forecasts = pd.DataFrame({
            "date": self.X_original["date"],
            "province": province,
            "cases_pred": cases_preds,
            "removed_pred": removed_preds,
            "active_cases_pred": np.nan,
            # "cases_ci_lower": cases_ci[:, 0],
            # "cases_ci_upper": cases_ci[:, 1],
            # "removed_ci_lower": removed_ci[:, 0],
            # "removed_ci_upper": removed_ci[:, 1],
            "is_forecast": False,
        })

        # Get h step predictions iteratively. Start with last actual known values of active cases and percent susceptible
        C = self.X_original["cumulative_cases"].iloc[-1]
        N = self.X_original["population"].iloc[-1]
        I = self.X_original["active_cases"].iloc[-1]
        Z = np.log(self.X_original["percent_susceptible"].iloc[-1])
        date = forecasts["date"].max()

        # Keep track of current forecast to be used to predict next value
        x_cases = self.X_cases.iloc[0, :].copy()
        p = self.X_cases.shape[1]

        # Set column names for indexing x_cases series when setting new values
        if self.date_splits:
            i = len(self.date_splits) - 1

            if "intercept" in self.cols_date_splits:
                intercept = f"intercept_{i}"
            else:
                intercept = "intercept"

            if "log_active_cases_yesterday" in self.cols_date_splits:
                col_log_I = f"log_active_cases_yesterday_{i}"
            else:
                col_log_I = "log_active_cases_yesterday"

            if "log_percent_susceptible_yesterday" in self.cols_date_splits:
                col_Z = f"log_percent_susceptible_yesterday_{i}"
            else:
                col_Z = "log_percent_susceptible_yesterday"
        else:
            intercept = "intercept"
            col_log_I = "log_active_cases_yesterday"
            col_Z = "log_percent_susceptible_yesterday"

        for _ in range(h):
            # Set current values to previous forecast values
            log_I = np.log(I + 1)
            x_cases.loc[:] = 0
            x_cases.loc[[intercept, col_log_I, col_Z]] = 1, log_I, Z

            # Get predictions and CI for next step
            Y = self.poisson_gam_cases.predict(x_cases.values.reshape(1, p))[0]
            R = self.poisson_gam_removed.predict(log_I)[0]
            # Y_ci = self.poisson_gam_cases.confidence_intervals(x_cases)[0]
            # R_ci = self.poisson_gam_removed.confidence_intervals(log_I)[0]

            # Update next values of I, Z, C
            I = max(I + Y - R, 1)
            C = C + Y
            Z = np.log((N - C) / N)
            date = date + timedelta(days=1)

            # Append predicted value at time t+h
            forecasts = forecasts.append(
                {
                    "date": date,
                    "province": province,
                    "cases_pred": Y,
                    "removed_pred": R,
                    "active_cases_pred": I,
                    # "cases_ci_lower": Y_ci[0],
                    # "cases_ci_upper": Y_ci[1],
                    # "removed_ci_lower": R_ci[0],
                    # "removed_ci_upper": R_ci[1],
                    "is_forecast": True,
                },
                ignore_index=True,
            )

        # Add cumulative cases and removed predictions
        forecasts = forecasts.assign(
            cumulative_cases_pred=lambda x: x["cases_pred"].cumsum(),
            cumulative_removed_pred=lambda x: x["removed_pred"].cumsum(),
        )

        return forecasts
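
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original source): fit
# StemPoissonTimeVaryRegressor without time varying parameters
# (date_splits=None). Using date_splits requires the split_columns_dates
# helper, which is not shown in this listing. Column names follow what
# fit()/forecast() above actually read; the values are made up.
# ---------------------------------------------------------------------------
def example_stem_poisson_time_vary_regressor() -> pd.DataFrame:
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(2)
    n_days, population = 60, 1_000_000

    dates = pd.date_range("2021-03-01", periods=n_days)
    active = rng.integers(200, 400, size=n_days).astype(float)
    cumulative = np.cumsum(rng.integers(20, 60, size=n_days)).astype(float)
    susceptible = population - cumulative

    X_demo = pd.DataFrame({
        "date": dates,
        "province": "DemoProvince",
        "population": population,
        "cumulative_cases": cumulative,
        "active_cases": active,
        "percent_susceptible": susceptible / population,
        "log_active_cases_yesterday": np.log(np.roll(active, 1) + 1),
        "log_percent_susceptible_yesterday":
        np.log(np.roll(susceptible, 1) / population),
    })
    Y_demo = pd.DataFrame({
        "cases": rng.poisson(30, size=n_days),
        "removed": rng.poisson(25, size=n_days),
    })

    model = StemPoissonTimeVaryRegressor(date_splits=None)
    model.fit(X_demo, Y_demo)
    return model.forecast(h=7)
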
############################################################
# https://pygam.readthedocs.io/en/latest/notebooks/tour_of_pygam.html

#Fitting and plotting interactions with te()

from pygam import PoissonGAM, s, te
from pygam.datasets import chicago

X, y = chicago(return_X_y=True)
X.shape

gam = PoissonGAM(s(0, n_splines=200) + te(3, 1) + s(2)).fit(X, y)

import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

plt.ion()
plt.rcParams['figure.figsize'] = (12, 8)

XX = gam.generate_X_grid(term=1, meshgrid=True)
Z = gam.partial_dependence(term=1, X=XX, meshgrid=True)

ax = plt.axes(projection='3d')
ax.plot_surface(XX[0], XX[1], Z, cmap='viridis')

# Simple interactions, compare with te()

from pygam import LinearGAM, s
from pygam.datasets import toy_interaction

X, y = toy_interaction(return_X_y=True)

# Completing the tour's simple-interaction example (cut off in the original snippet)
gam = LinearGAM(s(0, by=1)).fit(X, y)
gam.summary()

# Red wine quality example. The original snippet did not show where `redwine`
# was loaded; assuming the UCI wine-quality (red) CSV, which is semicolon-separated.
import numpy as np
import pandas as pd

redwine = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
    sep=";")
redwine.describe()

redwine['quality'].value_counts(sort=False)
redwine.hist('quality')

X = redwine.drop('quality', axis=1).values
y = redwine['quality']
feature_names = redwine.columns[:-1]

#build linear and poisson gam

from pygam import PoissonGAM, LinearGAM

lams = np.logspace(-10, 10, 10)

poiss = PoissonGAM().gridsearch(X, y, lam=lams)
poiss.summary()

lin = LinearGAM().gridsearch(X, y, lam=lams)
lin.summary()

plt.figure()
fig, axs = plt.subplots(1, 11, figsize=(40, 8))
for i, ax in enumerate(axs):
    XX = poiss.generate_X_grid(term=i)
    ax.plot(XX[:, i], poiss.partial_dependence(term=i, X=XX))
    ax.plot(XX[:, i],
            poiss.partial_dependence(term=i, X=XX, width=.95)[1],
            c='r',
            ls='--')
    # The original snippet was truncated after `if i == 0:`; the remainder of
    # the loop body below is an assumed completion.
    if i == 0:
        ax.set_ylabel('partial dependence')
    ax.set_title(feature_names[i])