def setup_pipeline(self):
    """
    Build the predictive validity model and the residual forecaster.

    Stores a new PVModel on ``self.pv`` (configured from this pipeline's
    data, column names and prediction space, with a copy of this pipeline
    from ``self.generate()`` as its model generator) and a new Forecaster
    on ``self.forecaster``.

    Should be run at the end of the inheriting class' init so that the
    copy made by self.generate() carries the model settings to be run
    for all models.
    """
    # Collect the PVModel settings first; self.generate() is evaluated
    # last, exactly as it would be in a direct keyword call.
    pv_settings = {
        'data': self.all_data,
        'col_t': self.col_t,
        'col_group': self.col_group,
        'col_obs': self.col_obs,
        'col_obs_compare': self.col_obs_compare,
        'predict_space': self.predict_space,
        'model_generator': self.generate(),
    }
    self.pv = PVModel(**pv_settings)
    self.forecaster = Forecaster()
class ModelPipeline:
    """
    Base class for a model generator. If a model needs to have initial
    parameters started for the predictive validity, put that in
    run_init_model.
    """

    def __init__(self, all_data, col_t, col_obs, col_group, col_obs_compare,
                 all_cov_names, fun, predict_space, obs_se_func=None):
        """
        Base class for a model pipeline. At minimum needs the following
        arguments for a model pipeline.

        Note that ``all_data`` is modified in place: it is sorted by
        ``[col_group, col_t]`` and, when ``obs_se_func`` is given, gains an
        ``'obs_se'`` column.

        Args:
            all_data: (pd.DataFrame) of *all* the data that will go into
                this modeling pipeline
            col_t: (str) name of the column with time
            col_group: (str) name of the column with the group in it
            col_obs: (str) the name of the column with observations for
                fitting the model
            col_obs_compare: (str) the name of the column that will be used
                for predictive validity comparison
            all_cov_names: List[str] list of name(s) of covariate(s). Not the
                same as the covariate specifications that are required by
                CurveModel in order of parameters. You should exclude
                intercept from this list.
            fun: (callable) the space to fit in, one of curvefit.functions
            predict_space: (callable) the space to do predictive validity in,
                one of curvefit.functions
            obs_se_func: (optional) function to get observation standard
                error from col_t

        Attributes:
            self.pv: (curvefit.pv.PVModel) predictive validity model
            self.forecaster: (curvefit.forecaster.Forecaster) residual
                forecasting tool
            self.mean_predictions: (dict) dictionary of mean predictions
                keyed by group
            self.simulated_data: (dict) dictionary of simulated datasets
                keyed by group
            self.draws: (dict) dictionary of resulting draws keyed by group
        """
        self.all_data = all_data
        self.col_t = col_t
        self.col_group = col_group
        self.col_obs = col_obs
        self.col_obs_compare = col_obs_compare
        self.all_cov_names = all_cov_names
        self.fun = fun
        self.predict_space = predict_space
        self.obs_se_func = obs_se_func

        if self.obs_se_func is not None:
            # Derive the observation standard error from the time column.
            self.col_obs_se = 'obs_se'
            self.all_data[self.col_obs_se] = self.all_data[self.col_t].apply(
                self.obs_se_func)
        else:
            self.col_obs_se = None

        # these are the attributes that can't be used to initialize a
        # CurveModel but are needed to initialize the ModelPipeline
        self.pop_cols = [
            'all_data', 'all_cov_names', 'col_obs_compare', 'predict_space',
            'obs_se_func'
        ]

        self.all_data.sort_values([col_group, col_t], inplace=True)
        self.groups = sorted(self.all_data[self.col_group].unique())

        # Populated by setup_pipeline() / create_draws().
        self.pv = None
        self.forecaster = None
        self.mean_predictions = None
        self.simulated_data = None
        self.draws = None
        self.draw_models = None

    def _require_setup(self, *attributes):
        """
        Raise a RuntimeError if any of the named attributes is still None.

        Args:
            *attributes: (str) attribute names (e.g. 'pv', 'forecaster')
                that setup_pipeline() is responsible for populating

        Raises:
            RuntimeError: if one or more of the attributes is missing or None
        """
        missing = [
            name for name in attributes if getattr(self, name, None) is None
        ]
        if missing:
            raise RuntimeError(
                "Pipeline is not set up (missing: " + ", ".join(missing)
                + "). Need to first call self.setup_pipeline()."
            )

    def setup_pipeline(self):
        """
        Sets up the pipeline for running predictive validity and forecasting
        data out. Should be run at the end of the inheriting class' init so
        that the self.generate() gets the model settings to be run for all
        models.
        """
        self.pv = PVModel(data=self.all_data,
                          col_t=self.col_t,
                          col_group=self.col_group,
                          col_obs=self.col_obs,
                          col_obs_compare=self.col_obs_compare,
                          predict_space=self.predict_space,
                          model_generator=self.generate())
        self.forecaster = Forecaster()

    def run_init_model(self):
        """
        Runs the model that doesn't need to be run multiple times.
        The base implementation only calls self.refresh().
        """
        self.refresh()

    def refresh(self):
        """
        Clear the current model results. The base implementation does
        nothing.
        """
        pass

    def generate(self):
        """
        Generate a copy of this class.

        Returns:
            a deep copy of this pipeline instance
        """
        return deepcopy(self)

    def fit(self, df, group=None):
        """
        Function to fit the model with a given data frame. The base
        implementation does nothing.

        Args:
            df: (pd.DataFrame)
            group: (str) optional group to use in whatever capacity is
                needed for calling this function
        """
        pass

    def predict(self, times, predict_space, predict_group):
        """
        Function to create predictions based on the model fit. The base
        implementation does nothing and returns None.

        Args:
            times: (np.array) of times to predict at
            predict_space: (callable) curvefit.functions function to predict
                in that space
            predict_group: which group to make predictions for
        """
        pass

    def run_predictive_validity(self, theta):
        """
        Run predictive validity for the full model.

        Args:
            theta: amount of scaling for residuals relative to prediction.

        Raises:
            RuntimeError: if setup_pipeline() has not been called
        """
        self._require_setup('pv')
        self.pv.run_pv(theta=theta)

    def fit_residuals(self, smoothed_radius, exclude_below, exclude_groups):
        """
        Fits residuals given a smoothed radius, and some models to exclude.
        Exclude below excludes models with less than that many data points.
        Exclude groups excludes all models from the list of groups
        regardless of the data points.

        Args:
            smoothed_radius: List[int] 2-element list of amount of smoothing
                for the residuals
            exclude_groups: List[str] which groups to exclude from the
                residual analysis
            exclude_below: (int) observations with less than exclude_below
                will be excluded from the analysis

        Raises:
            RuntimeError: if setup_pipeline() has not been called
        """
        self._require_setup('pv', 'forecaster')

        residual_data = self.pv.get_smoothed_residuals(radius=smoothed_radius)
        # Keep only rows with strictly more than exclude_below data points.
        residual_data = residual_data.loc[
            residual_data['num_data'] > exclude_below].copy()
        residual_data = residual_data.loc[
            ~residual_data[self.col_group].isin(exclude_groups)].copy()

        self.forecaster.fit_residuals(
            residual_data=residual_data,
            mean_col='residual_mean',
            std_col='residual_std',
            residual_covariates=['far_out', 'num_data'],
            residual_model_type='linear')

    def create_draws(self, num_draws, num_forecast_out, prediction_times,
                     theta=1, std_threshold=1e-2):
        """
        Generate draws for a model pipeline, smoothing over a neighbor
        radius of residuals for far out and num data points.

        The pipeline is fit once on all of the data to get mean predictions
        and simulated datasets per group; a copy of the pipeline is then
        refit on each simulated dataset to produce one draw per group.

        Args:
            num_draws: (int) the number of draws to take
            num_forecast_out: (int) how far out into the future should
                residual simulations be taken
            prediction_times: times to produce final predictions at (passed
                to self.predict as ``times``)
            std_threshold: (float) floor for standard deviation
            theta: (float) between 0 and 1, how much scaling of the
                residuals to do relative to the prediction mean

        Returns:
            self, with mean_predictions, simulated_data and draws populated

        Raises:
            RuntimeError: if setup_pipeline() or predictive validity has not
                been run yet
        """
        self._require_setup('pv', 'forecaster')
        if self.pv.all_residuals is None:
            raise RuntimeError(
                "Need to first run predictive validity with self.run_predictive_validity."
            )

        # Copy the pipeline before fitting so the draw model starts from the
        # same state this pipeline had on entry.
        generator = self.generate()

        self.mean_predictions = {}
        self.simulated_data = {}
        self.draws = {}

        self.fit(df=self.all_data)

        for group in self.groups:
            self.simulated_data[group] = self.forecaster.simulate(
                mp=self,
                far_out=num_forecast_out,
                num_simulations=num_draws,
                group=group,
                theta=theta,
                epsilon=std_threshold)
            self.mean_predictions[group] = self.predict(
                times=prediction_times,
                predict_space=self.predict_space,
                predict_group=group)

        for group in self.groups:
            self.draws[group] = []

        for i in range(num_draws):
            # Stack the i-th simulated dataset of every group into one frame.
            new_data = pd.concat(
                [self.simulated_data[group][i] for group in self.groups])
            print(f"Creating {i}th draw.", end='\r')
            generator.fit(df=new_data)

            for group in self.groups:
                predictions = generator.predict(
                    times=prediction_times,
                    predict_space=self.predict_space,
                    predict_group=group)
                self.draws[group].append(predictions)

        return self
class ModelPipeline:
    """
    Base class for a model generator. If a model needs to have initial
    parameters started for the predictive validity, put that in
    run_init_model.
    """

    # Groups left out of the residual fit by run() when the caller does not
    # supply exclude_groups.
    DEFAULT_EXCLUDE_GROUPS = ('Wuhan City, Hubei',)

    def __init__(self, all_data, col_t, col_obs, col_group, col_obs_compare,
                 all_cov_names, fun, predict_space, obs_se_func=None):
        """
        Base class for a model pipeline. At minimum needs the following
        arguments for a model pipeline.

        Note that ``all_data`` is modified in place: it is sorted by
        ``[col_group, col_t]`` and, when ``obs_se_func`` is given, gains an
        ``'obs_se'`` column.

        Args:
            all_data: (pd.DataFrame) of *all* the data that will go into
                this modeling pipeline
            col_t: (str) name of the column with time
            col_group: (str) name of the column with the group in it
            col_obs: (str) the name of the column with observations for
                fitting the model
            col_obs_compare: (str) the name of the column that will be used
                for predictive validity comparison
            all_cov_names: List[str] list of name(s) of covariate(s). Not the
                same as the covariate specifications that are required by
                CurveModel in order of parameters. You should exclude
                intercept from this list.
            fun: (callable) the space to fit in, one of curvefit.functions
            predict_space: (callable) the space to do predictive validity in,
                one of curvefit.functions
            obs_se_func: (optional) function to get observation standard
                error from col_t

        Attributes:
            self.pv: (curvefit.pv.PVModel) predictive validity model
            self.forecaster: (curvefit.forecaster.Forecaster) residual
                forecasting tool
            self.mean_predictions: (dict) dictionary of mean predictions
                keyed by group
            self.simulated_data: (dict) dictionary of simulated datasets
                keyed by group
            self.draws: (dict) dictionary of resulting draws keyed by group
        """
        self.all_data = all_data
        self.col_t = col_t
        self.col_group = col_group
        self.col_obs = col_obs
        self.col_obs_compare = col_obs_compare
        self.all_cov_names = all_cov_names
        self.fun = fun
        self.predict_space = predict_space
        self.obs_se_func = obs_se_func

        if self.obs_se_func is not None:
            # Derive the observation standard error from the time column.
            self.col_obs_se = 'obs_se'
            self.all_data[self.col_obs_se] = self.all_data[self.col_t].apply(
                self.obs_se_func)
        else:
            self.col_obs_se = None

        # these are the attributes that can't be used to initialize a
        # CurveModel but are needed to initialize the ModelPipeline
        self.pop_cols = [
            'all_data', 'all_cov_names', 'col_obs_compare', 'predict_space',
            'obs_se_func'
        ]

        self.all_data.sort_values([col_group, col_t], inplace=True)
        self.groups = sorted(self.all_data[self.col_group].unique())

        # Populated by setup_pipeline() / create_draws().
        self.pv = None
        self.forecaster = None
        self.mean_predictions = None
        self.simulated_data = None
        self.draws = None
        self.draw_models = None

    @staticmethod
    def _validate_run_args(n_draws, cv_threshold, smoothed_radius,
                           exclude_below):
        """
        Check the argument types accepted by run().

        Explicit checks are used instead of ``assert`` so that validation
        still happens when Python runs with optimizations (-O) enabled.

        Raises:
            TypeError: if any argument has an unsupported type
        """
        for name, value in (('n_draws', n_draws),
                            ('exclude_below', exclude_below)):
            # bool is a subclass of int but is never a meaningful count here.
            if isinstance(value, bool) or not isinstance(value, int):
                raise TypeError(
                    f"{name} must be an int, got {type(value).__name__}.")
        if isinstance(cv_threshold, bool) or not isinstance(
                cv_threshold, (int, float)):
            raise TypeError(
                "cv_threshold must be a float, got "
                f"{type(cv_threshold).__name__}.")
        if not isinstance(smoothed_radius, list):
            raise TypeError(
                "smoothed_radius must be a list, got "
                f"{type(smoothed_radius).__name__}.")

    def _require_setup(self, *attributes):
        """
        Raise a RuntimeError if any of the named attributes is still None.

        Args:
            *attributes: (str) attribute names (e.g. 'pv', 'forecaster')
                that setup_pipeline() is responsible for populating

        Raises:
            RuntimeError: if one or more of the attributes is missing or None
        """
        missing = [
            name for name in attributes if getattr(self, name, None) is None
        ]
        if missing:
            raise RuntimeError(
                "Pipeline is not set up (missing: " + ", ".join(missing)
                + "). Need to first call self.setup_pipeline()."
            )

    def run(self, n_draws, prediction_times, cv_threshold, smoothed_radius,
            exclude_below, exclude_groups=None):
        """
        Runs the whole model with PV and forecasting residuals and creating
        draws.

        Args:
            n_draws: (int) number of draws to produce
            prediction_times: (np.array) array of times to make predictions at
            cv_threshold: (float) lower bound on the coefficient of variation
                for the residuals simulation
            smoothed_radius: List[int] residual smoothing before running the
                residual forecast -- how many neighbors to look at, e.g.
                [3, 3] would smooth over a radius of 3
            exclude_below: (int) exclude results from the predictive validity
                analysis that had less than this many data points -- just
                for going into the regression to predict the coefficient of
                variation (low numbers of data points makes this unstable)
            exclude_groups: (optional) List[str] groups to leave out of the
                residual fitting. Defaults to DEFAULT_EXCLUDE_GROUPS, i.e.
                ['Wuhan City, Hubei'].

        Raises:
            TypeError: if n_draws, cv_threshold, smoothed_radius or
                exclude_below has an unsupported type
            RuntimeError: if setup_pipeline() has not been called
        """
        self._validate_run_args(n_draws=n_draws,
                                cv_threshold=cv_threshold,
                                smoothed_radius=smoothed_radius,
                                exclude_below=exclude_below)
        if exclude_groups is None:
            exclude_groups = list(self.DEFAULT_EXCLUDE_GROUPS)

        # Setup the initial model (optional for some subclasses)
        self.run_init_model()

        # Run predictive validity with a theta = 1, means everything is in
        # relative space -- relative mean bias, relative standard deviation
        # (coefficient of variation)
        self.run_predictive_validity(theta=1)

        # Right now only std_covariates are used.
        self.fit_residuals(smoothed_radius=smoothed_radius,
                           exclude_below=exclude_below,
                           mean_covariates=['num_data_transformed', 'far_out'],
                           std_covariates=['log_num_data_transformed'],
                           exclude_groups=exclude_groups)

        # Create draws. Access them in self.draws by location.
        self.create_draws(num_draws=n_draws,
                          std_threshold=cv_threshold,
                          prediction_times=prediction_times,
                          theta=1)

    def setup_pipeline(self):
        """
        Sets up the pipeline for running predictive validity and forecasting
        data out. Should be run at the end of the inheriting class' init so
        that the self.generate() gets the model settings to be run for all
        models.
        """
        self.pv = PVModel(data=self.all_data,
                          col_t=self.col_t,
                          col_group=self.col_group,
                          col_obs=self.col_obs,
                          col_obs_compare=self.col_obs_compare,
                          predict_space=self.predict_space,
                          model_generator=self.generate())
        self.forecaster = Forecaster()

    def run_init_model(self):
        """
        Runs the model that doesn't need to be run multiple times.
        The base implementation only calls self.refresh().
        """
        self.refresh()

    def refresh(self):
        """
        Clear the current model results. The base implementation does
        nothing.
        """
        pass

    def generate(self):
        """
        Generate a copy of this class.

        Returns:
            a deep copy of this pipeline instance
        """
        return deepcopy(self)

    def fit(self, df, group=None):
        """
        Function to fit the model with a given data frame. The base
        implementation does nothing.

        Args:
            df: (pd.DataFrame)
            group: (str) optional group to use in whatever capacity is
                needed for calling this function
        """
        pass

    def predict(self, times, predict_space, predict_group):
        """
        Function to create predictions based on the model fit. The base
        implementation does nothing and returns None.

        Args:
            times: (np.array) of times to predict at
            predict_space: (callable) curvefit.functions function to predict
                in that space
            predict_group: which group to make predictions for
        """
        pass

    def run_predictive_validity(self, theta):
        """
        Run predictive validity for the full model.

        Args:
            theta: amount of scaling for residuals relative to prediction.

        Raises:
            RuntimeError: if setup_pipeline() has not been called
        """
        self._require_setup('pv')
        self.pv.run_pv(theta=theta)

    def fit_residuals(self, smoothed_radius, mean_covariates, std_covariates,
                      exclude_below, exclude_groups, std_floor=1e-5):
        """
        Fits residuals given a smoothed radius, and some models to exclude.
        Exclude below excludes models with less than that many data points.
        Exclude groups excludes all models from the list of groups
        regardless of the data points.

        Args:
            smoothed_radius: List[int] 2-element list of amount of smoothing
                for the residuals
            mean_covariates: List[str] which covariates to use to predict the
                residuals; choices of num_data, far_out, and data_index
                (where data_index = far_out + num_data)
            std_covariates: List[str] which covariates to use to predict the
                coefficient of variation in the residuals
            exclude_groups: List[str] which groups to exclude from the
                residual analysis
            exclude_below: (int) observations with less than exclude_below
                will be excluded from the analysis
            std_floor: (float) minimum standard deviation (or coefficient of
                variation given theta) for the regression inputs

        Raises:
            RuntimeError: if setup_pipeline() has not been called
        """
        self._require_setup('pv', 'forecaster')

        residual_data = self.pv.get_smoothed_residuals(radius=smoothed_radius)
        # Keep only rows with strictly more than exclude_below data points.
        residual_data = residual_data.loc[
            residual_data['num_data'] > exclude_below].copy()
        residual_data = residual_data.loc[
            ~residual_data[self.col_group].isin(exclude_groups)].copy()

        # Floor the residual standard deviation before it enters the
        # regression.
        residual_data['residual_std'] = residual_data['residual_std'].apply(
            lambda x: max(x, std_floor))

        self.forecaster.fit_residuals(residual_data=residual_data,
                                      mean_col='residual_mean',
                                      std_col='residual_std',
                                      mean_covariates=mean_covariates,
                                      std_covariates=std_covariates,
                                      residual_model_type='linear')

    def create_draws(self, num_draws, prediction_times, theta=1,
                     std_threshold=1e-2):
        """
        Generate draws for a model pipeline, smoothing over a neighbor
        radius of residuals for far out and num data points.

        Args:
            num_draws: (int) the number of draws to take
            prediction_times: times to produce final predictions (draws) at
                (passed to self.predict as ``times``)
            std_threshold: (float) floor for standard deviation
            theta: (float) between 0 and 1, how much scaling of the
                residuals to do relative to the prediction mean

        Returns:
            self, with mean_predictions and draws populated

        Raises:
            RuntimeError: if setup_pipeline() or predictive validity has not
                been run yet
        """
        self._require_setup('pv', 'forecaster')
        if self.pv.all_residuals is None:
            raise RuntimeError(
                "Need to first run predictive validity with self.run_predictive_validity."
            )

        # Get the best fit we can
        self.fit(df=self.all_data)

        self.mean_predictions = {}
        self.draws = {}

        # Get the mean prediction for each group
        for group in self.groups:
            self.mean_predictions[group] = self.predict(
                times=prediction_times,
                predict_space=self.predict_space,
                predict_group=group)

        # Loop through each group, forecasting the residuals and making draws
        for group in self.groups:
            self.draws[group] = self.forecaster.simulate(
                mp=self,
                num_simulations=num_draws,
                prediction_times=prediction_times,
                group=group,
                theta=theta,
                epsilon=std_threshold)

        return self

    def plot_draws(self, prediction_times, sharex, sharey):
        """
        Plot the draws by delegating to plot_uncertainty with this pipeline
        as the generator.

        Args:
            prediction_times: times forwarded to plot_uncertainty
            sharex: forwarded to plot_uncertainty
            sharey: forwarded to plot_uncertainty
        """
        plot_uncertainty(generator=self,
                         sharex=sharex,
                         sharey=sharey,
                         prediction_times=prediction_times)