def model_intercept(data: pd.DataFrame,
                    dep_var: str,
                    prediction: pd.Series,
                    weight_data: pd.DataFrame = None,
                    dep_var_se: str = None,
                    dep_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
                    dep_se_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
                    dep_trans_out: Callable[[pd.Series], pd.Series] = lambda x: x,
                    verbose: bool = True):
    """Shift ``prediction`` by a constant fitted to the observed residuals.

    Observations and prediction are transformed with ``dep_trans_in``, an
    intercept-only MRBRT model is fit to ``observed - prediction``, the
    fitted intercept is added to the prediction, and the result is passed
    through ``dep_trans_out``.

    Args:
        data: Observed data containing ``dep_var``; it is copied and then
            reshaped with ``reshape_data_long``.
        dep_var: Name of the dependent variable column in ``data``.
        prediction: Prediction to be shifted. It is not modified in place.
        weight_data: Optional frame holding ``dep_var_se``; when omitted
            every observation gets a standard error of 1.
        dep_var_se: Column of ``weight_data`` holding the standard errors.
        dep_trans_in: Transform applied to both observations and prediction
            before fitting.
        dep_se_trans_in: Transform applied to the standard errors.
        dep_trans_out: Transform applied to the shifted prediction.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        The shifted prediction after ``dep_trans_out``.

    Raises:
        ValueError: If the dates in ``data`` and ``weight_data`` differ.
    """
    data = data.copy()
    data[dep_var] = dep_trans_in(data[dep_var])
    prediction = dep_trans_in(prediction)

    data = reshape_data_long(data, dep_var)
    if weight_data is None:
        data['se'] = 1.
    else:
        weight_data = reshape_data_long(weight_data, dep_var_se)
        if (data['date'] != weight_data['date']).any():
            raise ValueError(
                'Dates in `data` and `weight_data` not identical.')
        data['se'] = dep_se_trans_in(weight_data[dep_var_se])

    # Residuals: observed minus predicted.
    # NOTE(review): presumably `prediction` is indexed by date so the
    # subtraction aligns on it -- confirm against callers.
    data = data.set_index('date').sort_index()
    data[dep_var] = data[dep_var] - prediction
    data = data.reset_index().dropna()
    data['intercept'] = 1

    mr_data = MRData()
    mr_data.load_df(
        data,
        col_obs=dep_var,
        col_obs_se='se',
        col_covs=['intercept'],
        col_study_id='date',
    )
    intercept_model = LinearCovModel('intercept', use_re=False)
    mr_model = MRBRT(mr_data, [intercept_model])
    mr_model.fit_model()

    intercept = mr_model.beta_soln
    # Build a new object rather than using `+=`: with the default identity
    # `dep_trans_in`, `prediction` is still the caller's Series and an
    # in-place add would silently modify it.
    prediction = prediction + intercept

    return dep_trans_out(prediction)
def __init__(self, t, y, spline_options=None):
    """Set up the MRBRT spline model of ``y`` as a function of ``t``.

    Args:
        t (np.ndarray): Independent variable.
        y (np.ndarray): Dependent variable.
        spline_options (dict | None, optional): Spline prior options,
            forwarded as keyword arguments to the spline covariate model.
            ``None`` means no extra options.
    """
    self.t = t
    self.y = y
    self.spline_options = spline_options if spline_options is not None else {}

    # Observation table for MRBRT: every row shares one study id and is
    # given a standard error of 1 / exp(y).
    frame = pd.DataFrame({
        'y': self.y,
        'y_se': 1.0/np.exp(self.y),
        't': self.t,
        'study_id': 1,
    })
    mr_data = MRData(df=frame,
                     col_obs='y',
                     col_obs_se='y_se',
                     col_covs=['t'],
                     col_study_id='study_id',
                     add_intercept=True)

    # Intercept carries a random effect whose gamma prior is bounded to
    # [0, 0]; the time covariate is modelled with a spline.
    intercept_cov = LinearCovModel(alt_cov='intercept',
                                   use_re=True,
                                   prior_gamma_uniform=np.array([0.0, 0.0]),
                                   name='intercept')
    time_cov = LinearCovModel(alt_cov='t',
                              use_re=False,
                              use_spline=True,
                              **self.spline_options,
                              name='time')

    self.mr_model = MRBRT(mr_data, cov_models=[intercept_cov, time_cov])
    self.spline = time_cov.create_spline(mr_data)
    # Populated once the model has been fit.
    self.spline_coef = None
def fit_model(self, **fit_options):
    """Fit an MRBRT model on the attached data and keep its beta solution.

    Args:
        **fit_options: Keyword arguments forwarded to ``MRBRT.fit_model``.
            They override the defaults built here (``x0``,
            ``inner_max_iter=500``, ``inner_print_level=5``).
    """
    self._assert_has_data()

    # Starting point: betas from `solve_ls`, gammas at zero.
    beta_start = solve_ls(self.mat, self.data.obs, self.data.obs_se)
    mrbrt = MRBRT(self.data, self.cov_models)
    gamma_start = np.zeros(mrbrt.num_z_vars)

    options = {
        'x0': np.hstack((beta_start, gamma_start)),
        'inner_max_iter': 500,
        'inner_print_level': 5,
    }
    # Caller-supplied options take precedence over the defaults.
    options.update(fit_options)

    mrbrt.fit_model(**options)
    self.soln = mrbrt.beta_soln
def predict_time_series(
    day0: pd.Timestamp,
    dep_var: str,
    mr_model: MRBRT,
    dep_trans_out: Callable[[pd.Series], pd.Series],
    diff: bool,
) -> pd.Series:
    """Predict a daily series from a fitted time model.

    Predictions are made for every integer ``t`` from 0 through the largest
    ``t`` in the model's data, optionally accumulated, then passed through
    ``dep_trans_out``.

    Args:
        day0: Date corresponding to ``t == 0``.
        dep_var: Name given to the returned series.
        mr_model: Fitted model exposing ``data.to_df()`` and ``predict``.
        dep_trans_out: Transform applied to the predicted values.
        diff: If true, the predictions are cumulatively summed before
            ``dep_trans_out`` is applied.

    Returns:
        The predicted values as a Series named ``dep_var`` and indexed by
        ``date`` (``day0`` plus ``t`` days).
    """
    fit_data = mr_model.data.to_df()
    t = np.arange(0, fit_data['t'].max() + 1)

    pred_data = MRData()
    # Covariates are passed as a list, as at the other `load_df` call sites.
    pred_data.load_df(pd.DataFrame({'t': t}), col_covs=['t'])

    pred_values = mr_model.predict(pred_data)
    if diff:
        pred_values = pred_values.cumsum()
    pred_values = dep_trans_out(pred_values)

    pred = pd.DataFrame({
        't': t,
        dep_var: pred_values,
    })
    # Vectorised date arithmetic instead of a per-row `apply`.
    pred['date'] = day0 + pd.to_timedelta(pred['t'], unit='D')

    return pred.set_index('date')[dep_var]
def predict(pred_data: pd.DataFrame,
            hierarchy: pd.DataFrame,
            mr_model: MRBRT,
            pred_replace_dict: Dict[str, str],
            pred_exclude_vars: List[str],
            dep_var: str,
            dep_var_se: str,
            fe_vars: List[str],
            re_vars: List[str],
            group_var: str,
            **kwargs) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Predict ``dep_var`` for each location-date with a fitted model.

    Args:
        pred_data: Prediction covariates with ``location_id`` and ``date``
            columns. The caller's frame is left unmodified.
        hierarchy: Not used by this function.
        mr_model: Fitted model exposing ``predict``.
        pred_replace_dict: Maps a source column to the covariate name it
            should stand in for. Any existing column with the target name
            is dropped before renaming.
        pred_exclude_vars: Covariates whose value is forced to 0.
        dep_var: Name of the predicted column in the outputs.
        dep_var_se: Passed through to ``create_mr_data``.
        fe_vars: Fixed-effect covariate names.
        re_vars: Random-effect covariate names; must be empty.
        group_var: Passed through to ``create_mr_data``.
        **kwargs: Ignored.

    Returns:
        ``(pred, pred_fe)``: two frames with ``location_id``, ``date`` and
        ``dep_var``. ``pred`` comes from ``mr_model.predict`` with default
        arguments, ``pred_fe`` from the same call with
        ``predict_for_study=False``.

    Raises:
        ValueError: If a name appears twice across the keys of
            ``pred_replace_dict`` and ``fe_vars``, or if ``re_vars`` is
            non-empty (random effects are not propagated).
    """
    keep_vars = list(pred_replace_dict) + fe_vars
    if len(set(keep_vars)) != len(keep_vars):
        raise ValueError('Duplicate in replace_var + fe_vars.')

    # Drop columns that are about to be replaced. `drop` returns a new
    # frame, so the caller's data is not modified (the previous `del`
    # removed the columns from the caller's frame).
    stale_cols = [col for col in set(pred_replace_dict.values())
                  if col in pred_data.columns]
    pred_data = pred_data.drop(columns=stale_cols)
    pred_data = pred_data.rename(columns=pred_replace_dict)

    # Reset the index after filtering: the predictions below are wrapped in
    # frames with a fresh RangeIndex and joined with `concat(axis=1)`, which
    # aligns on index. Without the reset, any dropped row would misalign
    # ids and predictions.
    # NOTE(review): assumes `mr_model.predict` returns values in the row
    # order of `pred_data` -- confirm against `create_mr_data`.
    pred_data = (pred_data
                 .loc[:, ['location_id', 'date'] + fe_vars]
                 .dropna()
                 .drop_duplicates()
                 .reset_index(drop=True))

    for pred_exclude_var in pred_exclude_vars:
        pred_data[pred_exclude_var] = 0

    if re_vars:
        raise ValueError(
            'Not propagating random effects (finish `apply_parent_random_effects` method).'
        )

    pred_mr_data = create_mr_data(pred_data,
                                  dep_var,
                                  dep_var_se,
                                  fe_vars,
                                  group_var,
                                  pred=True)
    pred_values = mr_model.predict(pred_mr_data)
    pred_fe_values = mr_model.predict(pred_mr_data, predict_for_study=False)

    id_cols = pred_data.loc[:, ['location_id', 'date']]
    pred = pd.concat([id_cols, pd.DataFrame({dep_var: pred_values})], axis=1)
    pred_fe = pd.concat([id_cols, pd.DataFrame({dep_var: pred_fe_values})],
                        axis=1)

    return pred, pred_fe
def run_mr_model(model_data: pd.DataFrame,
                 dep_var: str,
                 dep_var_se: str,
                 fe_vars: List[str],
                 re_vars: List[str],
                 group_var: str,
                 inlier_pct: float = 1.,
                 inner_max_iter: int = 1000,
                 outer_max_iter: int = 1000,
                 prior_dict: Dict = None,
                 global_mr_data: MRData = None,
                 **kwargs) -> MRBRT:
    """Build and fit a linear MRBRT model with one covariate model per FE var.

    Args:
        model_data: Input data handed to ``create_mr_data``.
        dep_var: Dependent variable column.
        dep_var_se: Standard-error column for the dependent variable.
        fe_vars: Fixed-effect covariates; each gets a ``LinearCovModel``.
        re_vars: Covariates that also get a random effect. Must be a subset
            of ``fe_vars``.
        group_var: Grouping column handed to ``create_mr_data``.
        inlier_pct: Forwarded to ``MRBRT``.
        inner_max_iter: Forwarded to ``MRBRT.fit_model``.
        outer_max_iter: Forwarded to ``MRBRT.fit_model``.
        prior_dict: Maps a covariate name to keyword arguments for its
            ``LinearCovModel``. Covariates missing from the mapping (or a
            ``None`` mapping) get no extra arguments.
        global_mr_data: Passed to ``MRBRT.attach_data`` before fitting.
        **kwargs: Ignored.

    Returns:
        The fitted ``MRBRT`` model.

    Raises:
        ValueError: If ``re_vars`` contains a name that is not in
            ``fe_vars``.
    """
    # Validate the cheap precondition before building any model data.
    if set(re_vars) - set(fe_vars):
        raise ValueError('RE vars must also be FE vars.')

    mr_data = create_mr_data(model_data, dep_var, dep_var_se, fe_vars,
                             group_var)

    if prior_dict is None:
        prior_dict = {}
    # `.get` lets callers supply priors for only some covariates instead of
    # failing with a KeyError on the first one that is missing.
    cov_models = [
        LinearCovModel(fe_var,
                       use_re=fe_var in re_vars,
                       **prior_dict.get(fe_var, {}))
        for fe_var in fe_vars
    ]

    # Construction, data attachment and fitting all run with stdout
    # suppressed.
    with suppress_stdout():
        mr_model = MRBRT(mr_data, cov_models, inlier_pct=inlier_pct)
        mr_model.attach_data(global_mr_data)
        mr_model.fit_model(inner_max_iter=inner_max_iter,
                           outer_max_iter=outer_max_iter)

    return mr_model
def create_model(self,
                 covs: List[str],
                 prior_type: str = 'Laplace',
                 laplace_std: float = None) -> MRBRT:
    """Build an MRBRT model with Laplace or Gaussian beta priors.

    In the Laplace case, covariates already in ``self.selected_covs`` get
    their stored Gaussian prior and all others get a zero-centred Laplace
    prior. In the Gaussian case, a covariate gets its stored Gaussian prior
    if one exists in ``self.beta_gprior``, otherwise none. Every covariate
    is created with ``use_re=True``; its gamma prior is the loose or the
    zero uniform prior depending on ``self.use_re[cov]``.

    Args:
        covs (List[str]): Covariates to include in the model.
        prior_type (str): Either ``'Laplace'`` or ``'Gaussian'``.
        laplace_std (float): Standard deviation of the Laplace prior.
            Required when ``prior_type`` is ``'Laplace'``.

    Return:
        MRBRT: The created (unfitted) model object.
    """
    assert prior_type in ['Laplace', 'Gaussian'
                          ], "Prior type can only 'Laplace' or 'Gaussian'."
    laplace = prior_type == 'Laplace'
    if laplace:
        assert laplace_std is not None, "Use Laplace prior must provide standard deviation."

    cov_models = []
    for cov in covs:
        if laplace:
            if cov in self.selected_covs:
                beta_priors = {
                    'prior_beta_laplace': None,
                    'prior_beta_gaussian': self.beta_gprior[cov],
                }
            else:
                beta_priors = {
                    'prior_beta_laplace': np.array([0.0, laplace_std]),
                    'prior_beta_gaussian': None,
                }
        elif cov in self.beta_gprior:
            beta_priors = {'prior_beta_gaussian': self.beta_gprior[cov]}
        else:
            beta_priors = {'prior_beta_gaussian': None}

        if self.use_re[cov]:
            gamma_uprior = self.loose_gamma_uprior
        else:
            gamma_uprior = self.zero_gamma_uprior

        cov_models.append(
            LinearCovModel(cov,
                           use_re=True,
                           prior_gamma_uniform=gamma_uprior,
                           **beta_priors))

    return MRBRT(self.data,
                 cov_models=cov_models,
                 inlier_pct=self.inlier_pct)
def __init__(self,
             t,
             y,
             spline_options=None,
             se_power=1.0,
             space='ln daily',
             max_iter=50):
    """Set up the MRBRT spline model in the requested fitting space.

    Args:
        t (np.ndarray): Independent variable.
        y (np.ndarray): Dependent variable, assumed to be daily cases.
        spline_options (dict | None, optional): Spline prior options,
            forwarded as keyword arguments to the spline covariate model.
        se_power (float): A number between 0 and 1 that scales the
            standard error; 0 gives every point a standard error of 1.
        space (str): Fitting space, one of ``'daily'``, ``'ln daily'``,
            ``'cumul'`` or ``'ln cumul'``. The ``cumul`` spaces fit the
            cumulative sum of ``y``; the ``ln`` spaces keep only strictly
            positive values and fit their logarithm.
        max_iter (int): Maximum number of iterations.
    """
    self.space = space
    assert self.space in ['daily', 'ln daily', 'cumul', 'ln cumul'], "spline_space must be one of 'daily'," \
                                                                     " 'ln daily', 'cumul', 'ln cumul' space."

    # Everything that is not a daily space works on the cumulative sum.
    if self.space not in ('daily', 'ln daily'):
        y = np.cumsum(y)

    if self.space in ('ln daily', 'ln cumul'):
        # The log is only defined for strictly positive values.
        positive = y > 0.0
        self.t = t[positive]
        self.y = np.log(y[positive])
    else:
        self.t = t
        self.y = y

    self.spline_options = spline_options if spline_options is not None else {}

    self.se_power = se_power
    assert 0 <= self.se_power <= 1, "spline se_power has to be between 0 and 1."
    # NOTE(review): the 1 / exp(y)**se_power weighting is applied in every
    # space, including 'daily' and 'cumul' where y is not log-transformed --
    # confirm this is intended.
    if self.se_power == 0:
        y_se = np.ones(self.t.size)
    else:
        y_se = 1.0 / np.exp(self.y)**self.se_power

    # Observation table for MRBRT; every row shares one study id.
    frame = pd.DataFrame({
        'y': self.y,
        'y_se': y_se,
        't': self.t,
        'study_id': 1,
    })
    mr_data = MRData(df=frame,
                     col_obs='y',
                     col_obs_se='y_se',
                     col_covs=['t'],
                     col_study_id='study_id',
                     add_intercept=True)

    intercept_cov = LinearCovModel(alt_cov='intercept',
                                   use_re=True,
                                   prior_gamma_uniform=np.array([0.0, 0.0]),
                                   name='intercept')
    time_cov = LinearCovModel(alt_cov='t',
                              use_re=False,
                              use_spline=True,
                              **self.spline_options,
                              name='time')

    self.mr_model = MRBRT(mr_data, cov_models=[intercept_cov, time_cov])
    self.spline = time_cov.create_spline(mr_data)
    # Populated once the model has been fit.
    self.spline_coef = None
    self.max_iter = max_iter
class SplineFit:
    """Fit a spline to ``y`` over ``t`` with an MRBRT model.

    The model has an intercept covariate and a spline over ``t``. After
    ``fit_spline`` the combined coefficients are kept in ``spline_coef``
    and used by ``predict``.
    """

    def __init__(self, t, y, spline_options=None):
        """Constructor of the SplineFit

        Args:
            t (np.ndarray): Independent variable.
            y (np.ndarray): Dependent variable.
            spline_options (dict | None, optional):
                Dictionary of spline prior options, forwarded as keyword
                arguments to the spline covariate model.
        """
        self.t = t
        self.y = y
        self.spline_options = {} if spline_options is None else spline_options

        # create mrbrt object; every row shares one study id and gets a
        # standard error of 1 / exp(y)
        df = pd.DataFrame({
            'y': self.y,
            'y_se': 1.0/np.exp(self.y),
            't': self.t,
            'study_id': 1,
        })
        data = MRData(
            df=df,
            col_obs='y',
            col_obs_se='y_se',
            col_covs=['t'],
            col_study_id='study_id',
            add_intercept=True
        )

        intercept = LinearCovModel(
            alt_cov='intercept',
            use_re=True,
            prior_gamma_uniform=np.array([0.0, 0.0]),
            name='intercept'
        )
        time = LinearCovModel(
            alt_cov='t',
            use_re=False,
            use_spline=True,
            **self.spline_options,
            name='time'
        )
        self.mr_model = MRBRT(data, cov_models=[intercept, time])
        self.spline = time.create_spline(data)
        # Set by `fit_spline`.
        self.spline_coef = None

    def fit_spline(self):
        """Fit the spline and store the combined coefficients.

        The first fitted coefficient is added to all the remaining ones
        (presumably folding the intercept into the spline coefficients --
        confirm against the spline design matrix).
        """
        self.mr_model.fit_model(inner_max_iter=30)
        # Work on a copy: shifting `beta_soln` itself would silently change
        # the solution stored on the fitted model.
        coef = np.array(self.mr_model.beta_soln, copy=True)
        coef[1:] += coef[0]
        self.spline_coef = coef

    def predict(self, t):
        """Predict the dependent variable, given independent variable.

        Args:
            t (np.ndarray): Independent variable values to predict at.

        Returns:
            The spline design matrix at ``t`` multiplied by the fitted
            coefficients. ``fit_spline`` must have been called first.
        """
        mat = self.spline.design_mat(t)
        return mat.dot(self.spline_coef)
def estimate_time_series(
    data: pd.DataFrame,
    spline_options: Dict,
    n_knots: int,
    dep_var: str,
    dep_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
    weight_data: pd.DataFrame = None,
    dep_var_se: str = None,
    dep_se_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
    diff: bool = False,
    num_submodels: int = 25,
    single_random_knot: bool = False,
    min_interval_days: int = 7,
    dep_trans_out: Callable[[pd.Series], pd.Series] = lambda x: x,
    split_l_interval: bool = False,
    split_r_interval: bool = False,
    verbose: bool = False,
) -> Tuple[pd.DataFrame, pd.Series, MRBeRT]:
    """Smooth a time series with a spline over days since the first date.

    Args:
        data: Frame with a ``date`` column and ``dep_var``. It is copied,
            not modified.
        spline_options: Extra keyword arguments for the spline
            ``LinearCovModel``.
        n_knots: Number of base spline knots.
        dep_var: Name of the dependent variable column.
        dep_trans_in: Transform applied to ``dep_var`` before fitting.
        weight_data: Optional frame holding ``dep_var_se``; when omitted
            every observation gets a standard error of 1.
        dep_var_se: Column of ``weight_data`` holding the standard errors.
        dep_se_trans_in: Transform applied to the standard errors.
        diff: Fit the first difference of ``dep_var`` and accumulate the
            predictions afterwards.
        num_submodels: More than 1 fits an ``MRBeRT`` ensemble over sampled
            knots; otherwise a single ``MRBRT`` model is fit.
        single_random_knot: With ``num_submodels == 1``, take the knots
            from ``get_ensemble_knots`` instead of spacing them evenly.
        min_interval_days: Minimum knot spacing in days; it is divided by
            the largest ``t`` before being passed to ``get_ensemble_knots``.
        dep_trans_out: Transform applied to the predictions.
        split_l_interval: Add a knot at the midpoint of the first interval.
        split_r_interval: Add a knot at the midpoint of the last interval.
        verbose: Log progress messages.

    Returns:
        ``(data, smooth_data, mr_model)``: the model input indexed by
        ``date`` with columns ``y`` and ``se``, the smoothed series from
        ``predict_time_series``, and the fitted model (``MRBeRT`` when
        ``num_submodels > 1``, otherwise ``MRBRT``).

    Raises:
        ValueError: If the dates of ``data`` and ``weight_data`` differ, or
            if an interval split is requested together with an ensemble.
    """
    if verbose:
        logger.info('Formatting data.')
    data = data.copy()
    data[dep_var] = dep_trans_in(data[dep_var])
    if diff:
        if verbose:
            logger.info(
                'For diff model, drop day1 (i.e., if day0 is > 0, day0->day1 diff would be hugely negative).'
            )
        data[dep_var] = data[dep_var].diff()
        # Keep only entries whose own difference is defined; this blanks
        # the first two rows of the differenced series.
        data[dep_var] = data[dep_var][data[dep_var].diff().notnull()]

    # More than one column under `dep_var` means wide data that must be
    # reshaped to long format first.
    if data[[dep_var]].shape[1] > 1:
        reshape = True
        data = reshape_data_long(data, dep_var)
        if weight_data is not None:
            weight_data = reshape_data_long(weight_data, dep_var_se)
    else:
        reshape = False

    if weight_data is not None:
        if (data['date'] != weight_data['date']).any():
            raise ValueError(
                'Dates in `data` and `weight_data` not identical.')
        data['se'] = dep_se_trans_in(weight_data[dep_var_se])
    else:
        data['se'] = 1.

    data = data.rename(columns={dep_var: 'y'})
    day0 = data['date'].min()
    data = data.loc[:, ['date', 'y', 'se']]
    start_len = len(data)
    data = data.dropna()
    end_len = len(data)
    if start_len != end_len and not reshape:
        if verbose:
            logger.debug('NAs in data')
    data['t'] = (data['date'] - day0).dt.days

    # No study id is supplied to the model data.
    col_args = {
        'col_obs': 'y',
        'col_obs_se': 'se',
        'col_covs': ['t'],
    }

    if verbose:
        logger.info('Getting base knots.')
    # Minimum knot spacing as a fraction of the observed time span.
    min_interval = min_interval_days / data['t'].max()
    if num_submodels == 1 and single_random_knot:
        spline_knots = get_ensemble_knots(n_knots, min_interval, 1)[0]
    else:
        spline_knots = np.linspace(0., 1., n_knots)

    if split_l_interval or split_r_interval:
        if num_submodels > 1:
            raise ValueError(
                'Would need to set up functionality to split segments for ensemble.'
            )
        if split_l_interval:
            n_knots += 1
            # Insert the midpoint *between* the first two knots (index 1).
            # Index 0 would place it before the first knot and leave the
            # knot vector unsorted.
            spline_knots = np.insert(spline_knots, 1,
                                     spline_knots[:2].mean())
        if split_r_interval:
            n_knots += 1
            # Index -1 inserts before the last knot, i.e. between the last
            # two.
            spline_knots = np.insert(spline_knots, -1,
                                     spline_knots[-2:].mean())

    if verbose:
        logger.info('Creating model data.')
    mr_data = MRData()
    mr_data.load_df(data, **col_args)
    spline_model = LinearCovModel('t',
                                  use_re=False,
                                  use_spline=True,
                                  use_spline_intercept=True,
                                  spline_knots=spline_knots,
                                  **spline_options)

    if num_submodels > 1:
        if verbose:
            logger.info('Sampling knots.')
        ensemble_knots = get_ensemble_knots(n_knots, min_interval,
                                            num_submodels)

        if verbose:
            logger.info('Initializing model.')
        mr_model = MRBeRT(mr_data, spline_model, ensemble_knots)
    else:
        if verbose:
            logger.info('Initializing model.')
        mr_model = MRBRT(mr_data, [spline_model])

    if verbose:
        logger.info('Fitting model.')
    mr_model.fit_model()

    if num_submodels > 1:
        if verbose:
            logger.info('Scoring submodels.')
        mr_model.score_model()

    data = data.set_index('date')[['y', 'se']]

    if verbose:
        logger.info('Making prediction.')
    smooth_data = predict_time_series(
        day0=day0,
        dep_var=dep_var,
        mr_model=mr_model,
        dep_trans_out=dep_trans_out,
        diff=diff,
    )

    return data, smooth_data, mr_model