def test_remove_nan_in_covs(df, covs):
    """Loading a frame with NaN covariates in one row warns and loses a row."""
    # ``.loc`` slices are label-inclusive: this blanks every row up to and
    # including label 0 (the first row for a default RangeIndex).
    df.loc[:0, covs] = np.nan

    mrdata = MRData()
    with pytest.warns(Warning):
        mrdata.load_df(
            df,
            col_obs='obs',
            col_obs_se='obs_se',
            col_covs=covs,
        )

    expected_num_obs = df.shape[0] - 1
    assert mrdata.num_obs == expected_num_obs
def test_covs(df, covs):
    """``num_covs`` equals the number of requested covariates plus one."""
    mrdata = MRData()
    mrdata.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=covs,
    )

    requested = 0 if covs is None else len(covs)
    # One covariate beyond those requested is expected
    # (presumably an automatically added intercept -- TODO confirm).
    assert mrdata.num_covs == requested + 1
def predict_time_series(
    day0: pd.Timestamp,
    dep_var: str,
    mr_model: MRBRT,
    dep_trans_out: Callable[[pd.Series], pd.Series],
    diff: bool,
) -> pd.Series:
    """Predict the fitted curve on every day covered by the model's data.

    Args:
        day0: Calendar date corresponding to ``t == 0``.
        dep_var: Name given to the returned series.
        mr_model: Fitted model exposing ``data.to_df()`` (with a ``'t'``
            column) and ``predict``.
        dep_trans_out: Transform applied to the raw predictions before they
            are returned.
        diff: If true, the raw predictions are treated as increments and are
            cumulatively summed before ``dep_trans_out`` is applied.

    Returns:
        A series named ``dep_var`` indexed by ``'date'``, with one entry per
        day from ``day0`` through the largest ``t`` in the model's data.
    """
    fit_data = mr_model.data.to_df()
    t = np.arange(0, fit_data['t'].max() + 1)

    pred_data = MRData()
    pred_data.load_df(pd.DataFrame({'t': t}), col_covs='t')

    pred_values = mr_model.predict(pred_data)
    if diff:
        # Model was fit on increments; accumulate back to levels.
        pred_values = pred_values.cumsum()
    pred_values = dep_trans_out(pred_values)

    pred_df = pd.DataFrame({
        't': t,
        dep_var: pred_values,
    })
    # Vectorized day offset instead of a per-row ``apply``.
    pred_df['date'] = day0 + pd.to_timedelta(pred_df['t'], unit='D')

    return pred_df.set_index('date')[dep_var]
def test_assert_has_covs(df):
    """``_assert_has_covs`` raises ``ValueError`` for a covariate not loaded."""
    loaded_covs = ['cov0', 'cov1', 'cov2']

    mrdata = MRData()
    mrdata.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=loaded_covs,
    )

    with pytest.raises(ValueError):
        mrdata._assert_has_covs('cov3')
def data(df):
    """Return an ``MRData`` loaded from ``df`` with covariates and study ids.

    Note that ``df`` is modified in place: a five-element ``study_id``
    column is added before loading.
    """
    study_ids = np.array([0, 0, 1, 1, 2])
    df['study_id'] = study_ids

    cov_names = ['cov0', 'cov1', 'cov2']
    mrdata = MRData()
    mrdata.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=cov_names,
        col_study_id='study_id',
    )
    return mrdata
def test_normalize_covs(df, covs):
    """After ``normalize_covs(covs)``, ``is_cov_normalized(covs)`` is truthy."""
    loaded_covs = ['cov0', 'cov1', 'cov2']

    mrdata = MRData()
    mrdata.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=loaded_covs,
    )

    mrdata.normalize_covs(covs)
    assert mrdata.is_cov_normalized(covs)
def test_has_covs(df):
    """``has_covs`` is truthy for loaded covariates and falsy otherwise."""
    mrdata = MRData()
    mrdata.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=['cov0', 'cov1', 'cov2'],
    )

    # (covariate names to query, whether they should be reported as present)
    expectations = (
        (['cov0'], True),
        (['cov0', 'cov1'], True),
        (['cov3'], False),
    )
    for names, expected in expectations:
        assert bool(mrdata.has_covs(names)) == expected
def test_obs(df, obs, obs_se):
    """``obs`` and ``obs_se`` have one entry per row; missing obs are NaN."""
    mrdata = MRData()
    mrdata.load_df(
        df,
        col_obs=obs,
        col_obs_se=obs_se,
        col_covs=['cov0', 'cov1', 'cov2'],
    )

    num_rows = df.shape[0]
    assert mrdata.obs.size == num_rows
    assert mrdata.obs_se.size == num_rows

    # When no observation column is given, every observation must be NaN.
    if obs is None:
        assert np.isnan(mrdata.obs).all()
def test_is_empty(df):
    """``is_empty`` is true before loading, false after, true after reset."""
    mrdata = MRData()
    assert mrdata.is_empty()

    mrdata.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=['cov0', 'cov1', 'cov2'],
    )
    assert not mrdata.is_empty()

    mrdata.reset()
    assert mrdata.is_empty()
def test_get_covs(df):
    """``get_covs`` returns a column for one name and a matrix for a list."""
    cov_names = ['cov0', 'cov1', 'cov2']

    mrdata = MRData()
    mrdata.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=cov_names,
    )

    # A single name is compared against an (n, 1) column of the source frame.
    for name in cov_names:
        single = mrdata.get_covs(name)
        expected_column = df[name].to_numpy().reshape(-1, 1)
        assert np.allclose(single, expected_column)

    # A list of names is compared against the matching columns, in order.
    matrix = mrdata.get_covs(cov_names)
    assert np.allclose(matrix, df[cov_names].to_numpy())
def model_intercept(data: pd.DataFrame,
                    dep_var: str,
                    prediction: pd.Series,
                    weight_data: pd.DataFrame = None,
                    dep_var_se: str = None,
                    dep_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
                    dep_se_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
                    dep_trans_out: Callable[[pd.Series], pd.Series] = lambda x: x,
                    verbose: bool = True):
    """Shift ``prediction`` by an intercept fitted to its residuals.

    The observations and the prediction are both passed through
    ``dep_trans_in``; an intercept-only ``MRBRT`` model is fitted to
    ``data[dep_var] - prediction``; the fitted intercept is added to the
    transformed prediction, which is then passed through ``dep_trans_out``.

    Args:
        data: Observations. Copied before use, so the caller's frame is not
            modified here. It is reshaped with ``reshape_data_long`` (defined
            elsewhere) and must then expose a ``'date'`` column.
        dep_var: Name of the dependent-variable column in ``data``.
        prediction: Prediction to be shifted. It is subtracted from the
            date-indexed observations, so it is presumably indexed by date
            -- verify against callers. The caller's object is not modified.
        weight_data: Optional frame providing ``dep_var_se``. After
            reshaping, its ``'date'`` column must equal that of ``data``.
        dep_var_se: Name of the standard-error column in ``weight_data``.
        dep_trans_in: Transform applied to ``data[dep_var]`` and to
            ``prediction`` before fitting.
        dep_se_trans_in: Transform applied to the standard-error column.
        dep_trans_out: Transform applied to the shifted prediction.
        verbose: Accepted for interface compatibility; not used in this
            function.

    Returns:
        ``dep_trans_out`` applied to the intercept-shifted prediction.

    Raises:
        ValueError: If the dates of ``data`` and ``weight_data`` differ.
    """
    data = data.copy()
    data[dep_var] = dep_trans_in(data[dep_var])
    prediction = dep_trans_in(prediction)

    data = reshape_data_long(data, dep_var)
    if weight_data is not None:
        weight_data = reshape_data_long(weight_data, dep_var_se)
        if (data['date'] != weight_data['date']).any():
            raise ValueError(
                'Dates in `data` and `weight_data` not identical.')
        data['se'] = dep_se_trans_in(weight_data[dep_var_se])
    else:
        # No weights supplied: every observation gets the same unit SE.
        data['se'] = 1.

    # Residuals are computed on a date index so subtraction aligns by date.
    data = data.set_index('date').sort_index()
    data[dep_var] = data[dep_var] - prediction
    data = data.reset_index().dropna()
    data['intercept'] = 1

    mr_data = MRData()
    mr_data.load_df(
        data,
        col_obs=dep_var,
        col_obs_se='se',
        col_covs=['intercept'],
        col_study_id='date',
    )
    intercept_model = LinearCovModel(
        'intercept',
        use_re=False,
    )
    mr_model = MRBRT(mr_data, [intercept_model])
    mr_model.fit_model()

    intercept = mr_model.beta_soln
    # Rebind rather than ``+=``: with the identity ``dep_trans_in`` the local
    # name still refers to the caller's object, and in-place addition would
    # silently modify it.
    prediction = prediction + intercept
    prediction = dep_trans_out(prediction)

    return prediction
def mrdata(seed=123):
    """Return an ``MRData`` holding ten seeded pseudo-random observations.

    The frame has a constant ``obs_se`` of 0.1, a constant covariate
    ``cov0`` of ones, a random covariate ``cov1`` and study ids drawn from
    ``{0, 1, 2}``.
    """
    np.random.seed(seed)
    num_obs = 10

    # Draw order is part of the contract for a given seed:
    # observations first, then cov1, then study ids.
    obs = np.random.randn(num_obs)
    cov1 = np.random.randn(num_obs)
    study_id = np.random.choice(range(3), num_obs)

    frame = pd.DataFrame({
        'obs': obs,
        'obs_se': np.full(num_obs, 0.1),
        'cov0': np.ones(num_obs),
        'cov1': cov1,
        'study_id': study_id,
    })

    loaded = MRData()
    loaded.load_df(
        frame,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=['cov0', 'cov1'],
        col_study_id='study_id',
    )
    return loaded
def test_data_id(df, study_id):
    """After ``_sort_by_data_id`` the stored data match the source frame."""
    col_study_id = None
    if study_id is not None:
        df['study_id'] = study_id
        col_study_id = 'study_id'

    cov_names = ['cov0', 'cov1', 'cov2']
    mrdata = MRData()
    mrdata.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=cov_names,
        col_study_id=col_study_id,
    )
    mrdata._sort_by_data_id()

    assert np.allclose(mrdata.obs, df['obs'])
    assert np.allclose(mrdata.obs_se, df['obs_se'])
    for name in cov_names:
        assert np.allclose(mrdata.covs[name], df[name])
def test_study_id(df, study_id):
    """Study bookkeeping is correct with and without a study-id column."""
    col_study_id = None
    if study_id is not None:
        df['study_id'] = study_id
        col_study_id = 'study_id'

    mrdata = MRData()
    mrdata.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=['cov0', 'cov1', 'cov2'],
        col_study_id=col_study_id,
    )

    if col_study_id is None:
        # Without a study-id column everything falls into one 'Unknown' study.
        assert np.all(mrdata.study_id == 'Unknown')
        assert mrdata.num_studies == 1
        assert mrdata.studies[0] == 'Unknown'
        return

    expected_ids = np.array([0, 0, 1, 1, 2])
    expected_studies = np.array([0, 1, 2])
    expected_sizes = np.array([2, 2, 1])

    assert np.allclose(mrdata.study_id, expected_ids)
    assert mrdata.num_studies == 3
    assert np.allclose(mrdata.studies, expected_studies)
    assert np.allclose(mrdata.study_sizes, expected_sizes)
def plot_risk_function(mrbrt, pair, beta_samples, gamma_samples,
                       alt_cov_names=None, ref_cov_names=None,
                       continuous_variables=None,
                       plot_note=None, plots_dir=None, write_file=False):
    """Plot predicted relative risk.

    The curve is evaluated on an evenly spaced exposure grid between the
    first and last spline knot of the first sub-model, with the reference
    exposure fixed at the first knot. The mean and the 2.5/97.5 percentiles
    of the fixed-effects draws are exponentiated and plotted.

    Note that ``plt.rcParams`` is modified globally and NumPy's global random
    state is used when re-filling the trimmed draws.

    Args:
        mrbrt (mrtool.MRBRT): MRBeRT object.
        pair (str): risk_outcome pair. eg. 'redmeat_colorectal'
        beta_samples (np.ndarray): Beta samples generated using `sample_soln`
            function in MRBRT
        gamma_samples (np.ndarray): Gamma samples generated using `sample_soln`
            function in MRBRT
        alt_cov_names (List[str], optional):
            Name of the alternative exposures, if `None` use `['b_0', 'b_1']`.
            Default to `None`.
        ref_cov_names (List[str], optional):
            Name of the reference exposures, if `None` use `['a_0', 'a_1']`.
            Default to `None`.
        continuous_variables (list, optional):
            List of continuous covariate names; each is set to its median in
            the fitted data. `None` (the default) means no such covariates.
        plot_note (str): The notes intended to be written on the title.
        plots_dir (str): Directory where to save the plot.
        write_file (bool): Specify `True` if the plot is expected to be saved
            on disk. If True, `plots_dir` should be specified too.

    Raises:
        AssertionError: If `write_file` is true and `plots_dir` is `None`.
    """
    # Avoid a shared mutable default argument.
    if continuous_variables is None:
        continuous_variables = []
    if alt_cov_names is None:
        alt_cov_names = ['b_0', 'b_1']
    if ref_cov_names is None:
        ref_cov_names = ['a_0', 'a_1']

    data_df = mrbrt.data.to_df()
    sub = mrbrt.sub_models[0]
    knots = sub.get_cov_model(mrbrt.ensemble_cov_model_name).spline.knots
    min_cov = knots[0]
    max_cov = knots[-1]
    dose_grid = np.linspace(min_cov, max_cov)

    # Prediction frame: every model covariate starts at zero.
    col_covs = sub.cov_names
    pred_df = pd.DataFrame(dict(zip(col_covs, np.zeros(len(col_covs)))),
                           index=np.arange(len(dose_grid)))
    pred_df['intercept'] = 1
    pred_df[alt_cov_names[0]] = dose_grid
    pred_df[alt_cov_names[1]] = dose_grid
    pred_df[ref_cov_names[0]] = knots[0]
    pred_df[ref_cov_names[1]] = knots[0]

    # if it's continuous variables, take median
    for var in continuous_variables:
        pred_df[var] = np.median(data_df[var])

    pred_data = MRData()
    pred_data.load_df(pred_df, col_covs=col_covs)

    # NOTE(review): ``y_draws`` is not used below. The call is kept because
    # it may consume NumPy random state, which would change ``patch_index``
    # -- confirm before removing.
    y_draws = mrbrt.create_draws(pred_data, beta_samples, gamma_samples,
                                 random_study=True)
    y_draws_fe = mrbrt.create_draws(pred_data, beta_samples, gamma_samples,
                                    random_study=False)

    # Drop the lowest and highest 1% of draws, ranked by their value at the
    # last grid point, then resample from the kept draws to restore the
    # original number of columns.
    num_samples = y_draws_fe.shape[1]
    sort_index = np.argsort(y_draws_fe[-1])
    num_trim = int(num_samples * 0.01)
    # An explicit stop is required: with fewer than 100 samples ``num_trim``
    # is 0 and a ``-0`` stop would select nothing at all.
    keep_index = sort_index[num_trim:num_samples - num_trim]
    trimmed_draws = y_draws_fe[:, keep_index]
    patch_index = np.random.choice(trimmed_draws.shape[1],
                                   num_samples - trimmed_draws.shape[1],
                                   replace=True)
    y_draws_fe = np.hstack((trimmed_draws, trimmed_draws[:, patch_index]))

    y_mean_fe = np.mean(y_draws_fe, axis=1)
    y_lower_fe = np.percentile(y_draws_fe, 2.5, axis=1)
    y_upper_fe = np.percentile(y_draws_fe, 97.5, axis=1)

    rr_mean = np.exp(y_mean_fe)
    rr_lower = np.exp(y_lower_fe)
    rr_upper = np.exp(y_upper_fe)

    plt.rcParams['axes.edgecolor'] = '0.15'
    plt.rcParams['axes.linewidth'] = 0.5

    plt.plot(dose_grid, rr_lower, c='gray')
    plt.plot(dose_grid, rr_upper, c='gray')
    plt.plot(dose_grid, rr_mean, c='red')
    # ``np.ptp`` instead of the ``ndarray.ptp`` method, which NumPy 2.0
    # removed.
    y_margin = np.ptp(rr_mean) * 0.1
    plt.ylim([rr_lower.min() - y_margin, rr_upper.max() + y_margin])
    plt.ylabel('RR', fontsize=10)
    plt.xlabel("Exposure", fontsize=10)

    if plot_note is not None:
        plt.title(plot_note)

    # save plot
    if write_file:
        # Raised explicitly (same exception type as the former ``assert``)
        # so the check is not stripped when Python runs with ``-O``.
        if plots_dir is None:
            raise AssertionError("plots_dir is not specified!")
        outfile = os.path.join(plots_dir, f'{pair}_risk_function.pdf')
        plt.savefig(outfile, bbox_inches='tight')
        print(f"Risk function plot saved at {outfile}")
    else:
        plt.show()
    plt.close()
def estimate_time_series(
    data: pd.DataFrame,
    spline_options: Dict,
    n_knots: int,
    dep_var: str,
    dep_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
    weight_data: pd.DataFrame = None,
    dep_var_se: str = None,
    dep_se_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
    diff: bool = False,
    num_submodels: int = 25,
    single_random_knot: bool = False,
    min_interval_days: int = 7,
    dep_trans_out: Callable[[pd.Series], pd.Series] = lambda x: x,
    split_l_interval: bool = False,
    split_r_interval: bool = False,
    verbose: bool = False,
) -> Tuple[pd.DataFrame, pd.Series, MRBeRT]:
    """Fit a spline in time to ``dep_var`` and return the smoothed series.

    Args:
        data: Observations with a ``'date'`` column and ``dep_var``. Copied
            before use. If ``dep_var`` selects more than one column, the
            frame is reshaped with ``reshape_data_long`` (defined elsewhere).
        spline_options: Extra keyword arguments forwarded to
            ``LinearCovModel``.
        n_knots: Number of base spline knots, before any interval split.
        dep_var: Name of the dependent-variable column.
        dep_trans_in: Transform applied to ``data[dep_var]`` before fitting.
        weight_data: Optional frame providing ``dep_var_se``; its ``'date'``
            column must equal that of ``data``. Without it every observation
            gets a standard error of 1.
        dep_var_se: Name of the standard-error column in ``weight_data``.
        dep_se_trans_in: Transform applied to the standard-error column.
        diff: Fit first differences of the transformed variable; the
            prediction step accumulates them again.
        num_submodels: With more than one, an ``MRBeRT`` ensemble over
            sampled knots is fitted and scored; otherwise a single ``MRBRT``.
        single_random_knot: With ``num_submodels == 1``, take the knots from
            ``get_ensemble_knots`` instead of spacing them evenly.
        min_interval_days: Minimum knot spacing in days; converted to a
            fraction of the largest ``t`` in the data.
        dep_trans_out: Transform applied to the predictions.
        split_l_interval: Add a knot at the midpoint of the first interval.
        split_r_interval: Add a knot at the midpoint of the last interval.
        verbose: Emit progress messages through ``logger``.

    Returns:
        A tuple ``(data, smooth_data, mr_model)``: the model input indexed by
        date with columns ``'y'`` and ``'se'``, the output of
        ``predict_time_series``, and the fitted model (``MRBeRT`` for an
        ensemble, ``MRBRT`` otherwise).

    Raises:
        ValueError: If the dates of ``data`` and ``weight_data`` differ, or
            if an interval split is requested together with an ensemble.
    """
    if verbose:
        logger.info('Formatting data.')
    data = data.copy()
    data[dep_var] = dep_trans_in(data[dep_var])
    if diff:
        if verbose:
            logger.info(
                'For diff model, drop day1 (i.e., if day0 is > 0, day0->day1 diff would be hugely negative).'
            )
        data[dep_var] = data[dep_var].diff()
        # Keep only differences whose own difference is defined; the
        # index-aligned assignment leaves NaN elsewhere, and those rows are
        # removed by the ``dropna`` below.
        data[dep_var] = data[dep_var][data[dep_var].diff().notnull()]

    if data[[dep_var]].shape[1] > 1:
        reshape = True
        data = reshape_data_long(data, dep_var)
        if weight_data is not None:
            weight_data = reshape_data_long(weight_data, dep_var_se)
    else:
        reshape = False

    if weight_data is not None:
        if (data['date'] != weight_data['date']).any():
            raise ValueError(
                'Dates in `data` and `weight_data` not identical.')
        data['se'] = dep_se_trans_in(weight_data[dep_var_se])
    else:
        data['se'] = 1.

    data = data.rename(columns={dep_var: 'y'})
    # day0 is taken before NaN rows are dropped, so ``t`` counts from the
    # earliest date present in the input.
    day0 = data['date'].min()
    keep_vars = ['date', 'y', 'se']
    data = data.loc[:, keep_vars]
    start_len = len(data)
    data = data.dropna()
    end_len = len(data)
    if verbose and start_len != end_len and not reshape:
        logger.debug('NAs in data')
    data['t'] = (data['date'] - day0).dt.days

    col_args = {
        'col_obs': 'y',
        'col_obs_se': 'se',
        'col_covs': ['t'],
    }

    if verbose:
        logger.info('Getting base knots.')
    # Knots are expressed as fractions of the time span.
    min_interval = min_interval_days / data['t'].max()
    if num_submodels == 1 and single_random_knot:
        spline_knots = get_ensemble_knots(n_knots, min_interval, 1)[0]
    else:
        spline_knots = np.linspace(0., 1., n_knots)

    if split_l_interval or split_r_interval:
        if num_submodels > 1:
            raise ValueError(
                'Would need to set up functionality to split segments for ensemble.'
            )
        if split_l_interval:
            n_knots += 1
            # Insert after the first knot so the knot vector stays sorted
            # (index 0 would place the midpoint before the first knot).
            spline_knots = np.insert(spline_knots, 1, spline_knots[:2].mean())
        if split_r_interval:
            n_knots += 1
            # Index -1 inserts just before the last knot.
            spline_knots = np.insert(spline_knots, -1, spline_knots[-2:].mean())

    if verbose:
        logger.info('Creating model data.')
    mr_data = MRData()
    mr_data.load_df(data, **col_args)
    spline_model = LinearCovModel('t',
                                  use_re=False,
                                  use_spline=True,
                                  use_spline_intercept=True,
                                  spline_knots=spline_knots,
                                  **spline_options)

    if num_submodels > 1:
        if verbose:
            logger.info('Sampling knots.')
        ensemble_knots = get_ensemble_knots(n_knots, min_interval, num_submodels)

        if verbose:
            logger.info('Initializing model.')
        mr_model = MRBeRT(mr_data, spline_model, ensemble_knots)
    else:
        if verbose:
            logger.info('Initializing model.')
        mr_model = MRBRT(mr_data, [spline_model])

    if verbose:
        logger.info('Fitting model.')
    mr_model.fit_model()

    if num_submodels > 1:
        if verbose:
            logger.info('Scoring submodels.')
        mr_model.score_model()

    data = data.set_index('date')[['y', 'se']]

    if verbose:
        logger.info('Making prediction.')
    smooth_data = predict_time_series(
        day0=day0,
        dep_var=dep_var,
        mr_model=mr_model,
        dep_trans_out=dep_trans_out,
        diff=diff,
    )

    return data, smooth_data, mr_model