示例#1
0
def test_remove_nan_in_covs(df, covs):
    """Loading NaN covariates must warn and leave one fewer observation."""
    # Label-based slice: every row up to and including index label 0 gets NaN
    # in the covariate columns.
    df.loc[:0, covs] = np.nan
    expected_num_obs = df.shape[0] - 1

    mr_data = MRData()
    with pytest.warns(Warning):
        mr_data.load_df(df, col_covs=covs, col_obs='obs', col_obs_se='obs_se')

    assert mr_data.num_obs == expected_num_obs
示例#2
0
def test_covs(df, covs):
    """``num_covs`` must equal the number of requested covariates plus one."""
    mr_data = MRData()
    mr_data.load_df(df, col_covs=covs, col_obs='obs', col_obs_se='obs_se')

    # One covariate is expected on top of the requested ones
    # (presumably an automatically added intercept -- confirm in MRData).
    requested = len(covs) if covs is not None else 0
    assert mr_data.num_covs == requested + 1
示例#3
0
def predict_time_series(
    day0: pd.Timestamp,
    dep_var: str,
    mr_model: MRBRT,
    dep_trans_out: Callable[[pd.Series], pd.Series],
    diff: bool,
) -> pd.Series:
    """Predict the fitted model on a daily grid and index the result by date.

    Args:
        day0: Date corresponding to ``t == 0``.
        dep_var: Name given to the returned series.
        mr_model: Fitted model; its ``data`` must contain a ``'t'`` covariate.
        dep_trans_out: Transformation applied to the predictions before they
            are returned.
        diff: If True, the predictions are cumulatively summed before
            ``dep_trans_out`` is applied.

    Returns:
        Series named ``dep_var``, indexed by ``'date'``, with one entry for
        each integer ``t`` from 0 through the largest ``t`` in the model data.
    """
    fit_data = mr_model.data.to_df()

    # Predict on every integer day from 0 through the last fitted ``t``.
    t = np.arange(0, fit_data['t'].max() + 1)
    grid = MRData()
    grid.load_df(pd.DataFrame({'t': t}), col_covs='t')

    pred_value = mr_model.predict(grid)
    if diff:
        pred_value = pred_value.cumsum()
    pred_value = dep_trans_out(pred_value)

    pred = pd.DataFrame({
        't': t,
        dep_var: pred_value,
    })
    # Vectorised date arithmetic instead of a per-row ``apply``.
    pred['date'] = day0 + pd.to_timedelta(pred['t'], unit='D')

    return pred.set_index('date')[dep_var]
示例#4
0
def test_assert_has_covs(df):
    """Asserting on a covariate that was not loaded raises ``ValueError``."""
    loaded_covs = ['cov0', 'cov1', 'cov2']
    mr_data = MRData()
    mr_data.load_df(df, col_obs='obs', col_obs_se='obs_se', col_covs=loaded_covs)

    with pytest.raises(ValueError):
        mr_data._assert_has_covs('cov3')
示例#5
0
def data(df):
    """Build an ``MRData`` from ``df`` with three covariates and a study id.

    Note that a ``study_id`` column is written onto the passed-in ``df``.
    """
    df['study_id'] = np.array([0, 0, 1, 1, 2])
    cov_names = [f'cov{i}' for i in range(3)]

    mr_data = MRData()
    mr_data.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=cov_names,
        col_study_id='study_id',
    )
    return mr_data
示例#6
0
def test_normalize_covs(df, covs):
    """After ``normalize_covs`` the same covariates report as normalized."""
    load_kwargs = {
        'col_obs': 'obs',
        'col_obs_se': 'obs_se',
        'col_covs': ['cov0', 'cov1', 'cov2'],
    }
    mr_data = MRData()
    mr_data.load_df(df, **load_kwargs)

    mr_data.normalize_covs(covs)
    assert mr_data.is_cov_normalized(covs)
示例#7
0
def test_has_covs(df):
    """``has_covs`` is true for loaded covariates and false for unknown ones."""
    mr_data = MRData()
    mr_data.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=['cov0', 'cov1', 'cov2'],
    )

    # Loaded names, alone or combined, must be found.
    for present in (['cov0'], ['cov0', 'cov1']):
        assert mr_data.has_covs(present)
    # A name that was never loaded must not be found.
    assert not mr_data.has_covs(['cov3'])
示例#8
0
def test_obs(df, obs, obs_se):
    """``obs``/``obs_se`` have one entry per row; missing obs load as NaN."""
    num_rows = df.shape[0]

    mr_data = MRData()
    mr_data.load_df(
        df,
        col_obs=obs,
        col_obs_se=obs_se,
        col_covs=['cov0', 'cov1', 'cov2'],
    )

    assert mr_data.obs.size == num_rows
    assert mr_data.obs_se.size == num_rows
    # Without an observation column every observation must be NaN.
    if obs is None:
        assert np.isnan(mr_data.obs).all()
示例#9
0
def test_is_empty(df):
    """``is_empty`` is true before loading and after ``reset``, false between."""
    mr_data = MRData()
    assert mr_data.is_empty()

    load_kwargs = {
        'col_obs': 'obs',
        'col_obs_se': 'obs_se',
        'col_covs': ['cov0', 'cov1', 'cov2'],
    }
    mr_data.load_df(df, **load_kwargs)
    assert not mr_data.is_empty()

    mr_data.reset()
    assert mr_data.is_empty()
示例#10
0
def test_get_covs(df):
    """``get_covs`` returns a column for one name and a matrix for a list."""
    cov_names = ('cov0', 'cov1', 'cov2')

    mr_data = MRData()
    mr_data.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=list(cov_names),
    )

    # A single name is compared against that column reshaped to (n, 1).
    for name in cov_names:
        expected_column = df[name].to_numpy()[:, None]
        assert np.allclose(mr_data.get_covs(name), expected_column)

    # A list of names is compared against the matching columns of ``df``.
    expected_matrix = df[list(cov_names)].to_numpy()
    assert np.allclose(mr_data.get_covs(list(cov_names)), expected_matrix)
示例#11
0
def model_intercept(data: pd.DataFrame,
                    dep_var: str,
                    prediction: pd.Series,
                    weight_data: pd.DataFrame = None,
                    dep_var_se: str = None,
                    dep_trans_in: Callable[[pd.Series],
                                           pd.Series] = lambda x: x,
                    dep_se_trans_in: Callable[[pd.Series],
                                              pd.Series] = lambda x: x,
                    dep_trans_out: Callable[[pd.Series],
                                            pd.Series] = lambda x: x,
                    verbose: bool = True):
    """Shift ``prediction`` by an intercept fitted to the residuals.

    The residuals ``data[dep_var] - prediction`` (both passed through
    ``dep_trans_in``) are fitted with an intercept-only model without random
    effects. The fitted intercept is added to the transformed prediction,
    which is then passed through ``dep_trans_out``.

    Args:
        data: Observations; a copy is made, so the input is not modified.
        dep_var: Name of the dependent variable in ``data``.
        prediction: Prediction that is subtracted from the data after both
            are indexed/aligned by date. It is not modified.
        weight_data: Optional frame that provides standard errors. Its dates
            must be identical to those of ``data`` after reshaping.
        dep_var_se: Name of the standard error variable in ``weight_data``.
        dep_trans_in: Transformation applied to the data and the prediction
            before fitting.
        dep_se_trans_in: Transformation applied to the standard errors.
        dep_trans_out: Transformation applied to the shifted prediction.
        verbose: Currently unused by this function.

    Returns:
        The transformed, intercept-shifted prediction.

    Raises:
        ValueError: If the dates of ``data`` and ``weight_data`` differ.
    """
    data = data.copy()
    data[dep_var] = dep_trans_in(data[dep_var])
    prediction = dep_trans_in(prediction)
    data = reshape_data_long(data, dep_var)
    if weight_data is not None:
        weight_data = reshape_data_long(weight_data, dep_var_se)
        if (data['date'] != weight_data['date']).any():
            raise ValueError(
                'Dates in `data` and `weight_data` not identical.')
        data['se'] = dep_se_trans_in(weight_data[dep_var_se])
    else:
        # Without weights every observation gets the same standard error.
        data['se'] = 1.

    # Residuals of the data around the prediction, aligned on the date index.
    data = data.set_index('date').sort_index()
    data[dep_var] = data[dep_var] - prediction
    data = data.reset_index().dropna()
    data['intercept'] = 1

    mr_data = MRData()
    mr_data.load_df(
        data,
        col_obs=dep_var,
        col_obs_se='se',
        col_covs=['intercept'],
        col_study_id='date',
    )
    intercept_model = LinearCovModel(
        'intercept',
        use_re=False,
    )
    mr_model = MRBRT(mr_data, [intercept_model])
    mr_model.fit_model()

    intercept = mr_model.beta_soln

    # Out-of-place addition: with the default identity ``dep_trans_in`` the
    # local ``prediction`` is the caller's own Series, and ``+=`` would
    # silently modify it.
    prediction = prediction + intercept

    return dep_trans_out(prediction)
示例#12
0
def mrdata(seed=123):
    """Return an ``MRData`` with 10 seeded random observations.

    The frame has columns ``obs``, ``obs_se`` (constant 0.1), ``cov0`` (all
    ones), ``cov1`` and ``study_id`` (values drawn from 0, 1, 2).
    """
    np.random.seed(seed)
    num_obs = 10

    # The draw order (obs, cov1, study_id) determines the generated values
    # for a given seed, so it must not be rearranged.
    obs = np.random.randn(num_obs)
    cov1 = np.random.randn(num_obs)
    study_id = np.random.choice(range(3), num_obs)

    frame = pd.DataFrame({
        'obs': obs,
        'obs_se': np.full(num_obs, 0.1),
        'cov0': np.ones(num_obs),
        'cov1': cov1,
        'study_id': study_id,
    })

    result = MRData()
    result.load_df(
        frame,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=['cov0', 'cov1'],
        col_study_id='study_id',
    )
    return result
示例#13
0
def test_data_id(df, study_id):
    """After ``_sort_by_data_id`` obs, obs_se and covariates still match ``df``."""
    col_study_id = None
    if study_id is not None:
        df['study_id'] = study_id
        col_study_id = 'study_id'

    mr_data = MRData()
    mr_data.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=['cov0', 'cov1', 'cov2'],
        col_study_id=col_study_id,
    )
    mr_data._sort_by_data_id()

    assert np.allclose(mr_data.obs, df['obs'])
    assert np.allclose(mr_data.obs_se, df['obs_se'])
    for cov_name in (f'cov{i}' for i in range(3)):
        assert np.allclose(mr_data.covs[cov_name], df[cov_name])
示例#14
0
def test_study_id(df, study_id):
    """Study bookkeeping is correct with and without a study id column."""
    has_study_id = study_id is not None
    if has_study_id:
        df['study_id'] = study_id
    col_study_id = 'study_id' if has_study_id else None

    mr_data = MRData()
    mr_data.load_df(
        df,
        col_obs='obs',
        col_obs_se='obs_se',
        col_covs=['cov0', 'cov1', 'cov2'],
        col_study_id=col_study_id,
    )

    if has_study_id:
        # Three studies with sizes 2, 2 and 1 are expected.
        assert np.allclose(mr_data.study_id, np.array([0, 0, 1, 1, 2]))
        assert mr_data.num_studies == 3
        assert np.allclose(mr_data.studies, np.array([0, 1, 2]))
        assert np.allclose(mr_data.study_sizes, np.array([2, 2, 1]))
    else:
        # Without a study id column everything belongs to one 'Unknown' study.
        assert np.all(mr_data.study_id == 'Unknown')
        assert mr_data.num_studies == 1
        assert mr_data.studies[0] == 'Unknown'
示例#15
0
文件: plots.py 项目: rmbarber/mrtool
def plot_risk_function(mrbrt, pair, beta_samples, gamma_samples, alt_cov_names=None,
    ref_cov_names=None, continuous_variables=None, plot_note=None, plots_dir=None,
    write_file=False):
    """Plot predicted relative risk.
    Args:
        mrbrt (mrtool.MRBRT):
            MRBeRT object.
        pair (str):
            risk_outcome pair. eg. 'redmeat_colorectal'
        beta_samples (np.ndarray):
            Beta samples generated using `sample_soln` function in MRBRT
        gamma_samples (np.ndarray):
            Gamma samples generated using `sample_soln` function in MRBRT
        alt_cov_names (List[str], optional):
            Name of the alternative exposures, if `None` use `['b_0', 'b_1']`.
            Default to `None`.
        ref_cov_names (List[str], optional):
            Name of the reference exposures, if `None` use `['a_0', 'a_1']`.
            Default to `None`.
        continuous_variables (list, optional):
            List of continuous covariate names; each is fixed at its median
            in the data. `None` (the default) means no such covariates.
        plot_note (str):
            The notes intended to be written on the title.
        plots_dir (str):
            Directory where to save the plot.
        write_file (bool):
            Specify `True` if the plot is expected to be saved on disk.
            If True, `plots_dir` should be specified too.
    """
    # ``None`` replaces the former mutable ``[]`` default.
    continuous_variables = [] if continuous_variables is None else continuous_variables

    data_df = mrbrt.data.to_df()
    sub = mrbrt.sub_models[0]
    knots = sub.get_cov_model(mrbrt.ensemble_cov_model_name).spline.knots
    min_cov = knots[0]
    max_cov = knots[-1]
    dose_grid = np.linspace(min_cov, max_cov)
    col_covs = sub.cov_names
    # Start with every covariate at zero, one row per dose grid point.
    pred_df = pd.DataFrame(dict(zip(col_covs, np.zeros(len(col_covs)))),
                           index=np.arange(len(dose_grid)))

    alt_cov_names = ['b_0', 'b_1'] if alt_cov_names is None else alt_cov_names
    ref_cov_names = ['a_0', 'a_1'] if ref_cov_names is None else ref_cov_names
    pred_df['intercept'] = 1
    # Alternative exposure sweeps the grid; reference stays at the first knot.
    pred_df[alt_cov_names[0]] = dose_grid
    pred_df[alt_cov_names[1]] = dose_grid
    pred_df[ref_cov_names[0]] = knots[0]
    pred_df[ref_cov_names[1]] = knots[0]

    # if it's continuous variables, take median
    for var in continuous_variables:
        pred_df[var] = np.median(data_df[var])

    pred_data = MRData()
    pred_data.load_df(pred_df, col_covs=col_covs)

    # NOTE(review): the result of this call is not used. The call is kept
    # because it may advance NumPy's global random state before the
    # resampling below -- confirm, and drop it if that is not needed.
    mrbrt.create_draws(pred_data, beta_samples, gamma_samples, random_study=True)
    y_draws_fe = mrbrt.create_draws(pred_data, beta_samples, gamma_samples, random_study=False)

    # Trim the 1% lowest and highest draws, ranked by the value at the last
    # grid point, then resample from the kept draws to restore the count.
    num_samples = y_draws_fe.shape[1]
    sort_index = np.argsort(y_draws_fe[-1])
    num_trim = int(num_samples*0.01)
    # An explicit end index is required: with fewer than 100 samples
    # ``num_trim`` is 0 and a ``[0:-0]`` slice would select nothing.
    trimmed_draws = y_draws_fe[:, sort_index[num_trim:num_samples - num_trim]]
    patch_index = np.random.choice(trimmed_draws.shape[1],
        num_samples - trimmed_draws.shape[1], replace=True)
    y_draws_fe = np.hstack((trimmed_draws, trimmed_draws[:, patch_index]))

    y_mean_fe = np.mean(y_draws_fe, axis=1)
    y_lower_fe = np.percentile(y_draws_fe, 2.5, axis=1)
    y_upper_fe = np.percentile(y_draws_fe, 97.5, axis=1)

    rr_mean = np.exp(y_mean_fe)
    rr_lower = np.exp(y_lower_fe)
    rr_upper = np.exp(y_upper_fe)

    plt.rcParams['axes.edgecolor'] = '0.15'
    plt.rcParams['axes.linewidth'] = 0.5

    plt.plot(dose_grid, rr_lower, c='gray')
    plt.plot(dose_grid, rr_upper, c='gray')
    plt.plot(dose_grid, rr_mean, c='red')
    # ``np.ptp`` instead of the ``ndarray.ptp`` method, which NumPy 2.0 removed.
    margin = np.ptp(rr_mean)*0.1
    plt.ylim([rr_lower.min() - margin,
              rr_upper.max() + margin])
    plt.ylabel('RR', fontsize=10)
    plt.xlabel("Exposure", fontsize=10)

    if plot_note is not None:
        plt.title(plot_note)

    # save plot
    if write_file:
        assert plots_dir is not None, "plots_dir is not specified!"
        outfile = os.path.join(plots_dir, f'{pair}_risk_function.pdf')
        plt.savefig(outfile, bbox_inches='tight')
        print(f"Risk function plot saved at {outfile}")
    else:
        plt.show()
    plt.close()
示例#16
0
def estimate_time_series(
    data: pd.DataFrame,
    spline_options: Dict,
    n_knots: int,
    dep_var: str,
    dep_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
    weight_data: pd.DataFrame = None,
    dep_var_se: str = None,
    dep_se_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
    diff: bool = False,
    num_submodels: int = 25,
    single_random_knot: bool = False,
    min_interval_days: int = 7,
    dep_trans_out: Callable[[pd.Series], pd.Series] = lambda x: x,
    split_l_interval: bool = False,
    split_r_interval: bool = False,
    verbose: bool = False,
) -> Tuple[pd.DataFrame, pd.Series, MRBeRT]:
    """Fit a spline-in-time model to ``dep_var`` and predict a smooth series.

    Args:
        data: Frame with a ``'date'`` column and the ``dep_var`` column(s);
            a copy is made, so the input is not modified.
        spline_options: Extra keyword arguments forwarded to
            ``LinearCovModel``.
        n_knots: Number of spline knots (before any interval splitting).
        dep_var: Name of the dependent variable.
        dep_trans_in: Transformation applied to ``dep_var`` before fitting.
        weight_data: Optional frame that provides standard errors; its dates
            must be identical to those of ``data``.
        dep_var_se: Name of the standard error variable in ``weight_data``.
        dep_se_trans_in: Transformation applied to the standard errors.
        diff: If True, the model is fitted to day-over-day differences and
            the prediction is cumulatively summed.
        num_submodels: Number of knot sets; more than one fits an ensemble
            (``MRBeRT``), otherwise a single ``MRBRT`` model is fitted.
        single_random_knot: With ``num_submodels == 1``, take the knots from
            ``get_ensemble_knots`` instead of an even grid.
        min_interval_days: Minimum knot spacing in days; it is divided by the
            largest ``t`` before being passed to ``get_ensemble_knots``.
        dep_trans_out: Transformation applied to the predictions.
        split_l_interval: Add a knot at the midpoint of the first interval.
        split_r_interval: Add a knot at the midpoint of the last interval.
        verbose: If True, log progress messages.

    Returns:
        Tuple of the model input data (columns ``'y'`` and ``'se'``, indexed
        by date), the smoothed prediction, and the fitted model.

    Raises:
        ValueError: If the dates of ``data`` and ``weight_data`` differ, or
            if interval splitting is requested with ``num_submodels > 1``.
    """
    if verbose:
        logger.info('Formatting data.')
    data = data.copy()
    data[dep_var] = dep_trans_in(data[dep_var])
    if diff:
        if verbose:
            logger.info(
                'For diff model, drop day1 (i.e., if day0 is > 0, day0->day1 diff would be hugely negative).'
            )
        data[dep_var] = data[dep_var].diff()
        # Keep only entries whose own successive difference is defined; the
        # others become NaN through index alignment and are removed by the
        # ``dropna`` below.
        data[dep_var] = data[dep_var][data[dep_var].diff().notnull()]
    if data[[dep_var]].shape[1] > 1:
        # More than one column carries the ``dep_var`` label.
        reshape = True
        data = reshape_data_long(data, dep_var)
        if weight_data is not None:
            weight_data = reshape_data_long(weight_data, dep_var_se)
    else:
        reshape = False
    if weight_data is not None:
        if (data['date'] != weight_data['date']).any():
            raise ValueError(
                'Dates in `data` and `weight_data` not identical.')
        data['se'] = dep_se_trans_in(weight_data[dep_var_se])
    else:
        # Without weights every observation gets the same standard error.
        data['se'] = 1.
    data = data.rename(columns={dep_var: 'y'})
    day0 = data['date'].min()
    keep_vars = ['date', 'y', 'se']
    data = data.loc[:, keep_vars]
    start_len = len(data)
    data = data.dropna()
    end_len = len(data)
    if start_len != end_len and not reshape:
        if verbose:
            logger.debug('NAs in data')
    # Time covariate: whole days elapsed since the earliest date.
    data['t'] = (data['date'] - day0).dt.days

    col_args = {
        'col_obs': 'y',
        'col_obs_se': 'se',
        'col_covs': ['t'],
    }
    if verbose:
        logger.info('Getting base knots.')
    min_interval = min_interval_days / data['t'].max()
    if num_submodels == 1 and single_random_knot:
        spline_knots = get_ensemble_knots(n_knots, min_interval, 1)[0]
    else:
        spline_knots = np.linspace(0., 1., n_knots)

    if split_l_interval or split_r_interval:
        if num_submodels > 1:
            raise ValueError(
                'Would need to set up functionality to split segments for ensemble.'
            )
        if split_l_interval:
            n_knots += 1
            # The midpoint of the first interval goes between the first two
            # knots (index 1). Inserting at index 0 would place it before
            # the first knot and leave the knots unsorted.
            spline_knots = np.insert(spline_knots, 1, spline_knots[:2].mean())
        if split_r_interval:
            n_knots += 1
            # Index -1 inserts before the last knot, i.e. between the last two.
            spline_knots = np.insert(spline_knots, -1,
                                     spline_knots[-2:].mean())

    if verbose:
        logger.info('Creating model data.')
    mr_data = MRData()
    mr_data.load_df(data, **col_args)
    spline_model = LinearCovModel('t',
                                  use_re=False,
                                  use_spline=True,
                                  use_spline_intercept=True,
                                  spline_knots=spline_knots,
                                  **spline_options)
    if num_submodels > 1:
        if verbose:
            logger.info('Sampling knots.')
        ensemble_knots = get_ensemble_knots(n_knots, min_interval,
                                            num_submodels)

        if verbose:
            logger.info('Initializing model.')
        mr_model = MRBeRT(mr_data, spline_model, ensemble_knots)
    else:
        if verbose:
            logger.info('Initializing model.')
        mr_model = MRBRT(mr_data, [spline_model])

    if verbose:
        logger.info('Fitting model.')
    mr_model.fit_model()

    if num_submodels > 1:
        if verbose:
            logger.info('Scoring submodels.')
        mr_model.score_model()

    data = data.set_index('date')[['y', 'se']]

    if verbose:
        logger.info('Making prediction.')
    smooth_data = predict_time_series(
        day0=day0,
        dep_var=dep_var,
        mr_model=mr_model,
        dep_trans_out=dep_trans_out,
        diff=diff,
    )

    return data, smooth_data, mr_model