def fix_occurrences(df):
    """Binarize occurrence columns: any non-null, nonzero count becomes 1."""
    col_dict = generate_suffix_dict(df)
    df.loc[:, col_dict['occ']] = (
        df[col_dict['occ']]
        .fillna(0).astype(bool).astype(int)
    )
    return df
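# What the binarization above does, on a toy occurrence column (a sketch,
# not part of the original pipeline):
import pandas as pd

_occ = pd.Series([float('nan'), 0.0, 3.0])
assert _occ.fillna(0).astype(bool).astype(int).tolist() == [0, 0, 1]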
def scale(train, val, test):
    """
    SCALING PROTOCOL

    All TSL are MinMaxScaled

    EMS - StandardScaled, not LogNormal because outlier events are truly outliers
    EMA - StandardScaled
    TSL - LogScaled -> StandardScaled or MinMax


    """
    train, val, test = train.copy(), val.copy(), test.copy()
    pref_dict = generate_prefix_dict(train)
    suff_dict = generate_suffix_dict(train)

    curr_ios  = list(set(pref_dict['curr']) &
                    (set(suff_dict['io']) | set(suff_dict['occ'])))
    curr_nums = list(set(pref_dict['curr']) &
                    (set(suff_dict['vitals']) | set(suff_dict['labs'])))
    stand_cols  = pref_dict['ems'] + pref_dict['ema'] + ['age_enc'] + curr_nums
    minmax_cols = pref_dict['tsl'] + curr_ios + ['time_of_day_enc']

    scaler = StandardScaler()
    train.loc[:, stand_cols] = scaler.fit_transform(train.loc[:, stand_cols])
    # Fit on train only; calling fit_transform on val/test would leak their
    # statistics into the scaling
    val.loc[:, stand_cols] = scaler.transform(val.loc[:, stand_cols])
    test.loc[:, stand_cols] = scaler.transform(test.loc[:, stand_cols])

    minmax = MinMaxScaler()
    train.loc[:, minmax_cols] = minmax.fit_transform(train.loc[:, minmax_cols])
    val.loc[:, minmax_cols] = minmax.transform(val.loc[:, minmax_cols])
    test.loc[:, minmax_cols] = minmax.transform(test.loc[:, minmax_cols])

    return train.round(5), val.round(5), test.round(5)
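# A minimal sketch of the fit-on-train / transform-elsewhere contract that
# scale() relies on; the column name here is hypothetical.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

_train = pd.DataFrame({'ema6_hr': [60.0, 80.0, 100.0]})
_val = pd.DataFrame({'ema6_hr': [70.0, 90.0]})
_sc = StandardScaler().fit(_train)  # statistics come from the training split only
assert np.allclose(
    _sc.transform(_val).ravel(),
    (_val['ema6_hr'] - _train['ema6_hr'].mean()) / _train['ema6_hr'].std(ddof=0))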
def ffill_curr(df):
    df = df.copy()
    pref_dict = generate_prefix_dict(df)
    suff_dict = generate_suffix_dict(df)
    # Carry current measurements forward indefinitely; imaging indicators
    # persist for at most 6 hours
    df.loc[:, pref_dict['curr']] = df[pref_dict['curr']].ffill()
    df.loc[:, suff_dict['img']] = df[suff_dict['img']].ffill(limit=6)

    return df
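# Forward-fill with a limit, as used for the imaging columns above: a toy
# indicator persists for at most 6 subsequent hours.
import numpy as np
import pandas as pd

_img = pd.Series([1.0] + [np.nan] * 8)
assert _img.ffill(limit=6).notna().sum() == 7  # the observation plus 6 carried hours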
def generate_fold_datasets(x, y, fold_dict, train_means_dict, scalers_dict):
    """
    Usage:
    fold_dataset_generator = generate_fold_datasets(
        x, y, fold_dict, train_means_dict, scalers_dict)
    for fold_split in fold_dataset_generator:
        x_train, x_test, y_train, y_test = fold_split
        # Do something

    Or, if you only want one fold:
    fold_dataset_generator = generate_fold_datasets(
        x, y, fold_dict, train_means_dict, scalers_dict)
    x_train, x_test, y_train, y_test = next(fold_dataset_generator)
    # Do something

    """
    print('\n>>> Applying preprocessing...')
    suff_dict = generate_suffix_dict(x)
    pref_dict = generate_prefix_dict(x)
    for i, (fold_num, mrns) in enumerate(fold_dict.items()):
        print('Fold {}'.format(i))
        start = time.time()
        xf, yf = x.copy(), y.copy()
        # Fill NA
        train_means, scalers = train_means_dict[i], scalers_dict[i]
        xf.loc[:, suff_dict['img']] = xf.loc[:, suff_dict['img']].fillna(0)
        xf = xf.fillna(value=train_means.to_dict())
        mid = time.time()
        print('Filled NA in {}s'.format(round(mid - start, 1)))

        # Apply scaler objects
        curr_ios = list(
            set(pref_dict['curr'])
            & (set(suff_dict['io']) | set(suff_dict['occ'])))
        curr_nums = list(
            set(pref_dict['curr'])
            & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
        stand_cols = pref_dict['ema'] + ['age_enc'] + curr_nums
        minmax_cols = curr_ios + ['hsa_enc']
        #, 'time_of_day_enc', 'duke_loc_enc',
        #                           'past_sbo_enc', 'raleigh_loc_enc', 'regional_loc_enc',
        #                           'hsa_enc'] + suff_dict['img']
        robust_cols = pref_dict['tsl'] + pref_dict['ems']

        standard, minmax, robust = scalers
        xf.loc[:, stand_cols] = standard.transform(xf[stand_cols])
        xf.loc[:, minmax_cols] = minmax.transform(xf[minmax_cols])
        xf.loc[:, robust_cols] = robust.transform(xf[robust_cols])

        x_train, y_train = xf.drop(mrns), yf.drop(mrns)
        x_test, y_test = xf.loc[mrns], yf.loc[mrns]
        print('Scaled in {}s'.format(round(time.time() - mid, 1)))
        yield x_train, x_test, y_train, y_test
def apply_postsum_transforms(df):
    """
    TRANSFORMATIONS
    - take log+1 of IO's
    - take log+1 of ems
    - take log+1 of tsl
    """
    pref_dict = generate_prefix_dict(df)
    suff_dict = generate_suffix_dict(df)

    log_plus_one = lambda s: np.log(s+1)
    curr_ios = list(set(pref_dict['curr']) & set(suff_dict['io']))
    df.loc[:, curr_ios] = df[curr_ios].apply(log_plus_one)
    df.loc[:, pref_dict['ems']] = df[pref_dict['ems']].apply(log_plus_one)
    df.loc[:, pref_dict['tsl']] = df[pref_dict['tsl']].apply(log_plus_one)
    return df
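# np.log1p is the numerically safer spelling of the log(1 + x) used above;
# quick equivalence check on a toy IO column:
import numpy as np
import pandas as pd

_s = pd.Series([0.0, 9.0, 99.0])
assert np.allclose(np.log(_s + 1), np.log1p(_s))  # 0, log(10), log(100)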
def cache_preprocessing_info(x, fold_dict):
    print('\n>>> Caching preprocessing...')
    suff_dict = generate_suffix_dict(x)
    pref_dict = generate_prefix_dict(x)
    train_means_dict = dict()
    scalers_dict = dict()
    for i, (fold_num, mrns) in enumerate(fold_dict.items()):
        print('Fold {}'.format(i))
        start = time.time()
        x_train = x.copy().drop(mrns)
        # Calculate train means
        x_train.loc[:,
                    suff_dict['img']] = x_train.loc[:,
                                                    suff_dict['img']].fillna(0)
        train_means = x_train.mean(axis=0)
        x_train = x_train.fillna(value=train_means.to_dict())

        # Fit Scaler objects
        standard = StandardScaler()
        minmax = MinMaxScaler()
        robust = RobustScaler()
        curr_ios = list(
            set(pref_dict['curr'])
            & (set(suff_dict['io']) | set(suff_dict['occ'])))
        curr_nums = list(
            set(pref_dict['curr'])
            & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
        stand_cols = pref_dict['ema'] + ['age_enc'] + curr_nums
        minmax_cols = curr_ios + ['hsa_enc']
        #, 'time_of_day_enc', 'duke_loc_enc',
        #                           'past_sbo_enc', 'raleigh_loc_enc', 'regional_loc_enc',
        #                           'hsa_enc'] + suff_dict['img']
        robust_cols = pref_dict['tsl'] + pref_dict['ems']

        standard.fit(x_train[stand_cols])
        minmax.fit(x_train[minmax_cols])
        robust.fit(x_train[robust_cols])

        train_means_dict[i] = train_means
        scalers_dict[i] = [standard, minmax, robust]
        print('Finished in {}s'.format(round(time.time() - start, 1)))
    return train_means_dict, scalers_dict
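# How the caching and the generator fit together (a sketch; x, y, and
# fold_dict are the feature frame, label frame, and fold -> test-MRN mapping
# used throughout this module):
def run_all_folds(x, y, fold_dict):
    train_means_dict, scalers_dict = cache_preprocessing_info(x, fold_dict)
    for x_train, x_test, y_train, y_test in generate_fold_datasets(
            x, y, fold_dict, train_means_dict, scalers_dict):
        pass  # fit and evaluate one fold here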
def scale(train, test):
    """
    Fit all scalers on train only, then apply to test.
    EMA / age / current numerics - StandardScaled
    IOs and time encodings       - MinMaxScaled, remapped to [-1, 1]
    TSL / EMS                    - RobustScaled, remapped via 2*t - 1
    """
    train, test = train.copy(), test.copy()
    pref_dict = generate_prefix_dict(train)
    suff_dict = generate_suffix_dict(train)

    #curr_ios  = list(set(pref_dict['curr']) &
    #                (set(suff_dict['io']) | set(suff_dict['occ'])))
    #curr_nums = list(set(pref_dict['curr']) &
    #                (set(suff_dict['vitals']) | set(suff_dict['labs'])))
    #stand_cols  =  pref_dict['ema'] + ['age_enc'] + curr_nums
    #minmax_cols = pref_dict['tsl'] + curr_ios + pref_dict['ems']

    curr_ios = list(
        set(pref_dict['curr'])
        & (set(suff_dict['io']) | set(suff_dict['occ'])))
    curr_nums = list(
        set(pref_dict['curr'])
        & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
    stand_cols = (
        pref_dict['ema'] + ['age_enc'] + curr_nums
        #+ ['word_log_ratio_img']
    )
    minmax_cols = curr_ios + ['time_of_day_enc', 'hsa_enc']
    robust_cols = pref_dict['tsl'] + pref_dict['ems']

    scaler = StandardScaler()
    train.loc[:, stand_cols] = scaler.fit_transform(train.loc[:, stand_cols])
    test.loc[:, stand_cols] = scaler.transform(test.loc[:, stand_cols])

    minmax = MinMaxScaler()
    train.loc[:, minmax_cols] = (
        2 * minmax.fit_transform(train.loc[:, minmax_cols]) - 1)
    test.loc[:, minmax_cols] = (
        2 * minmax.transform(test.loc[:, minmax_cols]) - 1)

    robust = RobustScaler()
    train.loc[:, robust_cols] = (
        2 * robust.fit_transform(train.loc[:, robust_cols]) - 1)
    test.loc[:, robust_cols] = (
        2 * robust.transform(test.loc[:, robust_cols]) - 1)
    return train, test
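# The 2*t - 1 above maps MinMaxScaler's [0, 1] output onto [-1, 1]; the same
# affine map is applied to the robust-scaled columns, though those are not
# bounded. Quick check on a toy column:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

_t = MinMaxScaler().fit_transform(np.array([[0.0], [5.0], [10.0]]))
assert np.allclose(2 * _t - 1, [[-1.0], [0.0], [1.0]])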
# In[3]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
from preprocessing_exp_weights import preprocess_exp_weights

(x, y, x_cols) = preprocess_exp_weights(rebuild=False,
                                        time_to_event=True,
                                        scale_feat=False,
                                        fill_null=False,
                                        custom_tag='noimg')
#img2

suff_dict = generate_suffix_dict(x)

# In[4]:

#x = x.drop(list(set(suff_dict['img'])-{'ind12_word_log_ratio_img','ind48_word_log_ratio_img'}), 1)

pref_dict = generate_prefix_dict(x)
suff_dict = generate_suffix_dict(x)
mid_dict = generate_midfix_dict(x)
#mid_dict['bp'] = mid_dict['bp_sys'] + mid_dict['bp_dia']

# In[5]:

pref_dict.keys()
suff_dict.keys()
mid_dict.keys()
def standardize_times(df):

    start = time()
    df['min_datetime'] = (
        df.datetime
        .groupby(level=[0,1])
        .transform(lambda x: x.min())
    )
    to_hour = lambda td: td.total_seconds() // 3600
    df['hour_since_adm'] = (
        (df.datetime - df.min_datetime)
        .transform(to_hour)
        .astype(int)
    )

    df['hsa_enc'] = df['hour_since_adm']

    nonsurg_cutoff = 21*24
    df['max_datetime_hour'] = (
        df.hour_since_adm
        .groupby(level=[0,1])
        .transform(lambda x: min(nonsurg_cutoff, x.max()))
    )

    df['event_hour_enc'] = (
        (df.surg_datetime_enc - df.min_datetime)
        .transform(to_hour)
        .fillna(df['max_datetime_hour'])
    )

    df['time_to_event_enc'] = (
        df.event_hour_enc - df.hour_since_adm - 1
    )

    df['time_of_day_enc'] = (
        (df.hour_since_adm + df.min_datetime.apply(lambda dt: dt.hour)) % 24
    )

    df = filter_populations(df, surg_cutoff=21*24)

    # Filter out measurements after surgery
    df = df[df.hour_since_adm < df.event_hour_enc]

    df = df.drop(['datetime', 'min_datetime',
                  'max_datetime_hour', 'surg_datetime_enc',
                  'adm_datetime_enc', 'event_hour_enc'], axis=1)

    col_dict = generate_suffix_dict(df)

    mean_columns = (
        df.reset_index()
        .loc[:, col_dict['vitals'] + col_dict['labs']
        + col_dict['img']
        + ['mrn', 'id','hour_since_adm']]
        .groupby(['mrn', 'id','hour_since_adm'])
        .agg('mean')
    )

    sum_columns = (
        df.reset_index().loc[:, col_dict['io'] + col_dict['occ'] + ['mrn', 'id','hour_since_adm']]
        .groupby(['mrn', 'id','hour_since_adm'])
        .agg('sum')
    )

    enc = (
        df.reset_index()[col_dict['enc'] + ['mrn', 'id','hour_since_adm']]
        .groupby(['mrn', 'id','hour_since_adm'])
        .agg('max')
    )

    print('Finished standardizing times in {}s'.format(round(time()-start)))
    res = pd.concat([enc, mean_columns, sum_columns], axis=1)
    return res
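# The hourly grid above resolves collisions per (mrn, id, hour_since_adm):
# vitals/labs are averaged, IO/occurrence counts are summed, and encounter
# fields take the max. A toy check of the floor-to-hour bucketing in to_hour:
import pandas as pd

_td = pd.Timestamp('2020-01-01 03:59') - pd.Timestamp('2020-01-01 00:00')
assert _td.total_seconds() // 3600 == 3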
def summary_stats(df):

    col_dict = generate_suffix_dict(df)

    max_hour = df.reset_index().hour_since_adm.max()

    df_copy = df.copy()
    curr = df_copy[col_dict['vitals'] + col_dict['labs'] +
                   col_dict['io'] + col_dict['occ']].add_prefix('curr_')
    enc = df_copy[col_dict['enc']]

    df = df.reset_index(level=[0,1], drop=True).reindex(np.arange(max_hour + 1))

    # --- Optimized tsl ---
    io_df  = df[col_dict['io'] + col_dict['occ']].copy()   # copies: overwritten in place below
    num_df = df[col_dict['vitals'] + col_dict['labs']].copy()
    io_nan_mask = (io_df.fillna(0) == 0).astype(int).values
    num_nan_mask = num_df.isna().astype(int).values
    io_tsl_arr  = np.zeros((io_df.shape[0], io_df.shape[1]))
    num_tsl_arr = np.zeros((num_df.shape[0], num_df.shape[1]))
    for i in range(io_df.shape[0]):
        io_tsl_arr[i,:]  = (1 + io_tsl_arr[i-1,:])*io_nan_mask[i,:]
        num_tsl_arr[i,:] = (1 + num_tsl_arr[i-1,:])*num_nan_mask[i,:]

    io_df.loc[:,:] = io_tsl_arr
    num_df.loc[:,:] = num_tsl_arr
    # ---

    ems = pd.concat(
        (expsum(df[col_dict['io'] + col_dict['occ']].fillna(0), zerolifes = [z])
            for z in [6,24,72]),
        axis=1)

    # Take diff between curr and ema
    ema_vitals = pd.concat(
        (df[col_dict['vitals']].values -
         df[col_dict['vitals']].ewm(halflife=halflife).mean().add_prefix(
             'ema{}_'.format(halflife))
            for halflife in [6, 24, 72]),
        axis=1)

    ema_labs = pd.concat(
        (df[col_dict['labs']].values -
            df[col_dict['labs']].ewm(halflife=halflife).mean().add_prefix(
                'ema{}_'.format(halflife))
            for halflife in [12, 48, 144]),
        axis=1)


    ffill_img = pd.concat(
         (df[col_dict['img']].ffill(limit=lim).add_prefix(
             'ind{}_'.format(lim))
             for lim in [24, 504]),
         axis=1).fillna(0)

    merged_df = pd.merge(
        pd.concat([enc, curr],axis=1),
        #curr,
        pd.concat([ema_vitals, ema_labs, ems,
                   num_df.add_prefix('tsl_'),
                   io_df.add_prefix('tsl_'),
                   ffill_img
                   ], axis=1),
        left_index=True, right_index=True, how='left')
    return merged_df
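# The tsl recurrence above, in one dimension: tsl[i] = (1 + tsl[i-1]) * mask[i],
# where mask is 1 while a value is missing, so the counter grows until the next
# observation resets it (row -1 wraps to the zero-initialized last row, exactly
# as in the loop above). Toy check:
import numpy as np

_mask = np.array([1, 1, 0, 1, 1, 1])  # 1 = no observation this hour
_tsl = np.zeros(len(_mask))
for _i in range(len(_mask)):
    _tsl[_i] = (1 + _tsl[_i - 1]) * _mask[_i]
assert _tsl.tolist() == [1, 2, 0, 1, 2, 3]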
def summary_stats(df):
    """
    Calculate summary stats for each measurement
    of patients.
    """

    col_dict = generate_suffix_dict(df)
    #df.iloc[:2,:] = df.iloc[:2,:].fillna(0)

    enc = df[col_dict['enc']].tail(1).squeeze()

    # Custom agg functions
    def slope_numeric(s):
        s = s.reset_index(level=[0, 1], drop=True)
        slope, r_value, std_err = linear_model(s)
        return slope

    def slope_rolling_sum(s):
        s = s.reset_index(level=[0, 1], drop=True).reindex(range(48))
        s_rolling = s.rolling(6, min_periods=0).sum()
        slope, r_value, std_err = linear_model(s_rolling)
        return slope

    def last(s):
        s = s.dropna()
        if len(s) == 0:
            return np.nan
        return s.iloc[-1]

    def tsl(s):
        s = s.dropna()
        if len(s) == 0:
            return 48
        return 48 - max(s.reset_index().hour_since_adm)

    def tsl_occ(s):
        # Hours since the last nonzero occurrence within the 48h window.
        # (The original self-comparison `max(nonzero) == max(nonzero)` was
        # always true, so its else branch was dead code.)
        s = s.reset_index(level=[0, 1], drop=True).dropna()
        if len(s) == 0:
            return 48
        nonzero = s[s > 0].index
        if len(nonzero) == 0:
            return 48
        return 48 - max(nonzero)

    numeric_stats = [
        last, tsl, slope_numeric, 'mean', 'std', 'min', 'max', 'skew'
    ]
    io_stats = [last, tsl_occ, slope_rolling_sum, 'sum']
    occ_stats = [tsl_occ, slope_rolling_sum, 'sum']

    numeric_summ = group_stats(df[col_dict['vitals'] + col_dict['labs']],
                               numeric_stats)
    io_summ = group_stats(df[col_dict['io']], io_stats)
    occ_summ = group_stats(df[col_dict['occ']], occ_stats)

    return pd.concat([enc, numeric_summ, io_summ, occ_summ])
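# What the custom aggregators above compute, mirrored on a hypothetical
# 48-hour window (the nested defs are not importable, so the logic is inlined):
import numpy as np
import pandas as pd

_s = pd.Series([np.nan, 7.2, np.nan, 7.4],
               index=pd.Index([0, 1, 2, 3], name='hour_since_adm'))
_last = _s.dropna().iloc[-1]                                # most recent value
_tsl = 48 - _s.dropna().reset_index().hour_since_adm.max()  # hours since it
assert (_last, _tsl) == (7.4, 45)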
# In[132]:

enc[enc.sbo_poa].reset_index().mrn.nunique()

# In[133]:

454 / 3405  # fraction of patients with SBO present on admission (count from the cell above)

# # Variable missingness

# In[244]:

sbo_presum = pd.read_pickle(
    'data/processed/sbo_exp_presumm_nan_ffill_unscaled_noimg.pickle')
suff_dict = generate_suffix_dict(sbo_presum)

# In[45]:

sbo.index.get_level_values(1).unique()

# In[55]:

from utils import generate_prefix_dict

# In[56]:

pref_dict = generate_prefix_dict(sbo)

# In[18]:
def main(x, y, run_inner_fold=True, compute_perm_imp=True):
    #444
    np.random.seed(449)
    time1 = time.time()
    x, y = x.copy(), y.copy()

    enc = (
        y.reset_index(level=2, drop=True)
        .reset_index()
        .drop_duplicates()
        .set_index(['mrn', 'id'])
    )

    # TODO: figure out if this is a problem

    # Censor non-surgical encounters at the end of follow-up, and nudge zero
    # event times off zero (Cox fitting needs strictly positive times)
    y['hsa'] = y.index.get_level_values(2)
    y.loc[y.any_sbo_surg_enc == 0,
          'time_to_event_enc'] = (y.time_to_event_enc.max() -
                                  y.loc[y.any_sbo_surg_enc == 0, 'hsa'])
    y = y.drop('hsa', axis=1)
    y.loc[y['time_to_event_enc'] == 0, 'time_to_event_enc'] = 0.01

    num_folds, num_lambdas, num_alphas = 5, 20, 3
    lambdas = np.array([np.exp(v) for v in np.linspace(-4, 6, num_lambdas)])
    #alphas = np.array([v**2 for v in np.linspace(0,1,num_alphas)]).round(3)
    # No explicit zero needed: rounding the smallest alpha already yields 0
    alphas = np.array([0.1**v
                       for v in np.linspace(0, 4, num_alphas)[::-1]]).round(3)
    #alphas = np.array([0])
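    # Concretely: lambdas are log-spaced over [e^-4, e^6]; alphas are
    # decade-spaced in (0, 1], e.g. num_alphas=3 gives [0.0001, 0.01, 1.0],
    # which rounds to [0.0, 0.01, 1.0].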

    print('\n>>> Generating {} group stratified folds...'.format(num_folds))
    group_fold_dict = generate_fold_dict(enc, num_folds, 0.01)
    inner_group_fold_dict = group_fold_dict.copy()
    del inner_group_fold_dict[num_folds - 1]

    x_inner = x.drop(group_fold_dict[num_folds - 1])
    y_inner = y.drop(group_fold_dict[num_folds - 1])
    time2 = time.time()
    print('Finished in {}s'.format(round(time2 - time1, 1)))

    if run_inner_fold:
        print('\n>>> Running inner CV with {} folds...'.format(num_folds - 1))
        inner_perf_arr, inner_betas_arr = inner_cv_cox(
            x=x_inner,
            y=y_inner,
            lambdas=lambdas,
            alphas=alphas,
            fold_dict=inner_group_fold_dict)
        time3 = time.time()
        print('Fit finished in {}s'.format(round(time3 - time2, 1)))

        lambda_opt, alpha_opt = get_best_hparams(inner_perf_arr, lambdas,
                                                 alphas)
    else:
        time3 = time.time()
        inner_perf_arr, inner_betas_arr = None, None
        lambda_opt, alpha_opt = 0.25, 0.0
        #lambda_opt, alpha_opt = 1e-3, 0.0

    print(
        '\n>>> Running outer CV with {} folds, \nlambda* = {}, alpha* = {}...'.
        format(num_folds, lambda_opt, alpha_opt))
    # Cache preprocessing
    train_means_dict, scalers_dict = cache_preprocessing_info(
        x, group_fold_dict)
    fold_generator = generate_fold_datasets(x, y, group_fold_dict,
                                            train_means_dict, scalers_dict)
    betas_arr, perf_arr = outer_cv_cox(x, y, lambda_opt, alpha_opt,
                                       group_fold_dict, fold_generator)
    time4 = time.time()
    print('Fit finished in {}s'.format(round(time4 - time3, 1)))

    if compute_perm_imp:
        print('\n>>> Computing permutation importance...')
        # x_train is not defined at this point in main; derive the column
        # groupings from the full design matrix instead
        pref_dict = generate_prefix_dict(x)
        suff_dict = generate_suffix_dict(x)
        mid_dict = generate_midfix_dict(x)
        mid_dict['bp'] = mid_dict['bp_sys'] + mid_dict['bp_dia']
        #perm_group_dict = {col:[col] for col in x_samp_full.columns}
        fold_generator = generate_fold_datasets(x, y, group_fold_dict,
                                                train_means_dict, scalers_dict)
        perm_group_dict = mid_dict
        perm_imp_df = permutation_importance_cv(x, y, betas_arr,
                                                group_fold_dict,
                                                fold_generator,
                                                perm_group_dict)

    else:
        perm_imp_df = None
    time5 = time.time()
    print('Fit finished in {}s'.format(round(time5 - time4, 1)))

    print('\n>>> Make predictions...')
    fold_generator = generate_fold_datasets(x, y, group_fold_dict,
                                            train_means_dict, scalers_dict)
    y_train_pred, y_test_pred = make_predictions(x, y, betas_arr,
                                                 group_fold_dict,
                                                 fold_generator)
    time6 = time.time()
    print('Predictions finished in {}s'.format(round(time6 - time5, 1)))
    """
    epsilon, lambdas, alphas, num_folds

    generate_fold_dict -> group_fold_dict
    inner_cv_cox -> inner_perf_arr, inner_betas_arr
    outer_cv_cox -> betas_arr, perf_arr
    permutation_importance_cv -> perm_imp_df
    make_predictions -> y_train_pred, y_test_pred
    """
    betas_df = pd.DataFrame(betas_arr, columns=x.columns)

    result_list = [
        lambdas, alphas, lambda_opt, alpha_opt, group_fold_dict,
        inner_perf_arr, inner_betas_arr, betas_df, perf_arr, perm_imp_df,
        y_train_pred, y_test_pred, train_means_dict, scalers_dict
    ]
    return result_list