def bucket_data(self):
    print("bucketing data (method from mmse class)")
    Dataset.bucket_data(
        self,
        additional_cols_to_keep=gutils.to_list(self.index_to_pivot) + gutils.to_list(self.cols_to_pivot),
        timestamp_cols=['score_date'])

def __init__(self,
             health_numeric_cols=None,
             cols_to_pivot=None,
             index_to_pivot=None,
             index_to_pivot_baseline=None,
             agg_funcs=None,
             timestamp_cols=None,
             **kwargs):
    """
    :param health_numeric_cols: columns containing health numeric data (for report generation)
    :param cols_to_pivot: groups to use (needs to be a list, at the moment maximum of 2 groups allowed)
    :param index_to_pivot: variables to generate stats for
    :param index_to_pivot_baseline: baseline variables to generate stats for
    :param agg_funcs: functions to use for reporting. Supports a list of functions
    :param timestamp_cols: columns containing timestamp data (e.g. date of measure)
    :param kwargs: parameters from parent class
    """
    super(DatasetMMSE, self).__init__(**kwargs)
    self.health_numeric_cols = health_numeric_cols
    self.cols_to_pivot = gutils.to_list(cols_to_pivot)
    self.index_to_pivot = gutils.to_list(index_to_pivot)
    self.index_to_pivot_baseline = gutils.to_list(index_to_pivot_baseline)
    self.agg_funcs = agg_funcs
    self.timestamp_cols = timestamp_cols

def my_pivot(df, cols_to_pivot, values, index, aggfunc=pd.Series.nunique):
    cols_to_pivot = gutils.to_list(cols_to_pivot)
    pv = df.pivot_table(values=values, index=index, columns=cols_to_pivot[0],
                        aggfunc=aggfunc, margins=True).fillna(0)

    # in case we have a second group to use for pivot
    if isinstance(cols_to_pivot, list) and len(cols_to_pivot) > 1:
        try:  # to avoid duplicates between super class and class: for multi-index
            pv.drop(['organic only', 'All'], level=0, axis=1, inplace=True)  # if only 1 agg function
            pv.drop(['organic only', 'All'], level=1, axis=1, inplace=True)  # if more than 1 agg function
        except Exception:  # to avoid duplicates between super class and class: for single index
            pv.drop(columns=['All'], inplace=True)
        pv1 = df.pivot_table(values=values, index=index, columns=cols_to_pivot[1],
                             aggfunc=aggfunc, margins=True).fillna(0)
        pv = pd.concat([pv1, pv], axis=1, sort=True)

    if isinstance(aggfunc, list):
        pv = pv.swaplevel(axis=1)
        pv.columns = ['_'.join(x) for x in pv.columns]

    # sort by column name
    pv.sort_index(axis=1, inplace=True)
    pv.rename(index={'All': 'All_' + index}, inplace=True)
    pv = pv.loc[:, ~pv.columns.duplicated()]
    return pv.reindex([x for x in pv.index if x != 'not known'] + ['not known'])

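# Usage sketch for my_pivot (illustrative only, not part of the original pipeline):
# the toy DataFrame and the column values below ('gender', 'education_level', the
# diagnosis labels) are assumptions made up for the example. It shows how the
# function cross-tabulates unique patients by up to two pivot groups, adds 'All'
# margins, and pushes the 'not known' row to the bottom.
def _example_my_pivot():
    toy = pd.DataFrame({
        'brcid': [1, 2, 3, 4],
        'gender': ['F', 'M', 'F', 'M'],
        'patient_diagnosis_super_class': ['organic only', 'SMI only', 'organic only', 'SMI only'],
        'education_level': ['primary', 'secondary', 'secondary', 'not known'],
    })
    # counts unique patients per education level, split by diagnosis class and gender
    return my_pivot(toy,
                    cols_to_pivot=['patient_diagnosis_super_class', 'gender'],
                    values='brcid',
                    index='education_level')
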
def regression_cleaning(self, normalize=False, dummyfy=False, keep_only_baseline=False):
    if self.data is None or self.data['data_grouped'] is None:
        self.prep_data(load_type='all')
    df = self.data['data_grouped']

    if normalize:
        numeric_cols = list(df[self.regressors]._get_numeric_data().columns)
        cols_to_normalize = gutils.to_list(self.to_predict) + numeric_cols
        scaler = MinMaxScaler()
        x = df[cols_to_normalize].values
        scaled_values = scaler.fit_transform(x)
        df[cols_to_normalize] = scaled_values

    if dummyfy:
        cols_to_dummyfy = df[self.regressors].select_dtypes(include=['object', 'category']).columns
        dummyfied_df = pd.get_dummies(df[cols_to_dummyfy])
        df = pd.concat([df.drop(columns=cols_to_dummyfy), dummyfied_df], axis=1, sort=True)

    if keep_only_baseline:
        to_drop = [col for col in df.columns
                   if ('_baseline' in col)
                   and col.replace('_baseline', '') not in gutils.to_list(self.to_predict)
                   and col.replace('_baseline', '') in df.columns]
        df.drop(columns=to_drop, inplace=True)
    return df

def lmer_formula(model_type='linear_rdn_int',
                 regressor='score_combined',
                 timestamp='score_date_centered',
                 covariates=None,
                 covariates_slope=False,
                 group='brcid'):
    # decent explanation of different R models:
    # https://www.statsmodels.org/stable/examples/notebooks/generated/mixed_lm_example.html

    # first build covariates string
    if covariates is None:
        str_cov = ''
    else:
        covariates = gutils.to_list(covariates)
        str_cov = ' + ' + ' + '.join(covariates)
        if covariates_slope:
            add_slope = ' + ' + timestamp + ' * '
            str_cov += add_slope + add_slope.join(covariates)

    # now build formula
    if model_type == 'linear_rdn_int':  # random intercept only, linear model
        model_str = regressor + ' ~ ' + timestamp + str_cov + ' + (1|' + group + ')'
    elif model_type == 'linear_rdn_all_no_intercept':  # random slope only, no intercept (??)
        model_str = regressor + ' ~ (' + timestamp + str_cov + ' | ' + group + ')'
    elif model_type == 'linear_rdn_all':  # random slope, random intercept
        model_str = regressor + ' ~ ' + timestamp + str_cov + ' + (1 + ' + timestamp + ' | ' + group + ')'
    elif model_type == 'linear_rdn_all_uncorrel':  # random effects are constrained to be uncorrelated
        model_str = regressor + ' ~ 1 + ' + timestamp + str_cov \
                    + ' + (0 + ' + timestamp + ' | ' + group + ')' \
                    + ' + (1|' + group + ')'
    elif model_type == 'quadratic_rdn_int':  # random intercept only, quadratic model
        model_str = regressor + ' ~ ' + timestamp + ' + I(' + timestamp + '^2)' + str_cov + ' + (1|' + group + ')'
    else:
        return 'model unknown'
    return model_str

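# Illustrative check of the formula strings lmer_formula builds. The covariate
# name 'gender' is a made-up example, not necessarily a real column in the data.
def _example_lmer_formula():
    # random intercept only, no covariates
    print(lmer_formula('linear_rdn_int'))
    # expected: score_combined ~ score_date_centered + (1|brcid)

    # random slope + random intercept, with a hypothetical 'gender' covariate
    print(lmer_formula('linear_rdn_all', covariates='gender'))
    # expected: score_combined ~ score_date_centered + gender + (1 + score_date_centered | brcid)
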
def __init__(
        self,
        file_path, key='brcid', timestamp='age_at_score',  # data path and keys
        baseline_cols=None, na_values=None,  # identify columns
        to_predict='score_combined', regressors=('age_at_score',),  # for regression model
        to_bucket='age_at_score', bucket_min=50, bucket_max=90, interval=0.5, min_obs=3,  # to create groups
):
    """
    create dataset object for trajectories modelling
    :param file_path: path of file containing data
    :param key: group identification (generally individual identification, e.g. brcid)
    :param timestamp: key used as time measure (for baseline values, the oldest/smallest timestamp will be used)
    :param baseline_cols: columns for which we want to keep baseline values
    :param na_values: value to use to replace missing data
    :param to_predict: measure to predict
    :param regressors: list of regressors for prediction modelling
    :param to_bucket: variable to bucket the data on if applicable (data will be grouped by this variable)
    :param bucket_min: min cutoff value for bucketing
    :param bucket_max: max cutoff value for bucketing
    :param interval: interval to use for bucketing (needs to be between 0 and 1)
    :param min_obs: remove individuals having fewer than min_obs observations
    """
    self.file_path = file_path
    self.key = key
    self.timestamp = timestamp
    self.baseline_cols = gutils.to_list(baseline_cols)
    self.na_values = na_values
    self.to_predict = gutils.to_list(to_predict)
    self.regressors = gutils.to_list(regressors)
    self.to_bucket = str(to_bucket)
    self.bucket_min = bucket_min
    self.bucket_max = bucket_max
    self.interval = interval
    self.min_obs = min_obs
    self.data = None

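# Usage sketch (assumptions flagged): this constructor is assumed to belong to the
# Dataset parent class referenced in bucket_data above. The file path, the baseline
# column and the 'gender' regressor below are placeholders for illustration only.
def _example_dataset():
    return Dataset(
        file_path='mmse_data.xlsx',             # hypothetical input file
        key='brcid',                            # one group of rows per patient
        timestamp='age_at_score',               # time axis used for bucketing and baseline values
        baseline_cols=['education_level'],      # hypothetical baseline column
        to_predict='score_combined',
        regressors=('age_at_score', 'gender'),  # 'gender' is a made-up regressor
        to_bucket='age_at_score',
        bucket_min=50, bucket_max=90,           # keep scores measured between ages 50 and 90
        interval=0.5,                           # half-year age buckets
        min_obs=3)                              # drop patients with fewer than 3 buckets
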
def bucket_data(self, additional_cols_to_keep=None, timestamp_cols=None):
    print("bucketing data (method from parent class)")
    cols_to_keep = list(dict.fromkeys(
        gutils.to_list(self.key) + gutils.to_list(self.regressors)
        + gutils.to_list(self.to_bucket) + gutils.to_list(self.to_predict)
        + gutils.to_list(additional_cols_to_keep)))

    # only use data within bucket boundaries
    mask_bucket = (self.data['data'][self.to_bucket] >= self.bucket_min) \
                  & (self.data['data'][self.to_bucket] <= self.bucket_max)
    df = self.data['data'].loc[mask_bucket, cols_to_keep]
    if self.na_values is not None:
        df.fillna(self.na_values, inplace=True)

    # transform bool cols to "yes"/"no" so they are not averaged out in the groupby
    bool_cols = [col for col in df.columns
                 if df[col].value_counts().index.isin([0, 1]).all()]
    if len(bool_cols) > 0:
        df[bool_cols] = df[bool_cols].replace({0: 'no', 1: 'yes'})

    # detect numerical and categorical columns
    categoric_col = [col for col in df.select_dtypes(include=['object', 'category']).columns
                     if (self.key not in col)]
    numeric_col = [col for col in df._get_numeric_data().columns
                   if (col not in [self.key])]

    # group by buckets
    bucket_col = self.to_bucket + '_upbound'
    df[bucket_col] = gutils.round_nearest(df[self.to_bucket], self.interval, 'up')

    # we aggregate by average for numeric variables and baseline value for categorical variables
    keys = categoric_col + numeric_col
    values = ['first'] * len(categoric_col) + ['mean'] * len(numeric_col)
    grouping_dict = dict(zip(keys, values))
    df_grouped = df.groupby([self.key] + [bucket_col], as_index=False).agg(grouping_dict)
    df_grouped = df_grouped.sort_values([self.key, self.to_bucket])

    # keep only patients with at least min_obs buckets, and add a per-patient counter
    df_grouped['occur'] = df_grouped.groupby(self.key)[self.key].transform('size')
    df_grouped = df_grouped[(df_grouped['occur'] >= self.min_obs)]
    df_grouped['counter'] = df_grouped.groupby(self.key).cumcount() + 1

    for x in gutils.to_list(timestamp_cols):
        df_grouped[x + '_upbound'] = gutils.round_nearest(df_grouped[x], self.interval, 'up')
        df_grouped[x + '_centered'] = df_grouped[x + '_upbound'] - df_grouped[x + '_upbound'].min()
    self.data['data_grouped'] = df_grouped

    # now update df and df_baseline with patients who made the cut for modelling
    keys_to_keep = list(df_grouped[self.key].unique())
    self.data['data']['include'] = np.where(
        mask_bucket & (self.data['data'][self.key].isin(keys_to_keep)), 'yes', 'no')
    self.data['data_baseline']['include'] = np.where(
        self.data['data_baseline'][self.key].isin(keys_to_keep), 'yes', 'no')
    return 0

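# Pipeline sketch (assumptions flagged): reuses the hypothetical Dataset from
# _example_dataset() above; prep_data(load_type='all') is assumed to populate
# self.data['data'] and self.data['data_baseline'], as regression_cleaning suggests.
def _example_bucketing():
    ds = _example_dataset()
    ds.prep_data(load_type='all')  # load raw + baseline tables (assumed behaviour)
    ds.bucket_data(additional_cols_to_keep=['education_level'],  # hypothetical extra column to keep
                   timestamp_cols=['score_date'])                # '_upbound' and '_centered' copies are created
    return ds.data['data_grouped']  # one row per patient per age bucket
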
def run_models(
        model_data=r'C:\Users\K1774755\Downloads\phd\mmse_rebecca\mmse_synthetic_data_20190919.xlsx',
        to_predict='score_combined',
        key='brcid',
        covariates=None,
        covariates_slope=False,
        patients_split_col='patient_diagnosis_super_class',
        timestamps=('score_date_centered',),
        complete_case=False,
        models=('linear_rdn_int', 'linear_rdn_all_no_intercept', 'linear_rdn_all', 'quadratic_rdn_int'),
        output_file_path=None,
        conf_int='Wald',
        REML=True):
    """
    :param model_data: path to an xlsx file, or DataFrame, containing the regression data
    :param to_predict: measure to predict
    :param key: group identification (generally individual identification, e.g. brcid)
    :param covariates: covariates to include in the models
    :param covariates_slope: whether to also interact each covariate with the timestamp
    :param patients_split_col: column used to split patients into groups (one set of models per group)
    :param timestamps: timestamp columns to use as the time axis
    :param complete_case: if True, drop observations with missing covariate values
    :param models: model types to run (passed to lmer_formula)
    :param output_file_path: if given, results are written to this xlsx file
    :param conf_int: which method to compute confidence intervals; 'profile', 'Wald' (default),
        or 'boot' (parametric bootstrap)
    :param REML: (bool) whether to fit using restricted maximum likelihood estimation instead of
        maximum likelihood estimation; default True
    :return: DataFrame of model outputs (empty if results are written to file instead)
    """
    if isinstance(model_data, str) and 'xlsx' in model_data:  # load regression data
        model_data = pd.read_excel(model_data, index_col=None)

    if covariates is not None:  # check covariates actually exist in the model data
        covariates = to_list(covariates)
        if not all(elem in model_data.columns for elem in list(covariates)):
            print('covariates entered not in input data:',
                  [x for x in list(covariates) if x not in model_data.columns])
            return pd.DataFrame({'output': 'failure - covariates not in input data'}, index=[0])
        if complete_case:
            print('all cases:', len(model_data), 'observations, ',
                  len(model_data[key].unique()), 'patients')
            model_data = model_data.replace({'not known': np.nan, 'Not Known': np.nan,
                                             'unknown': np.nan, 'Unknown': np.nan,
                                             '[nan-nan]': np.nan})
            model_data = model_data.dropna(subset=list(covariates), how='any')
            print('only complete cases:', len(model_data), 'observations, ',
                  len(model_data[key].unique()), 'patients')

    if output_file_path is not None:
        st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%Hh%M')
        writer = pd.ExcelWriter(output_file_path.replace('.xlsx', st + '.xlsx'), engine='xlsxwriter')

    res = pd.DataFrame()
    col_num = 0
    patient_groups = list(model_data[patients_split_col].unique()) \
        if patients_split_col is not None else ['all']
    for patient_group in patient_groups:
        df_tmp = model_data[model_data[patients_split_col] == patient_group] \
            if patient_group != 'all' else model_data
        row_num = 0
        for ts in timestamps:
            for m in models:
                print('running model:', m, '(patient group:', patient_group, ', timestamp:', ts, ')')
                formula = lmer_formula(model_type=m, regressor=to_predict, timestamp=ts,
                                       covariates=covariates, covariates_slope=covariates_slope,
                                       group=key)
                print('using formula', formula)
                model = Lmer(formula, data=df_tmp)
                try:
                    model.fit(REML=REML, conf_int=conf_int)
                    if model.warnings is not None:  # try other method if convergence failed
                        model.fit(REML=(not REML), conf_int=conf_int)
                    to_print = print_r_model_output(model)
                except Exception:
                    print('something went wrong with model fitting')
                    to_print = pd.DataFrame({'output': 'failure'}, index=[0])

                to_print = pd.concat([to_print], keys=[patient_group], names=[m])
                if output_file_path is not None:
                    to_print.to_excel(writer, startrow=row_num, startcol=col_num)
                    row_num += 2 + len(to_print)
                else:
                    res = pd.concat([res, to_print])
        if output_file_path is not None:
            col_num += to_print.shape[1] + 3
    if output_file_path is not None:
        writer.close()  # close() also writes the workbook (ExcelWriter.save() is deprecated)
    return res

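# Usage sketch (illustrative): runs the random-intercept and random-slope models on
# synthetic data, splitting patients by diagnosis class. The input path and the
# 'gender' covariate are placeholders, not real project settings.
def _example_run_models():
    return run_models(
        model_data='mmse_synthetic_data.xlsx',  # hypothetical path to regression data
        to_predict='score_combined',
        key='brcid',
        covariates=['gender'],                  # made-up covariate for illustration
        models=('linear_rdn_int', 'linear_rdn_all'),
        output_file_path=None,                  # return results as a DataFrame instead of writing xlsx
        conf_int='Wald',
        REML=True)
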
def lemmatize_words(words_list):
    # nlp is expected to be a spaCy language pipeline loaded at module level
    words_list = ' '.join(gutils.to_list(words_list))
    res = [tkn.lemma_.lower() for tkn in nlp(words_list)]
    return res

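# Usage sketch (assumption: the module defines `nlp` as a spaCy pipeline, e.g.
# nlp = spacy.load('en_core_web_sm'), and that model is installed locally).
def _example_lemmatize_words():
    return lemmatize_words(['Dogs', 'walked'])  # roughly ['dog', 'walk'], depending on the spaCy model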