Example #1
    def bucket_data(self):
        print("bucketing data (method from the MMSE class)")
        Dataset.bucket_data(
            self,
            additional_cols_to_keep=gutils.to_list(self.index_to_pivot) +
            gutils.to_list(self.cols_to_pivot),
            timestamp_cols=['score_date'])
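Every snippet on this page funnels scalar-or-list arguments through gutils.to_list. The project's own helper is not shown here, but a minimal sketch consistent with how it is used (note that it must map None to an empty list) could be:

def to_list(obj):
    # sketch of the assumed gutils helper, not the project's actual code
    if obj is None:
        return []
    if isinstance(obj, (list, tuple, set)):
        return list(obj)
    return [obj]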
Example #2
    def __init__(self,
                 health_numeric_cols=None,
                 cols_to_pivot=None,
                 index_to_pivot=None,
                 index_to_pivot_baseline=None,
                 agg_funcs=None,
                 timestamp_cols=None,
                 **kwargs):
        """

        :param health_numeric_cols: columns containing health numeric data (for report generation)
        :param cols_to_pivot: groups to use (needs to be a list, at the moment maximum of 2 groups allowed)
        :param index_to_pivot: variables to generate stats for
        :param index_to_pivot_baseline: baseline variables to generate stats for
        :param agg_funcs: functions to use for reporting. supports list of functions
        :param timestamp_cols: columns containing timestamp data (e.g. date of measure)
        :param kwargs: parameters from parent class
        """
        super(DatasetMMSE, self).__init__(**kwargs)
        self.health_numeric_cols = health_numeric_cols
        self.cols_to_pivot = gutils.to_list(cols_to_pivot)
        self.index_to_pivot = gutils.to_list(index_to_pivot)
        self.index_to_pivot_baseline = gutils.to_list(index_to_pivot_baseline)
        self.agg_funcs = agg_funcs
        self.timestamp_cols = timestamp_cols
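A hypothetical instantiation of the class above; the file name and column names below are made up for illustration, and base parameters such as file_path are forwarded to the parent class via **kwargs:

import pandas as pd

dataset = DatasetMMSE(
    file_path='mmse_data.xlsx',           # hypothetical file, passed through to the parent class
    health_numeric_cols=['bmi'],          # made-up column names throughout
    cols_to_pivot='gender',               # to_list() turns this scalar into ['gender']
    index_to_pivot=['education_level'],
    agg_funcs=[pd.Series.nunique],
    timestamp_cols=['score_date'])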
Example #3
def my_pivot(df, cols_to_pivot, values, index, aggfunc=pd.Series.nunique):
    cols_to_pivot = gutils.to_list(cols_to_pivot)
    pv = df.pivot_table(values=values,
                        index=index,
                        columns=cols_to_pivot[0],
                        aggfunc=aggfunc,
                        margins=True).fillna(0)
    # in case we have a second group to use for the pivot
    if len(cols_to_pivot) > 1:
        try:  # multi-index columns: drop labels shared by the two pivots (e.g. diagnosis super-class vs class) to avoid duplicates
            pv.drop(['organic only', 'All'], level=0, axis=1,
                    inplace=True)  # if only 1 agg function
            pv.drop(['organic only', 'All'], level=1, axis=1,
                    inplace=True)  # if more than 1 agg function
        except Exception:  # single-index columns: only the 'All' margin is duplicated
            pv.drop(columns=['All'], inplace=True)
        pv1 = df.pivot_table(values=values,
                             index=index,
                             columns=cols_to_pivot[1],
                             aggfunc=aggfunc,
                             margins=True).fillna(0)
        pv = pd.concat([pv1, pv], axis=1, sort=True)

    if isinstance(aggfunc, list):
        pv = pv.swaplevel(axis=1)
        pv.columns = ['_'.join(x) for x in pv.columns]
    # sort by column name
    pv.sort_index(axis=1, inplace=True)
    pv.rename(index={'All': 'All_' + index}, inplace=True)
    pv = pv.loc[:, ~pv.columns.duplicated()]
    # move the 'not known' row to the end of the table
    return pv.reindex([x for x in pv.index if x != 'not known'] + ['not known'])
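my_pivot is essentially two pivot_table calls with margins stitched together. What a single call with margins=True produces can be seen on toy data with plain pandas:

import pandas as pd

df = pd.DataFrame({'diagnosis': ['F00', 'F00', 'G30', 'G30'],
                   'gender': ['F', 'M', 'F', 'F'],
                   'brcid': [1, 2, 3, 4]})
pv = df.pivot_table(values='brcid', index='diagnosis', columns='gender',
                    aggfunc=pd.Series.nunique, margins=True).fillna(0)
print(pv)  # columns F, M plus an 'All' margin; rows F00, G30 plus an 'All' margin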
Example #4
    def regression_cleaning(self,
                            normalize=False,
                            dummyfy=False,
                            keep_only_baseline=False):
        if self.data is None or self.data['data_grouped'] is None:
            self.prep_data(load_type='all')
        df = self.data['data_grouped']
        if normalize:  # scale the outcome and all numeric regressors to [0, 1]
            numeric_cols = list(df[self.regressors]._get_numeric_data().columns)
            cols_to_normalize = gutils.to_list(self.to_predict) + numeric_cols
            scaler = MinMaxScaler()
            x = df[cols_to_normalize].values
            scaled_values = scaler.fit_transform(x)
            df[cols_to_normalize] = scaled_values
        if dummyfy:  # one-hot encode categorical regressors
            cols_to_dummyfy = df[self.regressors].select_dtypes(
                include=['object', 'category']).columns
            dummyfied_df = pd.get_dummies(df[cols_to_dummyfy])
            df = pd.concat([df.drop(columns=cols_to_dummyfy), dummyfied_df],
                           axis=1,
                           sort=True)
        if keep_only_baseline:  # drop '_baseline' columns whose non-baseline version is still in df (except for the outcome)
            to_drop = [
                col for col in df.columns
                if ('_baseline' in col)
                and col.replace('_baseline', '') not in gutils.to_list(self.to_predict)
                and col.replace('_baseline', '') in df.columns
            ]
            df.drop(columns=to_drop, inplace=True)
        return df
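The normalize and dummyfy branches are standard scikit-learn / pandas patterns; stripped of the class context, on toy data, they amount to this sketch:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame({'age': [50, 60, 70], 'gender': ['F', 'M', 'F']})
df[['age']] = MinMaxScaler().fit_transform(df[['age']])  # scale numeric columns to [0, 1]
df = pd.concat([df.drop(columns=['gender']),
                pd.get_dummies(df['gender'])], axis=1)   # one-hot encode categoricals
print(df)  # columns: age (scaled), F, M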
Example #5
def lmer_formula(model_type='linear_rdn_int',
                 regressor='score_combined',
                 timestamp='score_date_centered',
                 covariates=None,
                 covariates_slope=False,
                 group='brcid'):
    # decent explanation of the different mixed models:
    # https://www.statsmodels.org/stable/examples/notebooks/generated/mixed_lm_example.html

    # first build the covariates string
    if covariates is None:
        str_cov = ''
    else:
        covariates = gutils.to_list(covariates)
        str_cov = ' + ' + ' + '.join(covariates)
        if covariates_slope:  # also interact each covariate with the timestamp
            add_slope = ' + ' + timestamp + ' * '
            str_cov += add_slope + add_slope.join(covariates)

    # now build the formula
    if model_type == 'linear_rdn_int':  # linear model, random intercept only
        model_str = f'{regressor} ~ {timestamp}{str_cov} + (1|{group})'
    elif model_type == 'linear_rdn_all_no_intercept':  # random effects only, no fixed effect for the timestamp
        model_str = f'{regressor} ~ ({timestamp}{str_cov} | {group})'
    elif model_type == 'linear_rdn_all':  # random slope and random intercept
        model_str = f'{regressor} ~ {timestamp}{str_cov} + (1 + {timestamp} | {group})'
    elif model_type == 'linear_rdn_all_uncorrel':  # random effects constrained to be uncorrelated
        model_str = (f'{regressor} ~ 1 + {timestamp}{str_cov}'
                     f' + (0 + {timestamp} | {group})'
                     f' + (1|{group})')
    elif model_type == 'quadratic_rdn_int':  # quadratic model, random intercept only
        model_str = f'{regressor} ~ {timestamp} + I({timestamp}^2){str_cov} + (1|{group})'
    else:
        return 'model unknown'
    return model_str
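With the default arguments, the lme4-style formulas produced by the function above look like this:

for m in ('linear_rdn_int', 'linear_rdn_all', 'quadratic_rdn_int'):
    print(m, '->', lmer_formula(model_type=m))
# linear_rdn_int    -> score_combined ~ score_date_centered + (1|brcid)
# linear_rdn_all    -> score_combined ~ score_date_centered + (1 + score_date_centered | brcid)
# quadratic_rdn_int -> score_combined ~ score_date_centered + I(score_date_centered^2) + (1|brcid)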
Example #6
    def __init__(
            self,
            file_path,
            key='brcid',
            timestamp='age_at_score',  # data path and keys
            baseline_cols=None,
            na_values=None,  # identify columns
            to_predict='score_combined',
            regressors=('age_at_score', ),  # for regression model
            to_bucket='age_at_score',
            bucket_min=50,
            bucket_max=90,
            interval=0.5,
            min_obs=3,  # to create groups
    ):
        """
        Create a dataset object for trajectory modelling.
        :param file_path: path of the file containing the data
        :param key: group identifier (generally an individual identifier, e.g. brcid)
        :param timestamp: key used as the time measure (for baseline values, the oldest/smallest timestamp is used)
        :param baseline_cols: columns for which to keep baseline values
        :param na_values: value used to replace missing data
        :param to_predict: measure to predict
        :param regressors: list of regressors for prediction modelling
        :param to_bucket: variable to bucket the data on, if applicable (data is grouped by this variable)
        :param bucket_min: min cutoff value for bucketing
        :param bucket_max: max cutoff value for bucketing
        :param interval: interval to use for bucketing (needs to be between 0 and 1)
        :param min_obs: remove individuals with fewer than min_obs observations
        """
        self.file_path = file_path
        self.key = key
        self.timestamp = timestamp
        self.baseline_cols = gutils.to_list(baseline_cols)
        self.na_values = na_values
        self.to_predict = gutils.to_list(to_predict)
        self.regressors = gutils.to_list(regressors)
        self.to_bucket = str(to_bucket)
        self.bucket_min = bucket_min
        self.bucket_max = bucket_max
        self.interval = interval
        self.min_obs = min_obs
        self.data = None
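The bucketing controlled by to_bucket/interval (and performed in bucket_data below via gutils.round_nearest) amounts to rounding each value up to the nearest multiple of the interval. The real gutils implementation is not shown on this page; a minimal numpy sketch:

import numpy as np

def round_nearest(values, interval, direction='up'):
    # sketch of the assumed gutils helper
    if direction == 'up':
        return np.ceil(values / interval) * interval
    return np.floor(values / interval) * interval

# round_nearest(np.array([50.1, 50.6]), 0.5, 'up') -> [50.5, 51.0]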
Example #7
    def bucket_data(self, additional_cols_to_keep=None, timestamp_cols=None):
        print("bucketting data (method from parent class)")
        cols_to_keep = list(
            dict.fromkeys(
                gutils.to_list(self.key) + gutils.to_list(self.regressors) +
                gutils.to_list(self.to_bucket) +
                gutils.to_list(self.to_predict) +
                gutils.to_list(additional_cols_to_keep)))
        # only use data within bucket boundaries
        mask_bucket = (self.data['data'][self.to_bucket] >=
                       self.bucket_min) & (self.data['data'][self.to_bucket] <=
                                           self.bucket_max)
        df = self.data['data'].loc[mask_bucket, cols_to_keep]
        if self.na_values is not None:
            df.fillna(self.na_values, inplace=True)
        # transform bool cols to "yes"/"no" so they are not averaged out in the groupby
        bool_cols = [
            col for col in df.columns
            if df[col].value_counts().index.isin([0, 1]).all()
        ]
        if len(bool_cols) > 0:
            df[bool_cols] = df[bool_cols].replace({0: 'no', 1: 'yes'})
        # detect numerical and categorical columns
        categoric_col = [
            col
            for col in df.select_dtypes(include=['object', 'category']).columns
            if (self.key not in col)
        ]
        numeric_col = [
            col for col in df._get_numeric_data().columns
            if (col not in [self.key])
        ]
        # group by buckets
        bucket_col = self.to_bucket + '_upbound'
        df[bucket_col] = gutils.round_nearest(df[self.to_bucket],
                                              self.interval, 'up')
        # we aggregate by average for numeric variables and baseline value for categorical variables
        keys = categoric_col + numeric_col
        values = ['first'] * len(categoric_col) + ['mean'] * len(numeric_col)
        grouping_dict = dict(zip(keys, values))

        df_grouped = df.groupby([self.key] + [bucket_col],
                                as_index=False).agg(grouping_dict)
        df_grouped = df_grouped.sort_values([self.key, self.to_bucket])

        df_grouped['occur'] = df_grouped.groupby(
            self.key)[self.key].transform('size')
        df_grouped = df_grouped[(df_grouped['occur'] >= self.min_obs)]
        df_grouped['counter'] = df_grouped.groupby(self.key).cumcount() + 1
        for x in gutils.to_list(timestamp_cols):  # to_list handles the timestamp_cols=None default
            df_grouped[x + '_upbound'] = gutils.round_nearest(
                df_grouped[x], self.interval, 'up')
            df_grouped[x + '_centered'] = df_grouped[
                x + '_upbound'] - df_grouped[x + '_upbound'].min()
        self.data['data_grouped'] = df_grouped

        # now update df and df_baseline with patients who made the cut for modelling
        keys_to_keep = list(df_grouped[self.key].unique())
        self.data['data']['include'] = np.where(
            mask_bucket & (self.data['data'][self.key].isin(keys_to_keep)),
            'yes', 'no')
        self.data['data_baseline']['include'] = np.where(
            self.data['data_baseline'][self.key].isin(keys_to_keep), 'yes',
            'no')
        return 0
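The core of bucket_data is a single groupby/agg call with a per-column aggregation map ('first' for categoricals, 'mean' for numerics). In isolation, on toy data:

import pandas as pd

df = pd.DataFrame({'brcid': [1, 1, 2], 'age_upbound': [50.5, 50.5, 51.0],
                   'gender': ['F', 'F', 'M'], 'score': [25, 27, 30]})
grouped = df.groupby(['brcid', 'age_upbound'], as_index=False).agg(
    {'gender': 'first', 'score': 'mean'})
print(grouped)  # one row per (brcid, bucket): gender kept as-is, score averaged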
Example #8
def run_models(
        model_data=r'C:\Users\K1774755\Downloads\phd\mmse_rebecca\mmse_synthetic_data_20190919.xlsx',
        to_predict='score_combined',
        key='brcid',
        covariates=None,
        covariates_slope=False,
        patients_split_col='patient_diagnosis_super_class',
        timestamps=('score_date_centered', ),
        complete_case=False,
        models=('linear_rdn_int', 'linear_rdn_all_no_intercept',
                'linear_rdn_all', 'quadratic_rdn_int'),
        output_file_path=None,
        conf_int='Wald',
        REML=True):
    """

    :param model_data:
    :param to_predict:
    :param key:
    :param covariates:
    :param covariates_slope:
    :param patients_split_col:
    :param timestamps:
    :param complete_case:
    :param models:
    :param output_file_path:
    :param conf_int: which method to compute confidence intervals; 'profile', 'Wald' (default), or 'boot' (parametric bootstrap)
    :param REML: (bool) whether to fit using restricted maximum likelihood estimation instead of maximum likelihood estimation; default True
    :return:
    """
    if isinstance(model_data,
                  str) and 'xlsx' in model_data:  # load regression data
        model_data = pd.read_excel(model_data, index_col=None)
    if covariates is not None:  # check covariates actually exist in the model data
        covariates = to_list(covariates)
        if not all(elem in model_data.columns for elem in list(covariates)):
            print('covariates entered not in input data:',
                  [x for x in list(covariates) if x not in model_data.columns])
            return pd.DataFrame(
                {'output': 'failure - covariates not in input data'},
                index=[0])
    if complete_case:
        print('all cases:', len(model_data), 'observations, ',
              len(model_data[key].unique()), 'patients')
        model_data = model_data.replace({
            'not known': np.nan,
            'Not Known': np.nan,
            'unknown': np.nan,
            'Unknown': np.nan,
            '[nan-nan]': np.nan
        })
        model_data = model_data.dropna(subset=to_list(covariates), how='any')  # to_list handles covariates=None
        print('only complete cases:', len(model_data), 'observations, ',
              len(model_data[key].unique()), 'patients')
    if output_file_path is not None:
        st = datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y%m%d-%Hh%M')
        writer = pd.ExcelWriter(output_file_path.replace(
            '.xlsx', st + '.xlsx'),
                                engine='xlsxwriter')

    res = pd.DataFrame()
    col_num = 0
    patient_groups = list(model_data[patients_split_col].unique()
                          ) if patients_split_col is not None else ['all']
    for patient_group in patient_groups:
        df_tmp = model_data[model_data[patients_split_col] == patient_group] \
            if patient_group != 'all' else model_data
        row_num = 0
        for ts in timestamps:
            for m in models:
                print('running model:', m, '(patient group:', patient_group,
                      ', timestamp:', ts, ')')
                formula = lmer_formula(model_type=m,
                                       regressor=to_predict,
                                       timestamp=ts,
                                       covariates=covariates,
                                       covariates_slope=covariates_slope,
                                       group=key)
                print('using formula', formula)
                model = Lmer(formula, data=df_tmp)
                try:
                    model.fit(REML=REML, conf_int=conf_int)
                    if model.warnings is not None:  # try other method if convergence failed
                        model.fit(REML=(not REML), conf_int=conf_int)
                    to_print = print_r_model_output(model)
                except Exception:
                    print('something went wrong with model fitting')
                    to_print = pd.DataFrame({'output': 'failure'}, index=[0])
                to_print = pd.concat([to_print],
                                     keys=[patient_group],
                                     names=[m])

                if output_file_path is not None:
                    to_print.to_excel(writer,
                                      startrow=row_num,
                                      startcol=col_num)
                    row_num += 2 + len(to_print)
                else:
                    res = pd.concat([res, to_print])  # DataFrame.append was removed in pandas 2.x

        if output_file_path is not None: col_num += to_print.shape[1] + 3
    if output_file_path is not None: writer.close()  # ExcelWriter.save() was removed in pandas 2.x
    return res
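run_models relies on pymer4's Lmer wrapper around R's lme4. A minimal stand-alone fit, assuming pymer4 and R/lme4 are installed and model_data is a DataFrame containing the columns used in the formula:

from pymer4.models import Lmer

model = Lmer('score_combined ~ score_date_centered + (1|brcid)', data=model_data)
model.fit(REML=True, conf_int='Wald')
print(model.coefs)  # fixed-effect estimates with Wald confidence intervals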
Example #9
def lemmatize_words(words_list):
    # `nlp` is expected to be a spaCy pipeline loaded at module level
    words_list = ' '.join(gutils.to_list(words_list))
    res = [tkn.lemma_.lower() for tkn in nlp(words_list)]
    return res
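lemmatize_words assumes a spaCy pipeline bound to a module-level nlp variable; a usage sketch, assuming the small English model is installed:

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed model; any spaCy pipeline with a lemmatizer works
print(lemmatize_words(['was', 'running']))  # -> ['be', 'run']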