Example #1
def get_stats_df(len_df_X, atlas, nr_threads, env_name, time_started):
    '''
    saves the parameters that are used for a specific analysis run into a table
    '''

    import sklearn
    import matplotlib
    import numpy as np  # np is referenced below; imported here so the snippet is self-contained

    stats = Table().get_clean_df()
    d = {
        'pandas version': Table().pd_ver,
        'numpy version': np.__version__,
        'matplotlib version': matplotlib.__version__,
        'sklearn version': sklearn.__version__,
        'number of iterations': definitions.prediction_defs['NUM_ITER'],
        'atlas': atlas,
        'nr of features': len_df_X,
        'nr of threads': nr_threads,
        'remote name': env_name,
        'analysis started at': time_started,
    }
    for i, key in enumerate(d):
        stats.at[i, 'stats'] = key
        stats.at[i, 'values'] = d[key]
    return stats
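
A self-contained sketch of the same pattern with plain pandas, in case the project's Table wrapper is not at hand (the feature count is a placeholder value):

import time

import numpy as np
import pandas as pd

params = {
    'pandas version': pd.__version__,
    'numpy version': np.__version__,
    'nr of features': 42,  # placeholder value
    'analysis started at': time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
}
# two columns, one row per recorded parameter, as in get_stats_df
stats = pd.DataFrame({'stats': list(params.keys()),
                      'values': list(params.values())})
print(stats)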
Example #2
def linreg_moderation_results(df_X_linreg, ls_cols_X_atlas, group_param,
                              regression_param, path_dir_save_results, group):
    '''perform moderation analysis
    Args:
        df_X_linreg:           pandas.DataFrame with columns and data for analysis
        ls_cols_X_atlas:       list of cols from df_X_linreg that will be used for the linear regression analysis
        group_param:           str, name of the column from df_X_linreg that was used to create the groups
        regression_param:      str, name of the column from df_X_linreg that will be used for the regression analysis (e.g., Age)
        path_dir_save_results: abspath of the folder where the csv file will be saved
        group:                 group name to use for the results csv file
    Return:
        none; creates a csv file
    '''
    d_result = compute_linreg_data(df_X_linreg, ls_cols_X_atlas, group_param,
                                   regression_param)
    df_result = Table().create_df_from_dict(d_result)
    Table().save_df(
        df_result,
        path.join(path_dir_save_results, f'linreg_moderation_{group}.csv'))
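
compute_linreg_data is not shown in this example; as a neutral, self-contained illustration of a moderation test (not necessarily the project's exact formulation), here is a statsmodels fit with a group x Age interaction, where all names and values are made up:

import pandas as pd
import statsmodels.formula.api as smf

df = pd.DataFrame({
    'Age':   [61, 67, 72, 59, 66, 70, 63, 68],
    'group': [0, 0, 0, 0, 1, 1, 1, 1],
    'meas':  [3.1, 2.9, 2.7, 3.0, 3.4, 3.3, 3.2, 3.5],
})
# 'Age * group' expands to Age + group + Age:group
model = smf.ols('meas ~ Age * group', data=df).fit()
print(model.pvalues['Age:group'])  # p-value of the moderation (interaction) term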
Example #3
    def get_groups_and_variables(self, proj_vars, GLM_file_group, vars_fs):
        """creating working variables and dictionaries
        """
        cols_2use = proj_vars["variables_for_glm"] + [
            self.id_col, self.group_col
        ]
        df_groups_clin = Table().get_df_with_columns(GLM_file_group, cols_2use)

        self.ls_groups = pd.unique(df_groups_clin[self.group_col]).tolist()
        self.ids = self.get_ids_ready4glm(df_groups_clin[self.id_col].tolist(),
                                          vars_fs)

        d_init = df_groups_clin.to_dict()
        self.d_subjid = {}
        self.ls_vars_stats = [key for key in d_init if key != self.id_col]
        for rownr in d_init[self.id_col]:
            _id = d_init[self.id_col][rownr]
            if _id in self.ids:
                self.d_subjid[_id] = {}
                for var in self.ls_vars_stats:
                    self.d_subjid[_id][var] = d_init[var][rownr]
        self.ls_vars_stats.remove(self.group_col)
        self.make_subjects_per_group(df_groups_clin)
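
A self-contained illustration of the nested dict the loop above builds, assuming id_col='id' and group_col='group' (hypothetical names):

import pandas as pd

df = pd.DataFrame({'id': ['s1', 's2'], 'group': ['pat', 'con'], 'Age': [64, 59]})
d_init = df.to_dict()
d_subjid = {}
for rownr in d_init['id']:  # iterates the row numbers
    _id = d_init['id'][rownr]
    d_subjid[_id] = {var: d_init[var][rownr] for var in ('group', 'Age')}
print(d_subjid)  # {'s1': {'group': 'pat', 'Age': 64}, 's2': {'group': 'con', 'Age': 59}}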
Example #4
def save_features(dic_feat_comps,
                  expl_variance,
                  file2save,
                  img2save,
                  lang="EN"):
    """features extracted from PCA
        are being saved to a table
        and image
    Args:
        dic_feat_comps = {feature_name: explained_variance}
        expl_variance  = PCA explained_variance_
        file2save      = abspath to the csv file to save the table
        img2save       = abspath to the image png file to save the table
        lang           = language used to describe the results
    """
    df_feat_comps = Table().create_df(dic_feat_comps.values(),
                                      index_col=dic_feat_comps.keys(),
                                      cols=['explained_variance'])
    df_feat_comps.to_csv(file2save)
    plotting.plot_simple(vals=np.cumsum(expl_variance),
                         xlabel=params_lang[lang]['nr_components'],
                         ylabel=params_lang[lang]['expl_cum_var'],
                         path_to_save_file=img2save)
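
A possible call with synthetic PCA output; it assumes the surrounding module (Table, plotting, params_lang) is importable, and the paths are hypothetical:

import numpy as np
from sklearn.decomposition import PCA

X = np.random.default_rng(0).normal(size=(30, 5))
pca = PCA(n_components=3).fit(X)
dic_feat_comps = {f'PC{i + 1}': v
                  for i, v in enumerate(pca.explained_variance_)}
save_features(dic_feat_comps,
              pca.explained_variance_,
              file2save='/tmp/pca_features.csv',
              img2save='/tmp/pca_features.png')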
Example #5
class ttest_do():
    def __init__(self,
                 df,
                 group_col,
                 ls_cols,
                 groups,
                 path_save_res,
                 p_thresh=0.05):
        self.df = df
        self.group_col = group_col
        self.ls_cols = ls_cols
        self.groups = groups
        self.path_save_res = path_save_res
        self.ls_meas = get_names_of_measurements()
        self.ls_struct = get_names_of_structures()
        self.res_ttest = self.compute_ttest_for_col(p_thresh)
        self.tab = Table()

    def compute_ttest_for_col(self, p_thresh):
        res_4df = {'features': [], 'ttest': [], 'welch': []}
        res = dict()
        for col in self.ls_cols:
            group1 = self.df[self.df[self.group_col] == self.groups[0]][col]
            group2 = self.df[self.df[self.group_col] == self.groups[1]][col]
            ttest_eq_pop_var = stats.ttest_ind(group1, group2, equal_var=True)
            ttest_welch = stats.ttest_ind(group1, group2, equal_var=False)
            if ttest_eq_pop_var[1] < p_thresh:
                meas, struct = get_structure_measurement(
                    col, self.ls_meas, self.ls_struct)
                #print('{:<15} {}'.format(meas, struct))
                res[col] = {
                    '{}, mean'.format(self.groups[0]): stats.tmean(group1),
                    '{}, std'.format(self.groups[0]): stats.tstd(group1),
                    '{}, mean'.format(self.groups[1]): stats.tmean(group2),
                    '{}, std'.format(self.groups[1]): stats.tstd(group2),
                    'ttest': ttest_eq_pop_var[1],
                    'welch': ttest_welch[1],
                    'kurtosis': stats.kurtosis(self.df[col]),
                    'skewness': stats.skew(self.df[col])
                }
                res_4df['features'].append(struct + ' (' + meas + ')')
                res_4df['ttest'].append(ttest_eq_pop_var[1])
                res_4df['welch'].append(ttest_welch[1])
        self.save_res(res_4df)
        return res

    def save_res(self, res_4df):
        df_result = self.tab.create_df_from_dict(res_4df)
        df_result.to_csv(os.path.join(self.path_save_res, 'ttest.csv'))
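
A standalone sketch of the two tests used above, Student's t (equal variances assumed) versus Welch's t (unequal variances), on toy data:

from scipy import stats

group1 = [3.1, 2.9, 2.7, 3.0, 2.8]
group2 = [3.4, 3.3, 3.2, 3.5, 3.6]
print(stats.ttest_ind(group1, group2, equal_var=True).pvalue)   # Student's t
print(stats.ttest_ind(group1, group2, equal_var=False).pvalue)  # Welch's t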
Example #6
def save_df_Emmanuelle(df,
                       groups,
                       stats_dic,
                       cols2color_sig,
                       path2save,
                       make_with_colors,
                       extensions=('xlsx', 'csv', 'json')):

    if 'xlsx' in extensions:
        import openpyxl
        import string
        df.to_excel('stats_new.xlsx')
        ########## MERGE MEAN/STD SUB-INDEXES ################
        file = openpyxl.load_workbook('stats_new.xlsx')
        sheet = file['Sheet1']
        alpha = string.ascii_uppercase
        for ltr in range(len(alpha))[1:(2 * len(groups)) + 1:2]:
            cell1, cell2 = alpha[ltr] + str(2), alpha[ltr + 1] + str(2)
            sheet.merge_cells(str(cell1 + ':' + cell2))
        file.save('stats_new.xlsx')

    if 'json' in extensions:
        utilities.save_json(stats_dic, os.path.join(path2save, 'stats.json'))

    if 'csv' in extensions:
        tab = Table()
        tab.save_df(df,
                    os.path.join(path2save, 'stats_new.csv'),
                    sheet_name='stats')

    if make_with_colors:
        save_2xlsx_with_colors_Emmanuelle(df,
                                          'stats_new.xlsx',
                                          path2save,
                                          'stats_wcolors.xlsx',
                                          cols2color_sig=cols2color_sig)
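
The merge loop fuses the mean/std sub-header cells pairwise on row 2; shown standalone for a hypothetical two-group table:

import string

groups = ['pat', 'con']
alpha = string.ascii_uppercase
for ltr in range(len(alpha))[1:(2 * len(groups)) + 1:2]:
    print(f'{alpha[ltr]}2:{alpha[ltr + 1]}2')  # prints B2:C2, then D2:E2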
Example #7
def feature_ranking(X_scaled, y_transform, cols_X):
    """
    get the ranking of all features
    :param X_scaled:
    :param y_transform:
    :return: the pandas Dataframe of all ranking feature in a sorted way
    """
    clf = RandomForestClassifier()
    feature_selector = RFE(clf)
    feature_selector.fit(X_scaled, y_transform)

    features_rfe_and_rank_df = Table().create_df(feature_selector.ranking_,
                                                 index_col=cols_X,
                                                 cols=['ranking']).sort_values(
                                                     ['ranking'])

    # features_rfe_and_rank_df = pd.DataFrame(feature_selector.ranking_,
    #                                     index=cols_X, columns=['ranking']).sort_values(['ranking'])
    features_rfe_and_rank_df['feature'] = features_rfe_and_rank_df.index
    return features_rfe_and_rank_df['feature'], features_rfe_and_rank_df
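
A self-contained RFE sketch on synthetic data, mirroring the function above without the project's Table wrapper:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

X, y = make_classification(n_samples=60, n_features=6, random_state=0)
selector = RFE(RandomForestClassifier(random_state=0)).fit(X, y)
# rank 1 marks the selected features
ranking = pd.DataFrame({'ranking': selector.ranking_},
                       index=[f'feat{i}' for i in range(6)]).sort_values('ranking')
print(ranking)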
Example #8
def mkstatisticsf(df_4stats,
                  groups,
                  group_col,
                  path2save,
                  make_with_colors=True):
    '''Creates a descriptive statistics file for publication,
        based on the provided pandas.DataFrame
        Works only on 2 groups
    Args:
        df_4stats: pandas.DataFrame
        groups: list/tuple of groups as str/int
        group_col: str, column name in df_4stats that has the group names from groups
        path2save: abspath to save the descriptive files
        make_with_colors: will create an additional .xlsx file with
                        colored significant results,
                        provided xlwt is installed
    Return:
        json file with results
        .csv file with results
        .xlsx file with results, with significant ones colored red
    '''

    tab = Table()
    ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'TTest', 'Welch',
                'ANOVA', 'Bartlett', 'MannWhitneyu', 'Kruskal')

    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats, group_col,
                                                    group)

    stats_dic = dict()
    vals2chk = df_4stats.columns.tolist()
    if group_col in vals2chk:
        vals2chk.remove(group_col)

    cols2color_sig = list()
    groups = list(groups_df.keys())
    group1 = groups_df[groups[0]]
    group2 = groups_df[groups[1]]
    for test in ls_tests:
        for val in vals2chk:
            results, params = get_stats(test, group1[val], group2[val])
            if test in ('mean', 'std', 'kurtosis', 'skewness'):
                key1 = f'{groups[0]}, {params[0]}'
                key2 = f'{groups[1]}, {params[0]}'
            else:
                key1 = f'{test}, {params[0]}'
                key2 = f'{test}, {params[1]}'
                cols2color_sig.append(key2)
            for key in (key1, key2):
                if key not in stats_dic:
                    stats_dic[key] = dict()
            stats_dic[key1][val] = f'{results[0]}'
            stats_dic[key2][val] = f'{results[1]}'

    df = tab.create_df_from_dict(stats_dic)
    tab.save_df(df,
                os.path.join(path2save, 'stats_general.csv'),
                sheet_name='stats')
    utilities.save_json(stats_dic, os.path.join(path2save,
                                                'stats_general.json'))
    if make_with_colors:
        save_2xlsx_with_colors(df,
                               path2save=path2save,
                               cols2color_sig=cols2color_sig)
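
The nested stats_dic maps column keys to {variable: value}; pandas turns it into the final table with the outer keys as columns. A toy illustration (labels are made up; the exact key names come from get_stats, which is not shown here):

import pandas as pd

stats_dic = {
    'pat, mean': {'Left-Hippocampus': '3.1'},
    'con, mean': {'Left-Hippocampus': '3.4'},
    'TTest, statistic': {'Left-Hippocampus': '-2.1'},
    'TTest, pvalue': {'Left-Hippocampus': '0.04'},
}
print(pd.DataFrame(stats_dic))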
Example #9
class ClusterFile2CSV():

    def __init__(self,
                file_abspath,
                result_abspath):
        from stats.db_processing import Table
        self.contrasts = fs_definitions.GLMcontrasts['contrasts']
        self.get_explanations()

        self.col_4constrasts = "Contrast"
        self.header = ("ClusterNo",
                      "Max", "VtxMax", "Size(mm^2)", 
                      "TalX", "TalY", "TalZ",
                      "CWP", "CWPLow", "CWPHi",
                      "NVtxs", "WghtVtx",
                      "Annot", self.col_4constrasts, "Explanation")
        self.length_matrix = len(self.header)
        with open(file_abspath, 'r') as f:
            self.content = f.readlines()
        self.result_abspath = result_abspath
        self.tab = Table()
        self.ls_vals_2chk = self.contrasts.keys()
        self.run()

    def run(self):
        d = dict()
        i = 0

        while i < len(self.content):
            line = self.content[i].replace('\n','')
            if self.chk_if_vals_in_line(line):
                expl = self.content[i+1].replace('\n','').replace(';','.')
                d[i] = [''] * 13 + [line, expl]
                i += 2
            else:
                line = self.clean_nans_from_list(line.split(' '))
                i += 1
                if len(line) != 0:
                    d[i] = line + ['','']
        self.save_2table(d)

    def save_2table(self, d):
        df = self.tab.create_df_from_dict(d).T
        df = df.rename(columns=dict(zip(df.columns, self.header)))
        df = df.set_index(self.col_4constrasts)
        self.tab.save_df(df, self.result_abspath)

    def chk_if_vals_in_line(self, line):
        '''checks each value from self.ls_vals_2chk;
            returns True as soon as one is present in the line,
            otherwise returns False
        '''
        exists = False

        for val_2chk in self.ls_vals_2chk:
            if val_2chk in line:
                exists = True
                break
        return exists

    def clean_nans_from_list(self, ls):
        return [i for i in ls if i != '']

    def get_explanations(self):
        self.explanations = list()
        for key in self.contrasts:
            for file_name in self.contrasts[key]:
                self.explanations.append(self.contrasts[key][file_name][1])
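
What clean_nans_from_list achieves on a whitespace-split cluster row (a toy line, not real FreeSurfer output): splitting on single spaces leaves empty strings that must be dropped before the row can be aligned with the header.

raw = '   1    4.5   2345   210.5  -30.1   12.0    8.2'
tokens = raw.split(' ')  # contains many '' entries
print([tok for tok in tokens if tok != ''])
# ['1', '4.5', '2345', '210.5', '-30.1', '12.0', '8.2']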
Example #10
class ANOVA_do():
    def __init__(self,
                 df,
                 params_y,
                 ls_cols4anova,
                 path2save,
                 p_thresh=0.05,
                 intercept_thresh=0.05,
                 print_not_FS=False):
        self.df = df
        self.params_y = params_y
        self.ls_cols4anova = ls_cols4anova
        self.sig_cols = dict()
        self.tab = Table()
        self.print_not_FS = print_not_FS
        self.fs_struc_meas = fs_definitions.GetFSStructureMeasurement()
        self.run_anova(p_thresh, intercept_thresh, path2save)

    def run_anova(self, p_thresh, intercept_thresh, path2save):
        ls_err = list()
        for param_y in self.params_y:
            x = np.array(self.df[param_y])
            df_result = self.tab.get_clean_df()
            df_result_list = df_result.copy()
            df_result[param_y] = ''
            df_result_list[param_y] = ''
            ix = 1
            ixx = 1
            # print(f'    analysing {len(self.ls_cols4anova)} features for parameter: {param_y}')
            for col in self.ls_cols4anova:
                y = np.array(self.df[col])
                data_tmp = pd.DataFrame({'x': x, col: y})
                model = ols(col + " ~ x", data=data_tmp).fit()
                if model.pvalues.Intercept < intercept_thresh and model.pvalues.x < p_thresh:
                    measurement, structure, ls_err = self.fs_struc_meas.get(
                        col, ls_err)
                    if param_y not in self.sig_cols:
                        self.sig_cols[param_y] = dict()
                    self.sig_cols[param_y][col] = {
                        'rsquared': model.rsquared,
                        'rsquared-adjusted': model.rsquared_adj,
                        'F-statistic': model.fvalue,
                        'AIC': model.aic,
                        'BIC': model.bic,
                        'pvalue_slope': model.pvalues.x,
                        'pvalue_intercept': model.pvalues.Intercept,
                        'tvalue_slope': model.tvalues.x,
                        'tvalue_intercept': model.tvalues.Intercept,
                        'meas': measurement,
                        'struct': structure
                    }
                    df_result_list = self.populate_df(
                        df_result_list, ixx, {
                            param_y: structure,
                            'measure': measurement,
                            'pvalue': '%.4f' % model.pvalues.x
                        })
                    if structure not in df_result[param_y].tolist():
                        df_result = self.populate_df(
                            df_result, ix, {
                                param_y: structure,
                                measurement: '%.4f' % model.pvalues.x
                            })
                        ix += 1
                    else:
                        row_ix = df_result.index[
                            df_result[param_y] == structure][0]
                        df_result = self.populate_df(
                            df_result, row_ix,
                            {measurement: '%.4f' % model.pvalues.x})
                    ixx += 1
            self.tab.save_df_tocsv(
                df_result_list,
                path.join(path2save, f'anova_per_significance_{param_y}.csv'))
            self.tab.save_df_tocsv(
                df_result,
                path.join(path2save, f'anova_per_structure_{param_y}.csv'))
        save_json(self.sig_cols,
                  path.join(path2save, 'anova_significant_features.json'))
        if self.print_not_FS:
            print('NOT freesurfer structures: ', ls_err)

    def populate_df(self, df, idx, cols_vals):
        for col in cols_vals:
            df.at[idx, col] = cols_vals[col]
        return df
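
The statsmodels attributes collected in run_anova, demonstrated on a toy fit:

import numpy as np
import pandas as pd
from statsmodels.formula.api import ols

rng = np.random.default_rng(1)
x = rng.normal(size=50)
df = pd.DataFrame({'x': x, 'y': 2 * x + rng.normal(size=50)})
model = ols('y ~ x', data=df).fit()
print(model.rsquared, model.rsquared_adj, model.fvalue, model.aic, model.bic)
print(model.pvalues.Intercept, model.pvalues.x, model.tvalues.x)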
Example #11
class CheckIfReady4GLM():
    def __init__(self, nimb_vars, fs_vars, proj_vars, f_ids_processed,
                 f_GLM_group, FS_GLM_dir):
        self.proj_vars = proj_vars
        self.vars_fs = fs_vars
        self.FS_SUBJECTS_DIR = fs_vars['SUBJECTS_DIR']
        self.NIMB_PROCESSED_FS = fs_vars['NIMB_PROCESSED']
        self.f_ids_processed = f_ids_processed
        self.f_GLM_group = f_GLM_group
        self.FS_GLM_dir = FS_GLM_dir
        self.archive_type = '.zip'
        self.tab = Table()
        self.miss = dict()
        self.ids_4fs_glm = dict()
        self.df = self.tab.get_df(self.f_GLM_group)
        self.bids_ids = self.df[self.proj_vars['id_col']].tolist()
        self.ids_exclude_glm = os.path.join(self.FS_GLM_dir,
                                            'excluded_from_glm.json')

    def chk_if_subjects_ready(self):

        fs_proc_ids = self.get_ids_processed()
        miss_bids_ids = [
            i for i in self.bids_ids if i not in fs_proc_ids.keys()
        ]
        if miss_bids_ids:
            print(
                f'    {len(miss_bids_ids)} IDs are missing from file: {self.f_ids_processed}'
            )
            print(f'        first 5 IDs are: {miss_bids_ids[:5]}')
            for bids_id in miss_bids_ids:
                self.add_to_miss(bids_id, 'id_missing')

        if len(miss_bids_ids) < len(fs_proc_ids.keys()):
            for bids_id in [
                    i for i in self.bids_ids if i not in miss_bids_ids
            ]:
                fs_proc_id = fs_proc_ids[bids_id].replace(
                    self.archive_type, '')
                if os.path.exists(os.path.join(self.FS_SUBJECTS_DIR, bids_id)):
                    self.ids_4fs_glm[bids_id] = bids_id
                    self.chk_glm_files(bids_id)
                elif os.path.exists(
                        os.path.join(self.FS_SUBJECTS_DIR, fs_proc_id)):
                    self.ids_4fs_glm[bids_id] = fs_proc_id
                    self.chk_glm_files(fs_proc_id)
                else:
                    print(f'id {bids_id} or freesurfer id {fs_proc_id} '
                          f'are missing from the {self.FS_SUBJECTS_DIR} folder')
                    self.add_to_miss(bids_id, 'id_missing')
            if self.miss.keys():
                print("    missing files and ids: ", self.miss)
                save_json(self.miss, self.ids_exclude_glm, print_space=8)
                subjs_missing = len(self.miss.keys())
                subjs_present = len(self.ids_4fs_glm.keys())
                print(f'    Number of participants ready for FreeSurfer GLM:')
                print(f'        in the folder: {self.FS_SUBJECTS_DIR}')
                print(f'        {subjs_present} present')
                print(f'        {subjs_missing} missing')
                not_ready = [
                    i for i in self.miss if "id_missing" not in self.miss[i]
                ]
                maybe_archived = [i for i in self.miss if i not in not_ready]
                if maybe_archived:
                    print("   MAYBE archived: ", maybe_archived)
                    q = "    EXCEPTION! Some IDs are missing, but they could be archived.\n\
                    Do you want to do glm analysis with current subjects (y) or try to check the archive (n) ? (y/n)\n\
                        (note: if you answer NO, you will be asked to unarchive the \n\
                        processed folders of IDs if they are present in FREESURFER_PROCESSED)"

                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, maybe_archived
                if not_ready:
                    print(
                        "    MISSING FILES: these participant CANNOT be included in the GLM analysis: ",
                        not_ready)
                    q = "    EXCEPTION! Some IDs have missing files and they MUST be excluded from analysis.\n\
                    Do you want to continue without excluded IDs ? (y/n)"

                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, not_ready
            else:
                self.create_fs_glm_df()
                return True, list()
        else:
            print('    no ids found')
            return False, list()

    def chk_glm_files(self, bids_id):
        '''it is expected that the BIDS IDs are located in FREESURFER -> SUBJECTS_DIR
            script checks if subjects are present
        Args:
            bids_id: ID of the subject to chk
        Return:
            populates list of missing subjects
            populates dict with ids
        '''
        files_not_ok = fs_definitions.ChkFSQcache(self.FS_SUBJECTS_DIR,
                                                  bids_id, self.vars_fs).miss
        if files_not_ok:
            for file in files_not_ok[bids_id]:
                self.add_to_miss(bids_id, file)
            return False
        else:
            return True

    def create_fs_glm_df(self):
        self.rm_missing_ids()
        tmp_id = 'fs_id'
        print('    creating the glm file for FreeSurfer GLM analysis')
        d_ids = {
            self.proj_vars['id_col']: list(self.ids_4fs_glm.keys()),
            tmp_id: list(self.ids_4fs_glm.values())
        }
        fs_proc_df = self.tab.create_df_from_dict(d_ids)
        fs_proc_df = self.tab.change_index(fs_proc_df,
                                           self.proj_vars['id_col'])
        grid_fs_df_pre = self.tab.change_index(self.df,
                                               self.proj_vars['id_col'])
        self.df_ids = self.tab.join_dfs(grid_fs_df_pre,
                                        fs_proc_df,
                                        how='outer')
        self.df_ids.rename(columns={tmp_id: self.proj_vars['id_col']},
                           inplace=True)
        self.df_ids = self.tab.change_index(self.df_ids,
                                            self.proj_vars['id_col'])
        self.tab.save_df(self.df_ids, self.f_GLM_group)
        PrepareForGLM(self.FS_SUBJECTS_DIR, self.FS_GLM_dir, self.f_GLM_group,
                      self.proj_vars, self.vars_fs)

    def rm_missing_ids(self):
        ls_ix_2rm = list()
        for ix in self.df.index:
            bids_id = self.df.at[ix, self.proj_vars['id_col']]
            if bids_id not in self.ids_4fs_glm.keys():
                ls_ix_2rm.append(ix)
        len_miss = len(ls_ix_2rm)
        if len_miss == 0:
            print('        ALL subjects are present')
        else:
            print(f'        {len_miss} subjects are missing')
            print('            they will be removed from further analysis')
        self.df = self.df.drop(ls_ix_2rm)

    def get_ids_processed(self):
        '''retrieves the BIDS names of the IDs provided in the GLM file.
            It is expected that each project has a group of subjects present in the dataset,
            that the BIDS names are the ones used for the ids in the groups_glm file,
            and that f_ids.json maps each BIDS name to the names of the corresponding
            source file / freesurfer / nilearn / dipy processed zip files;
            see nimb/example/f_ids.json
        '''
        print('    extracting list of ids that were processed with FreeSurfer')
        print(f'        in the file: {self.f_ids_processed}')
        self.ids_bids_proc_all = self.read_json(self.f_ids_processed)
        return {
            i: self.ids_bids_proc_all[i][DEFAULT.freesurfer_key]
            for i in self.ids_bids_proc_all
        }
        # return {i: 'path' for i in self.ids_bids_proc_all if self.ids_bids_proc_all[i]['source'] in ids_src_glm_file} #old version

    def add_to_miss(self, bids_id, file):
        '''add to the list of missing subjects
        '''
        if bids_id not in self.miss:
            self.miss[bids_id] = list()
        self.miss[bids_id].append(file)
        if bids_id in self.ids_4fs_glm:
            self.ids_4fs_glm.pop(bids_id, None)

    def read_json(self, f):
        '''read a json file
        '''
        with open(f, 'r') as jf:
            return json.load(jf)
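
A guessed shape for f_ids.json, based on the docstring of get_ids_processed (key names are illustrative; the real layout is in nimb/example/f_ids.json, and DEFAULT.freesurfer_key supplies the actual key):

ids_bids_proc_all = {
    'sub-001': {'freesurfer': 'sub-001_freesurfer.zip',
                'source': 'sub-001_source.zip'},
}
freesurfer_key = 'freesurfer'  # stands in for DEFAULT.freesurfer_key
print({i: ids_bids_proc_all[i][freesurfer_key] for i in ids_bids_proc_all})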
Example #12
def mkstatisticsf_Emmanuelle(
    df_4stats,
    groups,
    group_col,
):
    '''Creates a descriptive statistics table for publication,
        based on the provided pandas.DataFrame
        Works on 1, 2 or more groups
        author: version adjusted by Emmanuelle Mazur-Lainé 202206
    Args:
        df_4stats: pandas.DataFrame
        groups: list/tuple of groups as str/int
        group_col: column name in df_4stats that has the group names from groups
    Return:
        df_new: pandas.DataFrame with a two-level (test/parameter) column header
        stats_dic: dict with the computed statistics
        cols2color_sig: list of the column keys that hold p-values (used later for coloring)
    '''
    df_4stats = df_4stats.astype(float)

    tab = Table()

    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats, group_col,
                                                    group)

    stats_dic = dict()
    vals2chk = df_4stats.columns.tolist()

    if group_col in vals2chk:
        vals2chk.remove(group_col)

    cols2color_sig = list()
    groups = list(groups_df.keys())

    ################################
    if len(groups) == 1:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness')
    elif len(groups) <= 2:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'TTest', 'Welch',
                    'MannWhitneyu')
    elif len(groups) > 2:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'Bartlett',
                    'Kruskal', 'ANOVA')

    for test in ls_tests:
        for val in vals2chk:
            values_per_gr = []
            for i in range(0, len(groups)):
                gr_i = groups_df[groups[i]][val].values
                arr = np.array(gr_i)
                arr_without_nan = arr[np.logical_not(np.isnan(arr))]
                values_per_gr.append(arr_without_nan)

            results, params = get_stats_Emmanuelle(test, groups, values_per_gr)

            if test == 'mean':
                for i in range(len(groups)):
                    for tst in ('mean', 'std'):
                        results, params = get_stats_Emmanuelle(
                            tst, groups, values_per_gr)
                        key = f'{groups[i]}, {params}'
                        if key not in stats_dic:
                            stats_dic[key] = dict()
                        stats_dic[key][val] = f'{results[i]}'

            if test in ('kurtosis', 'skewness'):
                for i in range(len(groups)):
                    key = f'{groups[i]}, {params}'
                    if key not in stats_dic:
                        stats_dic[key] = dict()
                    stats_dic[key][val] = f'{results[i]}'

            elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu',
                          'Kruskal', 'ANOVA'):
                for i in range(len(groups)):
                    key1 = f'{test}, {params[0]}'
                    key2 = f'{test}, {params[1]}'
                    for key in (key1, key2):
                        if key not in stats_dic:
                            stats_dic[key] = dict()
                    stats_dic[key1][val] = f'{results[0]}'
                    stats_dic[key2][val] = f'{results[1]}'

                cols2color_sig.append(key2)

    df = tab.create_df_from_dict(stats_dic)
    df = df.astype(float)

    # Creating new adjusted DataFrame with sub-indexes
    ls_tests_dup = []
    ls_param = []
    ls_keys = list(stats_dic.keys())

    mean_gr_done = False

    for test in ls_tests:
        if test in ('mean', 'std'):
            for i in range(0, len(groups)):
                ls_tests_dup.append('mean/std')
            if not mean_gr_done:
                for i in range(len(groups)):
                    ls_param.append('gr' + str(i + 1) + ' (val=' +
                                    f'{groups[i]}' + ')')
                    ls_param.append('gr' + str(i + 1) + ' (val=' +
                                    f'{groups[i]}' + ')')
                    mean_gr_done = True

        elif test in ('kurtosis', 'skewness'):
            for i in range(0, len(groups)):
                ls_tests_dup.append(test)
                ls_param.append('gr' + str(i + 1) + ' (val=' + f'{groups[i]}' +
                                ')')

        elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu', 'Kruskal',
                      'ANOVA'):
            ls_tests_dup.append(test)
            ls_tests_dup.append(test)

    for key in ls_keys[4 * (len(groups)):]:
        ls_param.append((str(key))[-1])

    col = [ls_tests_dup, ls_param]
    tuples = list(zip(*col))

    df_new = pd.DataFrame(df.values,
                          index=pd.Index(df.index),
                          columns=pd.MultiIndex.from_tuples(tuples))

    df_new = df_new.round(3)

    return df_new, stats_dic, cols2color_sig
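
How the two-level header is assembled with pandas.MultiIndex.from_tuples, on toy labels and values:

import pandas as pd

cols = pd.MultiIndex.from_tuples([('mean/std', 'gr1'), ('mean/std', 'gr2'),
                                  ('TTest', 't'), ('TTest', 'p')])
df_new = pd.DataFrame([[3.1, 3.4, -2.1, 0.04]],
                      index=['Left-Hippocampus'], columns=cols)
print(df_new)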
Example #13
class RUN_stats():
    """will run statistical analysis for the provided groups file"""
    def __init__(self, all_vars):
        self.project = all_vars.params.project
        self.project_vars = all_vars.projects[self.project]
        self.stats_paths = self.project_vars['STATS_PATHS']
        self.stats_params = self.project_vars['STATS_PARAMS']
        self.group_col = self.project_vars['group_col']
        self.dir_stats_home = self.stats_paths["STATS_HOME"]
        self.atlas = ('DK', 'DS', 'DKDS')[1]  # the index selects the atlas; 1 = 'DS'
        self.get_steps(all_vars)

        print(
            f'    Performing statistical analysis in folder: {self.dir_stats_home}'
        )
        print('    materials located at: {:<50}'.format(
            self.project_vars['materials_DIR'][1]))
        print('    file for analysis: {:<50}'.format(
            self.project_vars['fname_groups']))
        print('    id column: {:<50}'.format(str(self.project_vars['id_col'])))
        print('    group column: {:<50}'.format(
            str(self.project_vars['group_col'])))
        # print('    variables to analyse: {:<50}'.format(str(self.project_vars['variables_for_glm'])))

        self.tab = Table()
        self.preproc = preprocessing.Preprocess()
        self.df_user_stats, self.df_final_grid,\
            self.df_adjusted,\
            self.cols_X,\
            self.groups = MakeGrid(self.project_vars).grid()

    def run(self):
        print("running")
        for step in self.steps:
            step2run = self.steps[step]['name']
            if self.steps[step]["run"]:
                print(f"    running step: {step2run}")
                self.run_step(step2run)

    def run_step(self, step2run):
        self.use_features = False
        self.feature_algo = 'PCA'  #'RFE'

        for group in [
                'all',
        ] + self.groups:  #'all' stands for all groups
            df_X, y_labeled, X_scaled, df_clin_group = self.get_X_data_per_group_all_groups(
                group)
            df_with_features, features, features_rfe_and_rank_df = self.get_features_df_per_group(
                group, X_scaled, y_labeled, df_X)

            if group == 'all':
                self.params_y = self.project_vars['variables_for_glm']

                # STEP run general stats
                if step2run == "STEP_stats_ttest":
                    from stats.stats_stats import ttest_do

                    variables = self.params_y + df_X.columns.tolist()
                    dir_2save = varia.get_dir(
                        path.join(self.dir_stats_home, group))
                    ttest_res = ttest_do(self.tab.join_dfs(
                        df_clin_group, df_X),
                                         self.group_col,
                                         variables,
                                         self.groups,
                                         dir_2save,
                                         p_thresh=0.05).res_ttest

                # STEP run ANOVA and Simple Linear Regression
                if step2run == "STEP_Anova":
                    from stats.stats_models import ANOVA_do
                    print('performing ANOVA')
                    sig_cols = self.run_anova(features, 0.05, 0.05)

                if step2run == "STEP_SimpLinReg":
                    print('performing Simple Linear Regression on all columns')
                    from stats.plotting import Make_Plot_Regression, Make_plot_group_difference
                    dir_2save = varia.get_dir(
                        self.stats_paths['simp_lin_reg_dir'])
                    param_features = self.run_anova(features, 1.0, 1.0)
                    Make_Plot_Regression(self.df_final_grid, param_features,
                                         self.group_col, dir_2save)
                    dir_2save = varia.get_dir(self.stats_paths['anova'])
                    Make_plot_group_difference(self.df_final_grid,
                                               param_features, self.group_col,
                                               self.groups, dir_2save)

                    # from stats.stats_groups_anova import RUN_GroupAnalysis_ANOVA_SimpleLinearRegression
                    # dir_2save = varia.get_dir(path.join(self.dir_stats_home,
                    #                                     self.stats_paths['anova']+"_"+group))
                    # RUN_GroupAnalysis_ANOVA_SimpleLinearRegression(self.df_final_grid,
                    #                                         groups,
                    #                                         self.params_y,
                    #                                         self.project_vars['other_params'],
                    #                                         dir_2save,
                    #                                         self.group_col,
                    #                                         features)

                # STEP run ANOVA and Simple Logistic Regression
                if step2run == "STEP_LogisticRegression":
                    from stats import stats_LogisticRegression
                    print('performing Logistic Regression for all groups')
                    dir_2save = varia.get_dir(
                        path.join(
                            self.dir_stats_home,
                            self.stats_paths['logistic_regression_dir'] + "_" +
                            group))
                    stats_LogisticRegression.Logistic_Regression(
                        X_scaled, y_labeled, self.group_col, dir_2save)

                # STEP run Prediction RF SKF
                if step2run == "STEP_Predict_RF_SKF":
                    print('    performing RF SKF Prediction for all groups')
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(
                                                         X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = predict.SKF_algorithm(
                        features, df_X_scaled[features].values, y_labeled)
                    print(
                        "    prediction accuracy computed with RF and SKF based on PCA features is: ",
                        accuracy)
                    # accuracy, best_estimator, average_score_list, _ = predict.SKF_algorithm(
                    #         features_rfe_and_rank_df.feature, df_X_scaled[features_rfe_and_rank_df.feature].values, y_labeled)
                    # print("prediction accuracy computed with RF and SKF based on RFE features is: ",accuracy)

                # STEP run Prediction RF LOO
                if step2run == "STEP_Predict_RF_LOO":
                    print(
                        'performing RF Leave-One-Out Prediction for all groups'
                    )
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(
                                                         X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = predict.LOO_algorithm(
                        features, df_X_scaled[features].values, y_labeled)
                    print(
                        "    prediction accuracy computed with RF and LOO based on PCA features is: ",
                        accuracy)
                    accuracy, best_estimator, average_score_list, _ = predict.LOO_algorithm(
                        features_rfe_and_rank_df.feature,
                        df_X_scaled[features_rfe_and_rank_df.feature].values,
                        y_labeled)
                    print(
                        "    prediction accuracy computed with RF and LOO based on RFE features is: ",
                        accuracy)

            else:
                # run Descriptive Statistics
                dir_2save = varia.get_dir(
                    path.join(self.dir_stats_home, 'description'))
                self.run_descriptive_stats(df_clin_group, features, dir_2save)

                # STEP run Linear Regression Moderation
                if step2run == "STEP_LinRegModeration":
                    from stats import stats_models
                    print('performing Linear Regression Moderation analysis')
                    stats_models.linreg_moderation_results(
                        self.df_final_grid, features,
                        self.project_vars['group_param'],
                        self.project_vars['regression_param'],
                        varia.get_dir(
                            path.join(
                                self.dir_stats_home,
                                self.stats_paths['linreg_moderation_dir'])),
                        group)

                # STEP run Laterality
                if step2run == "STEP_Laterality":
                    from processing.atlases.atlas_definitions import RReplace
                    from stats import stats_laterality
                    print('performing Laterality analysis')
                    lhrh_feat_d = RReplace(features).contralateral_features
                    lhrh_features_list = [i for i in lhrh_feat_d.keys()] + [
                        v for v in lhrh_feat_d.values()
                    ]
                    df_with_features_lhrh = self.tab.get_df_from_df(
                        df_X, usecols=sorted(lhrh_features_list))
                    stats_laterality.LateralityAnalysis(
                        df_with_features_lhrh, lhrh_feat_d, group,
                        varia.get_dir(
                            path.join(
                                self.dir_stats_home,
                                self.stats_paths['laterality_dir']))).run()

    def run_descriptive_stats(self, df_clin_group, features, dir_2save):
        print('running descriptive statistics')

    def run_anova(self, features, p_thresh, intercept_thresh):
        from stats.stats_models import ANOVA_do
        dir_2save = varia.get_dir(self.stats_paths['anova'])
        return ANOVA_do(self.df_final_grid,
                        self.params_y,
                        features,
                        dir_2save,
                        p_thresh=p_thresh,
                        intercept_thresh=intercept_thresh).sig_cols

    def get_X_data_per_group_all_groups(self, group):
        # extract X_scaled values for the brain parameters
        predicted_target = self.project_vars["prediction_target"]
        print(f"    predicted target column is: {predicted_target}")
        if not predicted_target:
            predicted_target = self.group_col
        if group == 'all':
            df_clin_group = self.df_user_stats
            df_X = self.df_adjusted
            y_labeled = preprocessing.label_y(self.df_user_stats,
                                              predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        else:
            df_group = self.tab.get_df_per_parameter(self.df_final_grid,
                                                     self.group_col, group)
            df_clin_group = self.tab.rm_cols_from_df(df_group, self.cols_X)
            df_X = self.tab.rm_cols_from_df(
                df_group,
                [i for i in df_group.columns.tolist() if i not in self.cols_X])
            y_labeled = preprocessing.label_y(df_group, predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        return df_X, y_labeled, X_scaled, df_clin_group

    def log(self):
        stats = predict.get_stats_df(
            len(self.cols_X), self.atlas,
            self.stats_params["prediction_vars"]['nr_threads'],
            definitions.sys.platform,
            time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))

    def get_features_df_per_group(self, group, X_scaled, y_labeled, df_X):
        features_rfe_and_rank_df = 'none'
        if self.use_features:
            if self.feature_algo == 'PCA':  # using PCA
                dir_2save = varia.get_dir(
                    path.join(self.dir_stats_home,
                              self.stats_paths['features']))
                pca_threshold = self.stats_params["prediction_vars"][
                    'pca_threshold']
                features = predict.get_features_based_on_pca(
                    dir_2save, pca_threshold, X_scaled, self.cols_X, group,
                    self.atlas)
            elif self.feature_algo == 'RFE':  # using RFE
                features, features_rfe_and_rank_df = predict.feature_ranking(
                    X_scaled, y_labeled, self.cols_X)
                print("    number of features extracted by RFE: ",
                      len(features_rfe_and_rank_df.feature))
            df_with_features = self.tab.get_df_from_df(df_X, usecols=features)
        else:
            df_with_features = self.tab.get_df_from_df(df_X,
                                                       usecols=self.cols_X)
            features = self.cols_X
        return df_with_features, features, features_rfe_and_rank_df

    def get_steps(self, all_vars):
        self.steps = {
            "groups": {
                "name": "STEP0_make_groups",
                "run": False
            },
            "ttest": {
                "name": "STEP_stats_ttest",
                "run": False
            },
            "anova": {
                "name": "STEP_Anova",
                "run": False
            },
            "simplinreg": {
                "name": "STEP_SimpLinReg",
                "run": False
            },
            "logreg": {
                "name": "STEP_LogisticRegression",
                "run": False
            },
            "predskf": {
                "name": "STEP_Predict_RF_SKF",
                "run": False
            },
            "predloo": {
                "name": "STEP_Predict_RF_LOO",
                "run": False
            },
            "linregmod": {
                "name": "STEP_LinRegModeration",
                "run": False
            },
            "laterality": {
                "name": "STEP_Laterality",
                "run": False
            },
        }
        if all_vars.params.step == 0:
            for i in ("groups", "ttest", "anova", "simplinreg", "logreg",
                      "predskf", "predloo", "linregmod", "laterality"):
                self.steps[i]["run"] = True
        else:
            self.steps[all_vars.params.step]["run"] = True
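
A standalone mimic of the step switch in get_steps: step 0 enables every step, any other value enables only the named step (step names shortened here):

steps = {key: {"run": False} for key in ("groups", "ttest", "anova")}
requested = "anova"  # stands in for all_vars.params.step
if requested == 0:
    for key in steps:
        steps[key]["run"] = True
else:
    steps[requested]["run"] = True
print([key for key in steps if steps[key]["run"]])  # ['anova']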