Пример #1
0
class ttest_do():
    def __init__(self,
                 df,
                 group_col,
                 ls_cols,
                 groups,
                 path_save_res,
                 p_thresh=0.05):
        self.df = df
        self.group_col = group_col
        self.ls_cols = ls_cols
        self.groups = groups
        self.path_save_res = path_save_res
        self.ls_meas = get_names_of_measurements()
        self.ls_struct = get_names_of_structures()
        self.res_ttest = self.compute_ttest_for_col(p_thresh)
        self.tab = Table()

    def compute_ttest_for_col(self, p_thresh):
        res_4df = {'features': [], 'ttest': [], 'welch': []}
        res = dict()
        for col in self.ls_cols:
            group1 = self.df[self.df[self.group_col] == self.groups[0]][col]
            group2 = self.df[self.df[self.group_col] == self.groups[1]][col]
            ttest_eq_pop_var = stats.ttest_ind(group1, group2, equal_var=True)
            ttest_welch = stats.ttest_ind(group1, group2, equal_var=False)
            if ttest_eq_pop_var[1] < p_thresh:
                meas, struct = get_structure_measurement(
                    col, self.ls_meas, self.ls_struct)
                #print('{:<15} {}'.format(meas, struct))
                res[col] = {
                    '{}, mean'.format(self.groups[0]): stats.tmean(group1),
                    '{}, std'.format(self.groups[1]): stats.tstd(group2),
                    '{}, mean'.format(self.groups[1]): stats.tmean(group2),
                    '{}, std'.format(self.groups[1]): stats.tstd(group2),
                    'ttest': ttest_eq_pop_var[1],
                    'welch': ttest_welch[1],
                    'kurtosis': stats.kurtosis(self.df[self.group_col]),
                    'skewness': stats.skew(self.df[self.group_col])
                }
                res_4df['features'].append(struct + ' (' + meas + ')')
                res_4df['ttest'].append(ttest_eq_pop_var[1])
                res_4df['welch'].append(ttest_welch[1])
        self.save_res(res_4df)
        return res

    def save_res(self, res_4df):
        df_result = self.tab.create_df_from_dict(res_4df)
        df_result.to_csv(os.path.join(self.path_save_res, 'ttest.csv'))
Пример #2
0
class ClusterFile2CSV():

    def __init__(self,
                file_abspath,
                result_abspath):
        from stats.db_processing import Table
        self.contrasts = fs_definitions.GLMcontrasts['contrasts']
        self.get_explanations()

        self.col_4constrasts = "Contrast"
        self.header = ("ClusterNo",
                      "Max", "VtxMax", "Size(mm^2)", 
                      "TalX", "TalY", "TalZ",
                      "CWP", "CWPLow", "CWPHi",
                      "NVtxs", "WghtVtx",
                      "Annot", self.col_4constrasts, "Explanation")
        self.length_matrix = len(self.header)
        self.content = open(file_abspath, 'r').readlines()
        self.result_abspath = result_abspath
        self.tab = Table()
        self.ls_vals_2chk = self.contrasts.keys()
        self.run()

    def run(self):
        d = dict()
        i = 0

        while i < len(self.content):
            line = self.content[i].replace('\n','')
            if self.chk_if_vals_in_line(line):
                expl = self.content[i+1].replace('\n','').replace(';','.')
                d[i] = ['','','','','','','','','','','','','', line, expl,]
                i += 2
            else:
                line = self.clean_nans_from_list(line.split(' '))
                i += 1
                if len(line) != 0:
                    d[i] = line + ['','']
        self.save_2table(d)

    def save_2table(self, d):
        df = self.tab.create_df_from_dict(d).T
        column_names = {i[0]:i[1] for i in list(zip(df.columns, self.header))}
        df = df.rename(columns = column_names)
        df = df.set_index(df[self.col_4constrasts])
        df = df.drop(columns = [self.col_4constrasts])
        self.tab.save_df(df, self.result_abspath)

    def chk_if_vals_in_line(self, line):
        '''will use each value from self.ls_vals_2chk
            if present in the line:
            will return True and break
            else: return False
        '''
        exists     = False

        for val_2chk in self.ls_vals_2chk:
            if val_2chk in line:
                exists = True
                break
        return exists

    def clean_nans_from_list(self, ls):
        for i in ls[::-1]:
            if i == '':
                ls.remove(i)
        return ls

    def get_explanations(self):
        self.explanations = list()
        for key in self.contrasts:
            for file_name in self.contrasts[key]:
                self.explanations.append(self.contrasts[key][file_name][1])
Пример #3
0
class CheckIfReady4GLM():
    def __init__(self, nimb_vars, fs_vars, proj_vars, f_ids_processed,
                 f_GLM_group, FS_GLM_dir):
        self.proj_vars = proj_vars
        self.vars_fs = fs_vars
        self.FS_SUBJECTS_DIR = fs_vars['SUBJECTS_DIR']
        self.NIMB_PROCESSED_FS = fs_vars['NIMB_PROCESSED']
        self.f_ids_processed = f_ids_processed
        self.f_GLM_group = f_GLM_group
        self.FS_GLM_dir = FS_GLM_dir
        self.archive_type = '.zip'
        self.tab = Table()
        self.miss = dict()
        self.ids_4fs_glm = dict()
        self.df = self.tab.get_df(self.f_GLM_group)
        self.bids_ids = self.df[self.proj_vars['id_col']].tolist()
        self.ids_exclude_glm = os.path.join(self.FS_GLM_dir,
                                            'excluded_from_glm.json')

    def chk_if_subjects_ready(self):

        fs_proc_ids = self.get_ids_processed()
        miss_bids_ids = [
            i for i in self.bids_ids if i not in fs_proc_ids.keys()
        ]
        if miss_bids_ids:
            print(
                f'    {len(miss_bids_ids)} IDs are missing from file: {self.f_ids_processed}'
            )
            print(f'        first 5 IDs are: {self.f_ids_processed[:5]}')
            for bids_id in miss_bids_ids:
                self.add_to_miss(bids_id, 'id_missing')

        if len(miss_bids_ids) < len(fs_proc_ids.keys()):
            for bids_id in [
                    i for i in self.bids_ids if i not in miss_bids_ids
            ]:
                fs_proc_id = fs_proc_ids[bids_id].replace(
                    self.archive_type, '')
                if os.path.exists(os.path.join(self.FS_SUBJECTS_DIR, bids_id)):
                    self.ids_4fs_glm[bids_id] = bids_id
                    self.chk_glm_files(bids_id)
                elif os.path.exists(
                        os.path.join(self.FS_SUBJECTS_DIR, fs_proc_id)):
                    self.ids_4fs_glm[bids_id] = fs_proc_id
                    self.chk_glm_files(fs_proc_id)
                else:
                    print(f'id {bids_id} or freesurfer id {fs_proc_id} \
                        are missing from the {self.FS_SUBJECTS_DIR} folder')
                    self.add_to_miss(bids_id, 'id_missing')
            if self.miss.keys():
                print("    missing files and ids: ", self.miss)
                save_json(self.miss, self.ids_exclude_glm, print_space=8)
                subjs_missing = len(self.miss.keys())
                subjs_present = len(self.ids_4fs_glm.keys())
                print(f'    Number of participants ready for FreeSurfer GLM:')
                print(f'        in the folder: {self.FS_SUBJECTS_DIR}')
                print(f'        {subjs_present} present')
                print(f'        {subjs_missing} missing')
                not_ready = [
                    i for i in self.miss if "id_missing" not in self.miss[i]
                ]
                maybe_archived = [i for i in self.miss if i not in not_ready]
                if maybe_archived:
                    print("   MAYBE archived: ", maybe_archived)
                    q = "    EXCEPTION! Some IDs are missing, but they could be archived.\n\
                    Do you want to do glm analysis with current subjects (y) or try to check the archive (n) ? (y/n)\n\
                        (note: if you answer NO, you will be asked to unarchive the \n\
                        processed folders of IDs if they are present in FREESURFER_PROCESSED)"

                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, maybe_archived
                if not_ready:
                    print(
                        "    MISSING FILES: these participant CANNOT be included in the GLM analysis: ",
                        not_ready)
                    q = "    EXCEPTION! Some IDs have missing files and they MUST be excluded from analysis.\n\
                    Do you want to continue without excluded IDs ? (y/n)"

                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, not_ready
            else:
                self.create_fs_glm_df()
                return True, list()
        else:
            print('    no ids found')
            return False, list()

    def chk_glm_files(self, bids_id):
        '''it is expected that the BIDS IDs are located in FREESURFER -> SUBJECTS_DIR
            script checks if subjects are present
        Args:
            bids_id: ID of the subject to chk
        Return:
            populates list of missing subjects
            populates dict with ids
        '''
        files_not_ok = fs_definitions.ChkFSQcache(self.FS_SUBJECTS_DIR,
                                                  bids_id, self.vars_fs).miss
        if files_not_ok:
            for file in files_not_ok[bids_id]:
                self.add_to_miss(bids_id, file)
            return False
        else:
            return True

    def create_fs_glm_df(self):
        self.rm_missing_ids()
        tmp_id = 'fs_id'
        print('    creating the glm file for FreeSurfer GLM analysis')
        d_ids = {
            self.proj_vars['id_col']:
            [i for i in list(self.ids_4fs_glm.keys())],
            tmp_id: [i for i in list(self.ids_4fs_glm.values())]
        }
        fs_proc_df = self.tab.create_df_from_dict(d_ids)
        fs_proc_df = self.tab.change_index(fs_proc_df,
                                           self.proj_vars['id_col'])
        grid_fs_df_pre = self.tab.change_index(self.df,
                                               self.proj_vars['id_col'])
        self.df_ids = self.tab.join_dfs(grid_fs_df_pre,
                                        fs_proc_df,
                                        how='outer')
        self.df_ids.rename(columns={tmp_id: self.proj_vars['id_col']},
                           inplace=True)
        self.df_ids = self.tab.change_index(self.df_ids,
                                            self.proj_vars['id_col'])
        self.tab.save_df(self.df_ids, self.f_GLM_group)
        PrepareForGLM(self.FS_SUBJECTS_DIR, self.FS_GLM_dir, self.f_GLM_group,
                      self.proj_vars, self.vars_fs)

    def rm_missing_ids(self):
        ls_ix_2rm = list()
        for ix in self.df.index:
            bids_id = self.df.at[ix, self.proj_vars['id_col']]
            if bids_id not in self.ids_4fs_glm.keys():
                ls_ix_2rm.append(ix)
        len_miss = len(ls_ix_2rm)
        if len_miss == 0:
            print(f'        ALL subjects are present')
        else:
            print(f'        {len_miss} subjects are missing')
            print(f'            they will be removed from futher analysis')
        self.df = self.df.drop(ls_ix_2rm)

    def get_ids_processed(self):
        '''retrieves the bids names of the IDs provided in the GLM file.
            It is expected that each project had a group of subjects that are present in the dataset
            it is expected that BIDS names are the ones used in the groups_glm file for the ids
            the f_ids.json has the BIDS names of the subjects, and for each BIDS name
            has the corresponding names of the source file/freesurfer/nilearn/dipy processed ziped files
            see nimb/example/f_ids.json
        '''
        print('    extracting list of ids that were processed with FreeSurfer')
        print(f'        in the file{self.f_ids_processed}')
        self.ids_bids_proc_all = self.read_json(self.f_ids_processed)
        return {
            i: self.ids_bids_proc_all[i][DEFAULT.freesurfer_key]
            for i in self.ids_bids_proc_all
        }
        # return {i: 'path' for i in self.ids_bids_proc_all if self.ids_bids_proc_all[i]['source'] in ids_src_glm_file} #old version

    def add_to_miss(self, bids_id, file):
        '''add to the list of missing subjects
        '''
        if bids_id not in self.miss:
            self.miss[bids_id] = list()
        self.miss[bids_id].append(file)
        if bids_id in self.ids_4fs_glm:
            self.ids_4fs_glm.pop(bids_id, None)

    def read_json(self, f):
        '''read a json file
        '''
        with open(f, 'r') as jf:
            return json.load(jf)
Пример #4
0
def mkstatisticsf_Emmanuelle(
    df_4stats,
    groups,
    group_col,
):
    '''Creates discriptive statistical file for publication,
        based on provided pandas.DataFrame
        Works only on 2 groups
        author: version adjusted by Emmanuelle Mazur-Lainé 202206
    Args: df_4stats: pandas.DataFrame
        group: list/ tuple of groups as str/int
        group_col: column name in df_4stats that has the group names from group
        path_2save: abspath to save the descriptive files
        make_with_colors: will create an additional .xlsx file with 
                        colored significant results,
                        provided xlwt is installed
    Return:
        json file with results
        .csv file with results
        .xlsx file with results with red colored significant ones
    '''
    df_4stats = df_4stats.astype(float)

    tab = Table()

    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats, group_col,
                                                    group)

    stats_dic = dict()
    vals2chk = df_4stats.columns.tolist()

    if group_col in vals2chk:
        vals2chk.remove(group_col)

    cols2color_sig = list()
    groups = list(groups_df.keys())

    ################################
    if len(groups) == 1:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness')
    elif len(groups) <= 2:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'TTest', 'Welch',
                    'MannWhitneyu')
    elif len(groups) > 2:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'Bartlett',
                    'Kruskal', 'ANOVA')

    for test in ls_tests:
        for val in vals2chk:
            values_per_gr = []
            for i in range(0, len(groups)):
                gr_i = groups_df[groups[i]][val].values
                arr = np.array(gr_i)
                arr_without_nan = arr[np.logical_not(np.isnan(arr))]
                values_per_gr.append(arr_without_nan)

            results, params = get_stats_Emmanuelle(test, groups, values_per_gr)

            if test == 'mean':
                for i in range(len(groups)):
                    for tst in ('mean', 'std'):
                        results, params = get_stats_Emmanuelle(
                            tst, groups, values_per_gr)
                        key = f'{groups[i]}, {params}'
                        if key not in stats_dic:
                            stats_dic[key] = dict()
                        stats_dic[key][val] = f'{results[i]}'

            if test in ('kurtosis', 'skewness'):
                for i in range(len(groups)):
                    key = f'{groups[i]}, {params}'
                    if key not in stats_dic:
                        stats_dic[key] = dict()
                    stats_dic[key][val] = f'{results[i]}'

            elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu',
                          'Kruskal', 'ANOVA'):
                for i in range(len(groups)):
                    key1 = f'{test}, {params[0]}'
                    key2 = f'{test}, {params[1]}'
                    for key in (key1, key2):
                        if key not in stats_dic:
                            stats_dic[key] = dict()
                    stats_dic[key1][val] = f'{results[0]}'
                    stats_dic[key2][val] = f'{results[1]}'

                cols2color_sig.append(key2)

    df = tab.create_df_from_dict(stats_dic)
    df = df.astype(float)

    # Creating new adjusted DataFrame with sub-indexes
    ls_tests_dup = []
    ls_param = []
    ls_keys = list(stats_dic.keys())

    mean_gr_done = False

    for test in ls_tests:
        if test in ('mean', 'std'):
            for i in range(0, len(groups)):
                ls_tests_dup.append('mean/std')
            if mean_gr_done == False:
                for i in range(len(groups)):
                    ls_param.append('gr' + str(i + 1) + ' (val=' +
                                    f'{groups[i]}' + ')')
                    ls_param.append('gr' + str(i + 1) + ' (val=' +
                                    f'{groups[i]}' + ')')
                    mean_gr_done = True

        elif test in ('kurtosis', 'skewness'):
            for i in range(0, len(groups)):
                ls_tests_dup.append(test)
                ls_param.append('gr' + str(i + 1) + ' (val=' + f'{groups[i]}' +
                                ')')

        elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu', 'Kruskal',
                      'ANOVA'):
            ls_tests_dup.append(test)
            ls_tests_dup.append(test)

    for key in ls_keys[4 * (len(groups)):]:
        ls_param.append((str(key))[-1])

    col = [ls_tests_dup, ls_param]
    tuples = list(zip(*col))

    df_new = pd.DataFrame(df.values,
                          index=pd.Index(df.index),
                          columns=pd.MultiIndex.from_tuples(tuples))

    df_new = df_new.round(3)

    return df_new, stats_dic, cols2color_sig
Пример #5
0
def mkstatisticsf(df_4stats,
                  groups,
                  group_col,
                  path2save,
                  make_with_colors=True):
    '''Creates discriptive statistical file for publication,
        based on provided pandas.DataFrame
        Works only on 2 groups
    Args: df_4stats: pandas.DataFrame
        group: list/ tuple of groups as str/int
        group_col: str() column name in df_4stats that has the group names from group
        path_2save: abspath to save the descrptive files
        make_with_colors: will create an additional .xlsx file with 
                        colored significant results,
                        provided xlwt is installed
    Return:
        json file with results
        .csv file with results
        .xlsx file with results with red colored significant ones
    '''

    tab = Table()
    ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'TTest', 'Welch',
                'ANOVA', 'Bartlett', 'MannWhitneyu', 'Kruskal')

    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats, group_col,
                                                    group)

    stats_dic = dict()
    vals2chk = df_4stats.columns.tolist()
    if group_col in vals2chk:
        vals2chk.remove(group_col)

    cols2color_sig = list()
    groups = list(groups_df.keys())
    group1 = groups_df[groups[0]]
    group2 = groups_df[groups[1]]
    for test in ls_tests:
        for val in vals2chk:
            results, params = get_stats(test, group1[val], group2[val])
            if test in ('mean', 'std', 'kurtosis', 'skewness'):
                key1 = f'{groups[0]}, {params[0]}'
                key2 = f'{groups[1]}, {params[0]}'
            else:
                key1 = f'{test}, {params[0]}'
                key2 = f'{test}, {params[1]}'
                cols2color_sig.append(key2)
            for key in (key1, key2):
                if key not in stats_dic:
                    stats_dic[key] = dict()
            stats_dic[key1][val] = f'{results[0]}'
            stats_dic[key2][val] = f'{results[1]}'

    df = tab.create_df_from_dict(stats_dic)
    tab.save_df(df,
                os.path.join(path2save, 'stats_general.csv'),
                sheet_name='stats')
    utilities.save_json(stats_dic, os.path.join(path2save,
                                                'stats_general.json'))
    if make_with_colors:
        save_2xlsx_with_colors(df,
                               path2save=path2save,
                               cols2color_sig=cols2color_sig)