Example #1
def transfer_scores(data, results, rotate='oblimin'):
    """ calculates factor scores in a new dataset based on a reference results object """
    ref_data = results.data
    EFA = results.EFA
    c = EFA.results['num_factors']
    loadings = EFA.get_loading(c=c, rotate=rotate)
    # transform data
    positive_skewed = [
        i.replace('.logTr', '') for i in ref_data.columns if ".logTr" in i
    ]
    negative_skewed = [
        i.replace('.ReflogTr', '') for i in ref_data.columns
        if ".ReflogTr" in i
    ]
    DVs = [
        i.replace('.logTr', '').replace('.ReflogTr', '')
        for i in ref_data.columns
    ]
    data = data.loc[:, DVs]
    print('using correct transfer_scores')
    data = transform_remove_skew(data,
                                 positive_skewed=positive_skewed,
                                 negative_skewed=negative_skewed,
                                 drop_failed=False)
    data = remove_outliers(data)
    data_imputed, error = missForest(data)
    subset = data_imputed.loc[:, loadings.index]
    scaled_data = scale(subset)
    # calculate scores
    weights = get_attr(EFA.results['factor_tree_Rout_%s' % rotate][c],
                       'weights')
    scores = pd.DataFrame(scaled_data.dot(weights),
                          index=data_imputed.index,
                          columns=loadings.columns)
    return scores
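The heart of the scoring step above is just column-wise standardization followed by a matrix product with the factor weight matrix. Here is a self-contained toy sketch of that computation with synthetic data; the variable names v1-v4 and factor names F1/F2 are made up for illustration and are not part of the project.

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

# toy data standing in for imputed behavioral variables (rows = subjects)
rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(20, 4)),
                    columns=['v1', 'v2', 'v3', 'v4'],
                    index=['s%02i' % i for i in range(20)])
weights = rng.normal(size=(4, 2))    # stand-in for the EFA weights: 4 variables x 2 factors
scaled = scale(data)                 # column-wise z-scoring, as in transfer_scores
scores = pd.DataFrame(scaled.dot(weights),
                      index=data.index,
                      columns=['F1', 'F2'])
print(scores.head())
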
def get_retest_comparison_data():
    """ assembles one DataFrame from the noDDM, EZ and hddm variable subsets """
    subsets = ['meaningful_variables_noDDM.csv', 'meaningful_variables_EZ.csv',
               'meaningful_variables_hddm.csv']
    dataset = pd.DataFrame()
    for subset in subsets:
        df = get_behav_data(file=subset)
        df_clean = remove_outliers(df)
        df_clean = transform_remove_skew(df_clean)
        drop_columns = set(dataset) & set(df_clean)
        df_clean.drop(labels=drop_columns, axis=1, inplace=True)
        dataset = pd.concat([dataset, df_clean], axis=1)
    return dataset
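The drop_columns step exists so that variables shared between the three CSV subsets are kept only once when the frames are concatenated column-wise. A minimal illustration of the same pattern with toy frames:

import pandas as pd

dataset = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
new_df = pd.DataFrame({'b': [5, 6], 'c': [7, 8]})
drop_columns = set(dataset) & set(new_df)            # columns already collected: {'b'}
new_df = new_df.drop(labels=list(drop_columns), axis=1)
dataset = pd.concat([dataset, new_df], axis=1)
print(dataset.columns.tolist())                      # ['a', 'b', 'c']
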
def calc_EFA_retest_held_out(results, rotate='oblimin', verbose=True):
    """ compares T1 and T2 factor scores for retest subjects, using EFA weights
    refit on the subjects held out of the retest sample """
    name = results.ID.split('_')[0].title()
    orig_data = results.data
    positive_skewed = [i.replace('.logTr', '') for i in orig_data.columns if ".logTr" in i]
    negative_skewed = [i.replace('.ReflogTr', '') for i in orig_data.columns if ".ReflogTr" in i]
    DVs = [i.replace('.logTr','').replace('.ReflogTr','') for i in orig_data.columns]
    orig_scores = results.EFA.get_scores(rotate=rotate)
    
    # load and clean retest data exactly like original data
    data_raw = get_behav_data(dataset=results.dataset,
                              file='meaningful_variables.csv')
    retest_data_raw = get_behav_data(dataset=results.dataset.replace('Complete','Retest'),
                                     file='meaningful_variables.csv')
    # use a sorted list: newer pandas rejects sets in .loc and as an index
    shared_ids = sorted(set(retest_data_raw.index) & set(data_raw.index))
    data_raw = data_raw.loc[shared_ids, :]
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    raw_data = {'T1': data_raw, 'T2': retest_data_raw}
    imputed_data = {}
    for name, data in raw_data.items():  
        tmp_data = data.loc[:, DVs]
        tmp_data = transform_remove_skew(tmp_data,
                                         positive_skewed=positive_skewed,
                                         negative_skewed=negative_skewed)
        tmp_data = remove_outliers(tmp_data)
        tmp_data_imputed, error = missForest(tmp_data)  
        scaled_tmp_data = scale(tmp_data_imputed)
        imputed_data[name] = scaled_tmp_data
    
    # get subjects not in the retest set
    ind_data = orig_data.loc[sorted(set(orig_data.index) - set(shared_ids))]
    fa, output = psychFA(ind_data, results.EFA.results['num_factors'], 
                         method='ml', rotate=rotate)
    weights = get_attr(fa, 'weights')
    scores = {}
    for name, data in imputed_data.items():
        suffix = ''
        if name == 'T2':
            suffix = 'T2'
        tmp_scores = pd.DataFrame(data.dot(weights),
                                  index=shared_ids,
                                  columns=[i+' '+suffix for i in orig_scores.columns])
        scores[name] = tmp_scores
    combined = pd.concat([scores['T1'], scores['T2']], axis=1)
    cross_diag = [combined.corr().iloc[i, i + len(orig_scores.columns)]
                  for i in range(len(orig_scores.columns))]
    # get ICCs
    ICCs = []
    for col in scores['T1'].columns:
        tmp = combined.filter(regex=col)
        out = psych.ICC(tmp)
        ICCs.append(list(out[0][1])[-1])
    return combined, cross_diag, ICCs, (fa, output)
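cross_diag holds the per-factor test-retest correlations: with T1 and T2 scores concatenated column-wise, the correlation between factor k at T1 and the same factor at T2 sits at position [k, k + n_factors] of the full correlation matrix. A self-contained toy sketch with synthetic scores and made-up factor names:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
t1 = pd.DataFrame(rng.normal(size=(50, 3)), columns=['F1', 'F2', 'F3'])
t2 = t1 * 0.8 + rng.normal(scale=0.5, size=(50, 3))   # noisy "retest" of the same scores
t2.columns = ['F1 T2', 'F2 T2', 'F3 T2']
combined = pd.concat([t1, t2], axis=1)
n_factors = t1.shape[1]
cross_diag = [combined.corr().iloc[i, i + n_factors] for i in range(n_factors)]
print(np.round(cross_diag, 2))
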
Example #4
def get_retest_comparison_data():
    subsets = [
        'meaningful_variables_noDDM.csv', 'meaningful_variables_EZ.csv',
        'meaningful_variables_hddm.csv'
    ]
    dataset = pd.DataFrame()
    for subset in subsets:
        df = get_behav_data(file=subset)
        df_clean = remove_outliers(df)
        df_clean = transform_remove_skew(df_clean)
        drop_columns = set(dataset) & set(df_clean)
        df_clean.drop(labels=drop_columns, axis=1, inplace=True)
        dataset = pd.concat([dataset, df_clean], axis=1)
    return dataset
def transfer_scores(data, results, rotate='oblimin'):
    """ calculates factor scores in a new dataset based on a reference results object """
    ref_data = results.data
    EFA = results.EFA
    c = EFA.results['num_factors']
    loadings = EFA.get_loading(c=c, rotate=rotate)
    # transform data
    positive_skewed = [i.replace('.logTr', '') for i in ref_data.columns if ".logTr" in i]
    negative_skewed = [i.replace('.ReflogTr', '') for i in ref_data.columns if ".ReflogTr" in i]
    DVs = [i.replace('.logTr','').replace('.ReflogTr','') for i in ref_data.columns]
    data = data.loc[:, DVs]
    data = transform_remove_skew(data,
                                 positive_skewed=positive_skewed,
                                 negative_skewed=negative_skewed)
    data = remove_outliers(data)
    data_imputed, error = missForest(data)
    subset = data_imputed.loc[:, loadings.index]
    scaled_data = scale(subset)
    # calculate scores
    weights = get_attr(EFA.results['factor_tree_Rout_%s' % rotate][c], 'weights')
    scores = pd.DataFrame(scaled_data.dot(weights),
                          index=data_imputed.index,
                          columns=loadings.columns)
    return scores
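The '.logTr' and '.ReflogTr' suffixes stripped at the top of these functions are produced by transform_remove_skew. A hedged, self-contained sketch of what such a transform plausibly does; this is an assumption based only on the suffix handling visible here, not the project's actual implementation, and sketch_transform_remove_skew plus the demo columns are made up:

import numpy as np
import pandas as pd

def sketch_transform_remove_skew(df, positive_skewed, negative_skewed):
    # assumed behavior: log-transform positively skewed variables, reflect and
    # log-transform negatively skewed ones, and record the change in the name
    out = df.copy()
    for col in positive_skewed:
        out[col + '.logTr'] = np.log(out[col] - out[col].min() + 1)
        out = out.drop(columns=col)
    for col in negative_skewed:
        out[col + '.ReflogTr'] = np.log(out[col].max() - out[col] + 1)
        out = out.drop(columns=col)
    return out

demo = pd.DataFrame({'rt': np.random.exponential(size=100),
                     'acc': 1 - np.random.exponential(scale=0.1, size=100)})
print(sketch_transform_remove_skew(demo, ['rt'], ['acc']).columns.tolist())
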
def calc_EFA_retest_held_out(results, rotate='oblimin', verbose=True):
    name = results.ID.split('_')[0].title()
    orig_data = results.data
    positive_skewed = [
        i.replace('.logTr', '') for i in orig_data.columns if ".logTr" in i
    ]
    negative_skewed = [
        i.replace('.ReflogTr', '') for i in orig_data.columns
        if ".ReflogTr" in i
    ]
    DVs = [
        i.replace('.logTr', '').replace('.ReflogTr', '')
        for i in orig_data.columns
    ]
    orig_scores = results.EFA.get_scores(rotate=rotate)

    # load and clean retest data exactly like original data
    data_raw = get_behav_data(dataset=results.dataset,
                              file='meaningful_variables.csv')
    retest_data_raw = get_behav_data(dataset=results.dataset.replace('Complete', 'Retest'),
                                     file='meaningful_variables.csv')
    # use a sorted list: newer pandas rejects sets in .loc and as an index
    shared_ids = sorted(set(retest_data_raw.index) & set(data_raw.index))
    data_raw = data_raw.loc[shared_ids, :]
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    raw_data = {'T1': data_raw, 'T2': retest_data_raw}
    imputed_data = {}
    for name, data in raw_data.items():
        tmp_data = data.loc[:, DVs]
        tmp_data = transform_remove_skew(tmp_data,
                                         positive_skewed=positive_skewed,
                                         negative_skewed=negative_skewed)
        tmp_data = remove_outliers(tmp_data)
        tmp_data_imputed, error = missForest(tmp_data)
        scaled_tmp_data = scale(tmp_data_imputed)
        imputed_data[name] = scaled_tmp_data

    # get subjects not in the retest set
    ind_data = orig_data.loc[sorted(set(orig_data.index) - set(shared_ids))]
    fa, output = psychFA(ind_data,
                         results.EFA.results['num_factors'],
                         method='ml',
                         rotate=rotate)
    weights = get_attr(fa, 'weights')
    scores = {}
    for name, data in imputed_data.items():
        suffix = ''
        if name == 'T2': suffix = 'T2'
        tmp_scores = pd.DataFrame(
            data.dot(weights),
            index=shared_ids,
            columns=[i + ' ' + suffix for i in orig_scores.columns])
        scores[name] = tmp_scores
    combined = pd.concat([scores['T1'], scores['T2']], axis=1)
    cross_diag = [
        combined.corr().iloc[i, i + len(orig_scores.columns)]
        for i in range(len(orig_scores.columns))
    ]
    # get ICCs
    ICCs = []
    for col in scores['T1'].columns:
        tmp = combined.filter(regex=col)
        out = psych.ICC(tmp)
        ICCs.append(list(out[0][1])[-1])
    return combined, cross_diag, ICCs, (fa, output)
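psych.ICC here calls the R psych package (via rpy2), which returns a table of several intraclass-correlation variants, and the snippet extracts one value from that table. As orientation only, here is a self-contained Shrout & Fleiss style ICC(3,1) (consistency) for two sessions on synthetic data; it illustrates the quantity being estimated, not a re-implementation of the exact value the snippet pulls out:

import numpy as np

def icc_consistency(x):
    # x: (n_subjects, k_sessions) array; two-way ANOVA mean squares
    n, k = x.shape
    grand = x.mean()
    msr = k * np.sum((x.mean(axis=1) - grand) ** 2) / (n - 1)   # between subjects
    msc = n * np.sum((x.mean(axis=0) - grand) ** 2) / (k - 1)   # between sessions
    sse = np.sum((x - grand) ** 2) - msr * (n - 1) - msc * (k - 1)
    mse = sse / ((n - 1) * (k - 1))
    return (msr - mse) / (msr + (k - 1) * mse)                  # ICC(3,1)

rng = np.random.default_rng(2)
truth = rng.normal(size=100)
sessions = np.column_stack([truth + rng.normal(scale=0.5, size=100)
                            for _ in range(2)])
print(round(icc_consistency(sessions), 2))
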
Example #7
        EZ_subset.to_csv(path.join(directory, 'meaningful_variables_EZ.csv'))
        readme_lines += ["meaningful_variables_EZ.csv: subset of exhaustive data to only meaningful variables with rt/acc parameters removed (replaced by EZ DDM params)\n\n"]
        # make subset without acc/rt vars and just hddm DDM
        hddm_subset = drop_vars(subset, drop_vars = ['_acc', '_rt', 'EZ'], saved_vars = ['simple_reaction_time.avg_rt', 'dospert_rt_survey'])
        hddm_subset.to_csv(path.join(directory, 'meaningful_variables_hddm.csv'))
        readme_lines += ["meaningful_variables_hddm.csv: subset of exhaustive data to only meaningful variables with rt/acc parameters removed (replaced by hddm DDM params)\n\n"]
        
        # save files that are selected for use
        # selected_variables = hddm_subset #OG
        selected_variables = noDDM_subset #HENRY 
        selected_variables.to_csv(path.join(directory, 'meaningful_variables.csv'))
        # readme_lines += ["meaningful_variables.csv: Same as meaningful_variables_hddm.csv\n\n"] #OG
        readme_lines += ["meaningful_variables.csv: Same as meaningful_variables_noDDM.csv\n\n Used for Replication Study.\n\n"] #HENRY
        
        # clean data
        selected_variables_clean = transform_remove_skew(selected_variables)
        selected_variables_clean = remove_outliers(selected_variables_clean)
        selected_variables_clean = remove_correlated_task_variables(selected_variables_clean)
        selected_variables_clean.to_csv(path.join(directory, 'meaningful_variables_clean.csv'))
        readme_lines += ["meaningful_variables_clean.csv: same as meaningful_variables.csv with skewed variables transformed and then outliers removed \n\n"]
        
        # imputed data
        selected_variables_imputed, error = missForest(selected_variables_clean)
        selected_variables_imputed.to_csv(path.join(directory, 'meaningful_variables_imputed.csv'))
        readme_lines += ["meaningful_variables_imputed.csv: meaningful_variables_clean.csv after imputation with missForest\n\n"]

        #save selected variables
        selected_variables_reference = valence_df
        selected_variables_reference.loc[selected_variables.columns].to_csv(path.join(reference_dir, 'selected_variables_reference.csv'))
                
        # save task data subset
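drop_vars is a project helper whose behavior is visible here only through its call sites. A hedged sketch of the kind of filtering those calls suggest (the real helper may differ; drop_vars_sketch and the demo column names are made up): drop every column whose name contains one of the drop patterns, unless the name also matches an explicitly saved variable.

import pandas as pd

def drop_vars_sketch(df, drop_vars, saved_vars=()):
    # keep columns that match no drop pattern, or that match a saved variable
    keep = [c for c in df.columns
            if not any(p in c for p in drop_vars)
            or any(s in c for s in saved_vars)]
    return df.loc[:, keep]

demo = pd.DataFrame(columns=['two_stage.hddm_drift', 'two_stage.EZ_drift',
                             'stroop.avg_rt', 'simple_reaction_time.avg_rt'])
print(drop_vars_sketch(demo, drop_vars=['_acc', '_rt', 'EZ'],
                       saved_vars=['simple_reaction_time.avg_rt']).columns.tolist())
# -> ['two_stage.hddm_drift', 'simple_reaction_time.avg_rt'], mirroring the hddm subset
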
        # make subset without acc/rt vars and just EZ DDM
        EZ_subset = drop_vars(subset, drop_vars = ['_acc', '_rt', 'hddm'], saved_vars = ['simple_reaction_time.avg_rt', 'dospert_rt_survey'])
        EZ_subset.to_csv(path.join(directory, 'meaningful_variables_EZ.csv'))
        readme_lines += ["meaningful_variables_EZ.csv: subset of exhaustive data to only meaningful variables with rt/acc parameters removed (replaced by EZ DDM params)\n\n"]
        # make subset without acc/rt vars and just hddm DDM
        hddm_subset = drop_vars(subset, drop_vars = ['_acc', '_rt', 'EZ'], saved_vars = ['simple_reaction_time.avg_rt', 'dospert_rt_survey'])
        hddm_subset.to_csv(path.join(directory, 'meaningful_variables_hddm.csv'))
        readme_lines += ["meaningful_variables_hddm.csv: subset of exhaustive data to only meaningful variables with rt/acc parameters removed (replaced by hddm DDM params)\n\n"]
        
        # save files that are selected for use
        selected_variables = hddm_subset
        selected_variables.to_csv(path.join(directory, 'meaningful_variables.csv'))
        readme_lines += ["meaningful_variables.csv: Same as meaningful_variables_hddm.csv\n\n"]
        
        # clean data
        selected_variables_clean = transform_remove_skew(selected_variables)
        selected_variables_clean = remove_outliers(selected_variables_clean)
        selected_variables_clean = remove_correlated_task_variables(selected_variables_clean)
        selected_variables_clean.to_csv(path.join(directory, 'meaningful_variables_clean.csv'))
        readme_lines += ["meaningful_variables_clean.csv: same as meaningful_variables.csv with skewed variables transformed and then outliers removed \n\n"]
        
        # imputed data
        selected_variables_imputed, error = missForest(selected_variables_clean)
        selected_variables_imputed.to_csv(path.join(directory, 'meaningful_variables_imputed.csv'))
        readme_lines += ["meaningful_variables_imputed.csv: meaningful_variables_clean.csv after imputation with missForest\n\n"]

        #save selected variables
        selected_variables_reference = valence_df
        selected_variables_reference.loc[selected_variables.columns].to_csv(path.join(reference_dir, 'selected_variables_reference.csv'))
                
        # save task data subset
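readme_lines keeps accumulating a description of each saved file, so presumably the script writes the list out once all subsets are saved. A hedged sketch of that final step; the destination file name README.txt and the helper name are assumptions, not taken from the project:

from os import path

def write_readme(directory, readme_lines):
    # assumed final step: dump the accumulated file descriptions to disk
    with open(path.join(directory, 'README.txt'), 'w') as f:
        f.writelines(readme_lines)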