def calc_EFA_retest_held_out(results, rotate='oblimin', verbose=True):
    """Estimate factor-score retest reliability using a held-out factor solution.

    Re-fits an EFA on subjects NOT in the retest sample, then projects both the
    T1 and T2 (retest) data of the shared subjects through that solution's
    weight matrix, so the scores being compared were not used to fit the model.

    Args:
        results: results object exposing .ID, .data, .dataset and an .EFA analysis
        rotate: factor rotation passed to the EFA routines
        verbose: unused in this function (kept for interface symmetry with
            calc_EFA_retest) -- NOTE(review): confirm before removing

    Returns:
        combined: DataFrame of T1 and T2 factor scores side by side
        cross_diag: list of T1-vs-T2 correlations, one per factor
        ICCs: list of intraclass correlations, one per factor
        (fa, output): the held-out psychFA fit and its output
    """
    name = results.ID.split('_')[0].title()
    orig_data = results.data
    # recover which raw DVs were log- / reflected-log-transformed so the retest
    # data can be transformed the same way
    positive_skewed = [i.replace('.logTr', '') for i in orig_data.columns if ".logTr" in i]
    negative_skewed = [i.replace('.ReflogTr', '') for i in orig_data.columns if ".ReflogTr" in i]
    # DV names with transform suffixes stripped (matches raw-data column names)
    DVs = [i.replace('.logTr', '').replace('.ReflogTr', '') for i in orig_data.columns]
    orig_scores = results.EFA.get_scores(rotate=rotate)
    # load and clean retest data exactly like original data
    data_raw = get_behav_data(dataset=results.dataset,
                              file='meaningful_variables.csv')
    retest_data_raw = get_behav_data(dataset=results.dataset.replace('Complete', 'Retest'),
                                     file='meaningful_variables.csv')
    # subjects present at both timepoints
    shared_ids = set(retest_data_raw.index) & set(data_raw.index)
    data_raw = data_raw.loc[shared_ids, :]
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    raw_data = {'T1': data_raw, 'T2': retest_data_raw}
    imputed_data = {}
    # NOTE(review): the loop variable shadows the outer `name` (title string);
    # harmless here because `name` is not used afterwards
    for name, data in raw_data.items():
        tmp_data = data.loc[:, DVs]
        # apply the same skew transforms / outlier removal used on the original data
        tmp_data = transform_remove_skew(tmp_data,
                                         positive_skewed=positive_skewed,
                                         negative_skewed=negative_skewed)
        tmp_data = remove_outliers(tmp_data)
        tmp_data_imputed, error = missForest(tmp_data)
        scaled_tmp_data = scale(tmp_data_imputed)
        imputed_data[name] = scaled_tmp_data
    # get subjects not in the retest set
    ind_data = orig_data.loc[set(orig_data.index)-shared_ids]
    # fit EFA on the independent (held-out) subjects only
    fa, output = psychFA(ind_data, results.EFA.results['num_factors'],
                         method='ml', rotate=rotate)
    weights = get_attr(fa, 'weights')
    scores = {}
    for name, data in imputed_data.items():
        suffix = ''
        if name == 'T2':
            suffix = 'T2'
        # NOTE(review): T1 columns end with a trailing space ("Factor ") because
        # suffix is empty -- confirm downstream code relies on this before changing
        tmp_scores = pd.DataFrame(data.dot(weights),
                                  index=shared_ids,
                                  columns=[i+' '+suffix for i in orig_scores.columns])
        scores[name] = tmp_scores
    combined = pd.concat([scores['T1'], scores['T2']], axis=1)
    # correlation of each factor with its own retest (offset by num factors)
    cross_diag = [combined.corr().iloc[i, i+len(orig_scores.columns)]
                  for i in range(len(orig_scores.columns))]
    # get ICCs
    ICCs = []
    for col in scores['T1'].columns:
        # pair each T1 factor with its T2 counterpart via regex on the column name
        tmp = combined.filter(regex=col)
        out = psych.ICC(tmp)
        # assumes out[0][1] is psych::ICC's results table; last entry taken as
        # the ICC value -- TODO confirm against the R output structure
        ICCs.append(list(out[0][1])[-1])
    return combined, cross_diag, ICCs, (fa, output)
def calc_EFA_retest(results, rotate='oblimin', verbose=True):
    """Compare EFA factor scores at T1 against scores computed from retest data.

    Args:
        results: results object exposing .ID, .data, .dataset and an .EFA analysis
        rotate: factor rotation passed through to the EFA routines
        verbose: if True, print the average and per-factor T1/T2 correlations

    Returns:
        combined: DataFrame of reference (T1) and retest scores side by side
        cross_diag: per-factor T1-vs-retest correlations (diagonal of the
            cross-quadrant of the correlation matrix)
        ICCs: per-factor intraclass correlations
    """
    name = results.ID.split('_')[0].title()
    retest_data_raw = get_behav_data(dataset=results.dataset.replace(
        'Complete', 'Retest'),
        file='meaningful_variables.csv')
    # only subjects present at both timepoints
    shared_ids = set(retest_data_raw.index) & set(results.data.index)
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    retest_scores = transfer_scores(retest_data_raw, results, rotate=rotate)
    retest_scores.columns = [str(i) + ' Retest' for i in retest_scores.columns]
    # scale and perform the factor score transformation
    EFA = results.EFA
    c = EFA.get_c()
    ref_scores = EFA.get_scores(c=c, rotate=rotate).loc[retest_data_raw.index, :]
    # reorder scores so reference and retest factors line up
    if rotate == 'oblimin':
        reorder_vec = EFA.get_factor_reorder(c, rotate=rotate)
        ref_scores = ref_scores.iloc[:, reorder_vec]
        retest_scores = retest_scores.iloc[:, reorder_vec]
    combined = pd.concat([ref_scores, retest_scores], axis=1)
    # diagonal of the retest-vs-reference quadrant (first c columns vs last c rows)
    cross_diag = np.diag(combined.corr().iloc[c:, :c])
    # get ICCs
    ICCs = []
    for col in ref_scores.columns:
        # pair each factor with its ' Retest' counterpart via regex on the name
        tmp = combined.filter(regex=str(col))
        out = psych.ICC(tmp)
        # assumes out[0][1] is psych::ICC's results table; last entry taken as
        # the ICC value -- TODO confirm against the R output structure
        ICCs.append(list(out[0][1])[-1])
    if verbose:
        print('%s, Avg Correlation: %s\n' %
              (name, format_num(np.mean(cross_diag))))
        for factor, num in zip(ref_scores.columns, cross_diag):
            print('%s: %s' % (factor, format_num(num)))
    return combined, cross_diag, ICCs
def calc_EFA_retest(results, rotate='oblimin', verbose=True):
    """Compute T1-vs-retest agreement for EFA factor scores.

    Scores the retest sample with the already-fit EFA solution, aligns factors
    with the reference scores, and reports per-factor correlations and ICCs.

    Args:
        results: results object exposing .ID, .data, .dataset and an .EFA analysis
        rotate: rotation passed to the EFA routines
        verbose: print average and per-factor correlations when True

    Returns:
        (combined scores DataFrame, per-factor cross correlations, per-factor ICCs)
    """
    title_name = results.ID.split('_')[0].title()
    # retest dataset name is derived from the complete dataset's name
    retest_raw = get_behav_data(
        dataset=results.dataset.replace('Complete', 'Retest'),
        file='meaningful_variables.csv')
    # restrict to subjects who appear in both samples
    overlap = set(retest_raw.index) & set(results.data.index)
    retest_raw = retest_raw.loc[overlap, :]
    retest_scores = transfer_scores(retest_raw, results, rotate=rotate)
    retest_scores.columns = [str(c) + ' Retest' for c in retest_scores.columns]
    # scale and perform the factor score transformation
    EFA = results.EFA
    c = EFA.get_c()
    ref_scores = EFA.get_scores(c=c, rotate=rotate).loc[retest_raw.index, :]
    # align factor order between reference and retest scores
    if rotate == 'oblimin':
        order = EFA.get_factor_reorder(c, rotate=rotate)
        ref_scores = ref_scores.iloc[:, order]
        retest_scores = retest_scores.iloc[:, order]
    combined = pd.concat([ref_scores, retest_scores], axis=1)
    # each factor's correlation with its own retest counterpart
    cross_diag = np.diag(combined.corr().iloc[c:, :c])
    # intraclass correlations, one per factor
    ICCs = []
    for factor_name in ref_scores.columns:
        paired = combined.filter(regex=str(factor_name))
        icc_out = psych.ICC(paired)
        ICCs.append(list(icc_out[0][1])[-1])
    if verbose:
        print('%s, Avg Correlation: %s\n' %
              (title_name, format_num(np.mean(cross_diag))))
        for factor, num in zip(ref_scores.columns, cross_diag):
            print('%s: %s' % (factor, format_num(num)))
    return combined, cross_diag, ICCs
def assess_convergence(task, reps=5, n_workers=20):
    """Fit the model for `task` repeatedly to check fit stability.

    Args:
        task: task name; loads 'Individual_Measures/<task>.csv.gz'
        reps: number of independent model fits to run
        n_workers: number of workers to subset (previously hard-coded to 20;
            default preserves the original behavior)

    Returns:
        dict mapping task name to the list of `reps` model outputs
    """
    # load data
    data = get_behav_data(file='Individual_Measures/%s.csv.gz' % task)
    # restrict to the first n_workers workers to keep fitting tractable
    data = data.query('worker_id in %s' % list(data.worker_id.unique()[0:n_workers]))
    outputs = []
    for _ in range(reps):
        output = run_model(task, data)
        outputs.append(output)
    return {task: outputs}
def assess_convergence(task, reps=5):
    """Run the model fit for `task` several times and collect the outputs.

    Args:
        task: task name; loads 'Individual_Measures/<task>.csv.gz'
        reps: number of repeated fits

    Returns:
        dict mapping the task name to the list of fit outputs
    """
    # load trial-level data and keep only the first 20 workers
    data = get_behav_data(file='Individual_Measures/%s.csv.gz' % task)
    first_workers = list(data.worker_id.unique()[0:20])
    data = data.query('worker_id in %s' % first_workers)
    # repeat the fit to gauge convergence across runs
    return {task: [run_model(task, data) for _ in range(reps)]}
def get_retest_comparison_data():
    """Assemble one cleaned DataFrame spanning the DDM variable subsets.

    Loads each subset file, removes outliers and skew, drops columns already
    collected from an earlier subset, and concatenates the remainder.

    Returns:
        DataFrame with the union of cleaned, de-duplicated columns.
    """
    subsets = ['meaningful_variables_noDDM.csv',
               'meaningful_variables_EZ.csv',
               'meaningful_variables_hddm.csv']
    combined = pd.DataFrame()
    for filename in subsets:
        # clean each subset the same way: outlier removal then skew transform
        cleaned = transform_remove_skew(remove_outliers(get_behav_data(file=filename)))
        # avoid duplicating columns already present from earlier subsets
        repeats = set(combined) & set(cleaned)
        cleaned = cleaned.drop(labels=repeats, axis=1)
        combined = pd.concat([combined, cleaned], axis=1)
    return combined
def get_retest_comparison_data():
    """Build a single cleaned dataset from the three DDM-variant subsets.

    Each subset is loaded, outlier-trimmed and skew-transformed; columns that
    an earlier subset already contributed are dropped before concatenation.

    Returns:
        DataFrame containing all unique cleaned columns.
    """
    subset_files = (
        'meaningful_variables_noDDM.csv',
        'meaningful_variables_EZ.csv',
        'meaningful_variables_hddm.csv',
    )
    dataset = pd.DataFrame()
    for subset in subset_files:
        raw = get_behav_data(file=subset)
        clean = remove_outliers(raw)
        clean = transform_remove_skew(clean)
        # columns seen before take precedence; drop them from this subset
        already_present = set(dataset) & set(clean)
        clean.drop(labels=already_present, axis=1, inplace=True)
        dataset = pd.concat([dataset, clean], axis=1)
    return dataset
import matplotlib.pyplot as plt
from os import path
import pandas as pd
import seaborn as sns
from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names
from selfregulation.utils.utils import get_behav_data, get_demographics, get_info

# Script chunk: prepare two-stage-decision and shift-task data for
# "successful replication" plots.
base_dir = get_info('base_directory')
ext= 'png'
data = get_behav_data()
# *************************************************************************
# Successful replications
# ************************************************************
# two_stage
two_stage_df = get_behav_data(file='Individual_Measures/two_stage_decision.csv.gz')
# subset to subjects who passed quality control (non-missing two_stage DVs)
successful_two_stage = data.filter(regex='two_stage').dropna(how='any').index
two_stage_df = two_stage_df.query('worker_id in %s' % list(successful_two_stage))
# drop missed first-stage responses and trials without binary feedback
two_stage_df = two_stage_df.query('rt_first != -1 and feedback_last in [0,1]')
colors = sns.hls_palette(2)
# stay probability = 1 - switch rate, per worker / transition type / feedback
plot_df = (1-two_stage_df.groupby(['worker_id', 'stage_transition_last',
                                   'feedback_last']).switch.mean()).reset_index()
plot_df.feedback_last = plot_df.feedback_last.replace({0: 'Unrewarded',
                                                       1: 'Rewarded'})
plot_df.stage_transition_last = \
    plot_df.stage_transition_last.replace({'infrequent': 'Rare',
                                           'frequent': 'Common'})
# shift
shift_df = get_behav_data(file='Individual_Measures/shift_task.csv.gz')
# subset to subjects who passed quality control (non-missing shift DVs)
successful_shift = data.filter(regex='shift').dropna(how='any').index
shift_df = shift_df.query('worker_id in %s' % list(successful_shift))
# drop trials with no response
shift_df = shift_df.query('rt != -1')
import pandas as pd
import statsmodels.formula.api as smf
from selfregulation.utils.utils import get_behav_data

# Compute per-worker Simon-task interference measures: a sum-coded regression
# contrast, a median-RT contrast, and an accuracy contrast.
data = get_behav_data()
# workers whose simon DVs are missing in the main dataset are skipped below
problem_subj = data.filter(regex='simon').isnull().iloc[:, 0]
df = get_behav_data(file='Individual_Measures/simon.csv.gz')
# test-phase trials with a recorded RT ("rt==rt" drops NaN RTs)
df = df.query('exp_stage == "test" and rt==rt')
params = {}
for worker in df.worker_id.unique():
    if problem_subj.loc[worker] == True:
        continue
    subset = df.query('worker_id=="%s"' % worker)
    # accuracy contrast uses all trials, before filtering to correct responses
    acc_contrast = subset.groupby('condition').correct.mean()
    # BUG FIX: this previously read from `rt_contrast`, which is not defined
    # until later in the loop, raising NameError; the accuracy difference must
    # come from acc_contrast.
    acc_diff = acc_contrast['incongruent'] - acc_contrast['congruent']
    # RT analyses use correct trials only
    subset = subset.query('correct == True')
    rs = smf.ols(formula='rt ~ C(condition, Sum)', data=subset).fit()
    params[worker] = rs.params.tolist()
    # sum coding: rescale the condition coefficient so it reflects the full
    # incongruent-vs-congruent effect -- NOTE(review): confirm coding direction
    params[worker][1] *= -2
    rt_contrast = subset.groupby('condition').rt.median()
    diff = rt_contrast['incongruent'] - rt_contrast['congruent']
    params[worker].append(diff)
    params[worker].append(acc_diff)
DVs = pd.DataFrame(params,
                   index=['Intercept', 'model_diff', 'diff', 'acc_diff']).T
else: d=[] for i in subscale_data.loc[v]: try: d.append(str(int(i))) except: pass subscale_var_dict[v]=':'.join(d) return subscale_var_dict subscale_var_dict=get_subscale_vars() # first get variable-level metadata # based on variable set in meaningful_variables behavdata=get_behav_data(dataset) measures={} for c in list(behavdata.columns): c_s=c.split('.') m=c_s[0] v='.'.join(c_s[1:]) if not m in measures: measures[m]={'dataElements':[]} measures[m]['dataElements'].append(v) metadata={} # three entries are: class, type, and whether we are looking at beginning # of string - this lets us find differences task_vars=[('hddm_drift','DDMDriftRate','rate',True), ("hddm_non_decision",'DDMNondecisionTime','seconds',True), ('hddm_thresh','DDMThreshold','other',True),
label=name, ax=ax1, scatter_kws={ 's': 100, 'alpha': .4 }) ax1.legend() sns.boxplot('split_time', 'value', hue='variable', data=melted, ax=ax2) if title: plt.suptitle(title, fontsize=18) plt.show() verbose = True # load data behav_data = get_behav_data(file='meaningful_variables_imputed.csv') measures = np.unique([i.split('.')[0] for i in behav_data.columns]) time_effects = {} for measure_name in measures[0:10]: measure = get_behav_data(file='Individual_Measures/%s.csv.gz' % measure_name) measure_DVs = behav_data.filter(regex=measure_name) measure_DVs.columns = [i.split('.')[1] for i in measure_DVs.columns] # scale measure_DVs = pd.DataFrame(scale(measure_DVs), index=measure_DVs.index, columns=measure_DVs.columns) finishtimes = measure.groupby('worker_id').finishtime.apply( lambda x: np.unique(x)[0])
out = clf.coef_ if len(out.shape)==1: out = out.reshape(1,-1) out = pd.DataFrame(out, columns=scores.columns) out.index = data.columns return out # do mapping dataset = get_recent_dataset() # load ontology results = load_results(datafile=dataset) task_loadings = results['task'].EFA.get_loading() task_scores = results['task'].EFA.get_scores() # load all DVs all_DVs = get_behav_data(file='variables_exhaustive.csv', dataset=dataset) contrast_loadings = {} for contrast, relation in fmri_ontology_mapping.items(): # if relation is in the direct mapping if relation.lstrip('-') in task_loadings.index: task_loading = task_loadings.loc[relation.lstrip('-')] if relation[0] == '-': task_loading = task_loading*-1 # otherwise, reconstruct! else: unmapped_data = all_DVs.loc[:,relation.lstrip('-')] missing = unmapped_data[unmapped_data.isnull()].index task_loading = run_linear(pd.DataFrame(unmapped_data.drop(missing)),
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Sun Apr 16 20:13:32 2017 @author: ian """ import glob import pandas as pd from selfregulation.utils.utils import get_behav_data df = get_behav_data(file='demographic_health.csv', full_dataset=True) failed_dataset = sorted(glob.glob('../Data/Failed*'))[0] failed_subjects = get_behav_data(dataset=failed_dataset, file='demographic_health.csv') fmri_dataset = sorted(glob.glob('../Data/Fmri*'))[0] fmri_subjects = get_behav_data(dataset=fmri_dataset, file='demographic_health.csv') fmri_subjects.index = ['fmri_'+i for i in fmri_subjects.index] all_subjects = pd.concat([df,failed_subjects,fmri_subjects]) # total of 662 workers in mturk sample worker_counts = pd.read_json('../Data/Local/worker_counts.json', typ='series') total_workers = len(worker_counts)+len(fmri_subjects) # separate into groups
import matplotlib.pyplot as plt import numpy as np from os import path import pandas as pd import seaborn as sns from selfregulation.utils.plot_utils import beautify_legend from selfregulation.utils.utils import get_behav_data, get_info base_dir = get_info('base_directory') ext = 'png' data = get_behav_data() # our data dpx = get_behav_data(file='Individual_Measures/dot_pattern_expectancy.csv.gz') dpx = dpx.query('exp_stage != "practice"') N = len(dpx.worker_id.unique()) acc = 1 - dpx.query('rt!=-1').groupby(['worker_id', 'condition' ]).correct.mean() acc_stats = acc.groupby('condition').agg(["mean", "std"]) rt = dpx.groupby(['worker_id', 'condition']).rt.mean() rt_stats = rt.groupby('condition').agg(["mean", "std"]) # literature data #replicate "The neural circuitry supporting goal maintenance during cognitive control: a comparison of expectancy AX-CPT and dot probe expectancy paradigms" literature_data = { 'acc': { 'mean': { 'AX': .98, 'AY': .84, 'BX': .92,
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Mon Jul 10 18:47:50 2017 @author: ian """ import pickle from expanalysis.experiments.processing import calc_exp_DVs import pandas as pd from selfregulation.utils.utils import get_behav_data df = get_behav_data(file = 'Individual_Measures/two_stage_decision.csv.gz') workers = list(df.worker_id.unique()) df = df.query('worker_id in %s' % workers) DV_tests = [] for repeats in range(100): print(repeats) DVs, valence, description = calc_exp_DVs(df) DVs.columns = [i + '_run%s' % str(repeats) for i in DVs.columns] DV_tests.append(DVs) DV_tests = pd.concat(DV_tests, axis=1) DV_tests.to_pickle('two_stage_tests.pkl') N = len(DV_tests.columns) corr = DV_tests.corr() DV_reliabilities = {} for c in range(5):
# -*- coding: utf-8 -*- """ Created on Wed Nov 23 14:56:35 2016 @author: ian """ from math import floor, ceil import numpy import pandas as pd from selfregulation.utils.utils import get_behav_data df = get_behav_data(dataset = 'Discovery_11-20-2016', file = 'Individual_Measures/stop_signal.csv.gz') worker = 0 df = df.query('worker_id == "%s"' % df.worker_id.unique()[worker]) #remove practice df = df.query('exp_stage not in ["practice","NoSS_practice"]').reset_index(drop = True) dvs = {} # Calculate SSRT for both conditions for c in df.condition.unique(): c_df = df[df.condition == c] #SSRT go_trials = c_df.query('SS_trial_type == "go"') stop_trials = c_df.query('SS_trial_type == "stop"') sorted_go = go_trials.query('rt != -1').rt.sort_values(ascending = True) prob_stop_failure = (1-stop_trials.stopped.mean())
from selfregulation.utils.utils import get_behav_data, get_info import bct import igraph import numpy as np from os.path import join, exists from os import makedirs import pandas as pd import seaborn as sns # generic variables save_plots = False plot_dir = join(get_info('base_directory'),'dimensional_structure','Plots') # get dependent variables graph_data = get_behav_data(file = 'taskdata_imputed.csv') def run_graph_analysis(adj_dict, save_plots=False): """ Takes in a dictionary with two keys: "name" and "adj", specifying an adjacency matrix (as a dataframe) and its corresponding name """ def plot_name(name): return join(plot_dir,adj_name,name) adj_name = adj_dict['name'] adj = adj_dict['adj'] # if saving plots, make sure directory exists if save_plots:
from expanalysis.experiments.psychological_models import MoE_Model import numpy as np import pandas as pd from selfregulation.utils.utils import get_behav_data data = get_behav_data(file='Individual_Measures/hierarchical_rule.csv.gz') workers = data.worker_id.unique() data = data.query("worker_id == '%s'" % workers[0]) from scipy.optimize import minimize def eval_MoE(fit_args, passed_args): args = { 'kappa': fit_args[0], 'zeta': fit_args[1], 'xi': fit_args[2], 'beta2': fit_args[3], 'beta3': fit_args[4], 'alphaC': fit_args[5], 'alphaO': fit_args[6], 'alphaS': fit_args[7], 'beta_hierarchy': fit_args[8], 'data': data, } MoE_model = MoE_Model(**args) likelihoods = [] for i, trial in data.iterrows(): if trial.key_press != -1: action_probs = MoE_model.get_action_probs(trial)
for _ in range(n - len(subset))] subset = list(subset) + new_indices subset = pd.unique(subset) return list(subset) def impute(data, method): sigma = data.std() matrix = (data / sigma).as_matrix() complete_matrix = method().fit_transform(matrix) * sigma.tolist() return pd.DataFrame(complete_matrix, index=data.index, columns=data.columns) DV_df = get_behav_data('Discovery_9-26-16', use_EZ=True) sigma = DV_df.std() base_matrix = (DV_df / sigma).as_matrix() # test different imputation methods methods = [fancyimpute.SoftImpute, fancyimpute.IterativeSVD, fancyimpute.KNN] correlations = {} percent_off = {} for method in methods: print('using %s' % method) correlations[method] = [] percent_off[method] = [] for simulation in range(20): indices = get_rand_index(base_matrix, 1000) originals = [base_matrix[i] for i in indices]
from selfregulation.utils.utils import get_behav_data, get_info import bct import igraph import numpy as np from os.path import join, exists from os import makedirs import pandas as pd import seaborn as sns # generic variables save_plots = False plot_dir = join(get_info('base_directory'), 'dimensional_structure', 'Plots') # get dependent variables graph_data = get_behav_data(file='taskdata_imputed.csv') def run_graph_analysis(adj_dict, save_plots=False): """ Takes in a dictionary with two keys: "name" and "adj", specifying an adjacency matrix (as a dataframe) and its corresponding name """ def plot_name(name): return join(plot_dir, adj_name, name) adj_name = adj_dict['name'] adj = adj_dict['adj'] # if saving plots, make sure directory exists if save_plots: makedirs(join(plot_dir, adj_name), exist_ok=True)
subset = [(randint(0,M.shape[0]-1), randint(0,M.shape[1]-1)) for _ in range(n)] subset = pd.unique(subset) while len(subset) < n: new_indices = [(randint(0,M.shape[0]-1), randint(0,M.shape[1]-1)) for _ in range(n-len(subset))] subset = list(subset) + new_indices subset = pd.unique(subset) return list(subset) def impute(data, method): sigma = data.std() matrix = (data/sigma).as_matrix() complete_matrix = method().fit_transform(matrix)*sigma.tolist() return pd.DataFrame(complete_matrix, index = data.index, columns = data.columns) DV_df = get_behav_data('Discovery_9-26-16', use_EZ = True) sigma = DV_df.std() base_matrix = (DV_df/sigma).as_matrix() # test different imputation methods methods = [fancyimpute.SoftImpute, fancyimpute.IterativeSVD, fancyimpute.KNN] correlations = {} percent_off = {} for method in methods: print('using %s' % method) correlations[method] = [] percent_off[method] = [] for simulation in range(20): indices = get_rand_index(base_matrix,1000) originals = [base_matrix[i] for i in indices]
import os
import numpy as np
from selfregulation.utils.utils import get_behav_data
import seaborn as sns

# Save a regression pair-plot of the DVs belonging to each task.

# Make plot directory if it doesn't exist
if not os.path.exists('Plots'):
    os.mkdir('Plots')

# get DV df
DV_df = get_behav_data()
# task name = prefix before the first '.' in each DV column
tasks = np.unique(DV_df.columns.map(lambda x: x.split('.')[0]))
for task in tasks:
    # all DVs whose column name starts with this task
    subset = DV_df.filter(regex='^%s' % task)
    # drop all-missing rows, then any column still containing missing values
    subset = subset.dropna(how='all').dropna(axis=1)
    sns.set(font_scale=1.5)
    p = sns.pairplot(subset, kind='reg', size=5, diag_kws={'bins': 50})
    p.savefig('Plots/%s_pair_plot.pdf' % task, dpi=300)
# parse args run_factors = not args.skip_factors run_raw = not args.run_raw classifier = args.classifier raw_classifier = args.raw_classifier shuffle_reps = args.shuffle_repeats EFA_rotation = args.EFA_rotation results_dir = path.join(get_info('results_directory'), 'ideology_prediction') makedirs(results_dir, exist_ok=True) # load data dataset = get_recent_dataset() results = load_results(dataset) ideo_data = get_behav_data(dataset, file='ideology.csv') # get demographics ideo_demographics = get_behav_data(dataset, file='ideology_demographics.csv') # fill in ideo demographics from demographics if needed demographics = get_demographics() # fill gender missing_gender = ideo_demographics[ideo_demographics['Gender'].isnull()].index ideo_demographics.loc[missing_gender, 'Gender'] = demographics.loc[missing_gender, 'Sex'] # Age can be off by a year potentially by the time the ideological data was collected missing_age = ideo_demographics[ideo_demographics['Age'].isnull()].index ideo_demographics.loc[missing_age, 'Age'] = demographics.loc[missing_age, 'Age'] # reduce dataset to where we have full demographics ideo_demographics = ideo_demographics[ideo_demographics.isnull().sum(1)==0] ideo_data = ideo_data.loc[ideo_demographics.index]
for i,r in data.iterrows(): itemoptions=eval(r.options) item_ids.append('_'.join(itemoptions[0]['id'].split('_')[:-1])) data['item_id']=item_ids return data def save_data(data,survey_metadata, outdir=os.path.join(outdir,'surveydata')): if not os.path.exists(outdir): os.mkdir(outdir) for k in survey_metadata.keys(): matchdata=data.query("survey=='%s'"%k) unique_items=list(matchdata.item_id.unique()) surveydata=pandas.DataFrame({'worker':list(matchdata.worker.unique())}) for i in unique_items: matchitem=matchdata.query('item_id=="%s"'%i) matchitem=pandas.DataFrame({'worker':matchitem.worker,i:matchitem.coded_response}) surveydata=surveydata.merge(matchitem,on='worker') surveydata.to_csv(os.path.join(outdir,'%s.tsv'%k),sep='\t',index=False) return outdir if __name__=='__main__': data=get_behav_data(file='items.csv.gz') survey_items=get_survey_items(data) survey_metadata,metadatdir=save_metadata(survey_items) #data=add_survey_item_labels(data) #datadir=save_data(data,survey_metadata) pickle.dump(survey_metadata,open(os.path.join(outdir,'survey_metadata.pkl'),'wb'))
import math
import matplotlib.pyplot as plt
import numpy as np
from os import path
import pandas as pd
import seaborn as sns
from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names
from selfregulation.utils.utils import filter_behav_data, get_behav_data, get_demographics, get_info

# correlation of ravens and literature
# replication of "Intelligence and socioeconomic success: A meta-analytic
# review of longitudinal research"
base_dir = get_info('base_directory')
ext= 'png'
data = get_behav_data()
demographics = get_demographics()
data = data.loc[demographics.index]
# get dataframe of intelligence measure (raven's progressive matrices) and demographics
df = pd.concat([data.filter(regex='raven'), demographics], axis=1)
# get raven's reliability (mean ICC across bootstrap runs for the ravens DV)
reliability = get_behav_data(dataset='Retest_02-03-2018',
                             file='bootstrap_merged.csv.gz')
raven_reliability = reliability.groupby('dv').icc.mean().filter(regex='raven')[0]
# demographic reliabilities assumed perfect (1.0)
demo_reliabilities = [1.0]*demographics.shape[1]
# correlations of each demographic target with the ravens score
# (iloc[:-1] drops the ravens-with-itself entry)
correlations = df.corr().filter(regex='raven').sort_values(by='ravens.score').iloc[:-1]
correlations.insert(0, 'target_reliability', demo_reliabilities)
# attenuation correction: r / sqrt(r_xx * r_yy)
adjusted = correlations['ravens.score']/(raven_reliability*correlations['target_reliability'])**.5
:return dv: dictionary of dependent variables :return description: descriptor of DVs """ df = df[~ pandas.isnull(df['taste_diff'])].reset_index(drop = True) df = df.query('mouse_click != "-1"') rs = smf.ols(formula = 'coded_response ~ health_diff + taste_diff', data = df).fit() dvs['health_sensitivity'] = {'value': rs.params['health_diff'], 'valence': 'Pos'} dvs['taste_sensitivity'] = {'value': rs.params['taste_diff'], 'valence': 'Neg'} description = """ Both taste and health sensitivity are calculated based on the decision phase. On each trial the participant indicates whether they would prefer a food option over a reference food. Their choice is regressed on the subjective health and taste difference between that option and the reference item. Positive values indicate that the option's higher health/taste relates to choosing the option more often """ return dvs,description # get data df = get_behav_data(dataset = 'Discovery_11-20-2016', file = 'Individual_Measures/dietary_decision.csv.gz') demo = get_behav_data(dataset = 'Discovery_11-20-2016', file = 'demographic_targets.csv') # calc DVs DVs, description = calc_dietary_decision_DV(df) for key,val in DVs.items(): for subj_key in val.keys(): val[subj_key]=val[subj_key]['value'] DVs = pandas.DataFrame.from_dict(DVs).T # do it the simpler way DV, valence, description = calc_exp_DVs(df)
return (time.hour-5)%24 # convert from GMT to CST def plot_time_effects(measure_DVs, melted_DVs, title=None): f, (ax1,ax2) = plt.subplots(1, 2, figsize=(16,8)) for name in measure_DVs.columns[:-2]: sns.regplot('hour', name, data=measure_DVs, lowess=True, label=name, ax=ax1, scatter_kws={'s': 100, 'alpha': .4}) ax1.legend() sns.boxplot('split_time', 'value', hue='variable', data=melted, ax=ax2) if title: plt.suptitle(title, fontsize=18) plt.show() verbose=True # load data behav_data = get_behav_data(file='meaningful_variables_imputed.csv') measures = np.unique([i.split('.')[0] for i in behav_data.columns]) time_effects = {} for measure_name in measures[0:10]: measure = get_behav_data(file='Individual_Measures/%s.csv.gz' % measure_name) measure_DVs = behav_data.filter(regex=measure_name) measure_DVs.columns = [i.split('.')[1] for i in measure_DVs.columns] # scale measure_DVs = pd.DataFrame(scale(measure_DVs), index=measure_DVs.index, columns=measure_DVs.columns) finishtimes = measure.groupby('worker_id').finishtime.apply(lambda x: np.unique(x)[0]) daytime = finishtimes.apply(convert_to_time) daytime.name='hour' measure_DVs = pd.concat([measure_DVs, daytime], axis=1) # add on time split in half and melt
def __init__(self, datafile=None, loading_thresh=None, dist_metric=distcorr,
             boot_iter=1000, name='', filter_regex='.', ID=None,
             results_dir=None, residualize_vars=['Age', 'Sex'],
             saved_obj_file=None):
    """
    Args:
        datafile: name of a directory in "Data"
        loading_thresh: threshold to use for factor analytic result
        dist_metric: distance metric for hierarchical clustering that is
            passed to pdist
        boot_iter: number of bootstrap iterations used by the analysis classes
        name: string to append to ID, default to empty string
        filter_regex: regex string passed to data.filter
        ID: specify if a specific ID is desired
        results_dir: where to save results
        residualize_vars: demographic variables to residualize out
            (NOTE(review): mutable default argument -- safe only while it is
            never mutated; consider a None sentinel)
        saved_obj_file: previously saved object to initialize state from
    """
    # need either a dataset to load or a saved object to restore
    assert datafile is not None or saved_obj_file is not None
    # initialize with the saved object if available
    if saved_obj_file:
        self._load_init(saved_obj_file)
    else:
        # set vars
        self.dataset = datafile
        # BUG FIX: this was hard-coded to None, silently discarding the
        # loading_thresh argument passed by the caller
        self.loading_thresh = loading_thresh
        self.dist_metric = dist_metric
        self.boot_iter = boot_iter
        self.residualize_vars = residualize_vars
        if ID is None:
            self.ID = '%s_%s' % (name, str(random.getrandbits(16)))
        else:
            self.ID = '%s_%s' % (name, str(ID))
        # set up output files
        self.results_dir = results_dir
        # load data
        self.data = get_behav_data(dataset=datafile,
                                   file='meaningful_variables_imputed.csv',
                                   filter_regex=filter_regex,
                                   verbose=True)
        self.data_no_impute = get_behav_data(dataset=datafile,
                                             file='meaningful_variables_clean.csv',
                                             filter_regex=filter_regex,
                                             verbose=True)
        self.demographics = get_demographics()
    # initialize analysis classes
    self.DA = Demographic_Analysis(self.demographics,
                                   residualize_vars=self.residualize_vars,
                                   boot_iter=self.boot_iter)
    self.EFA = EFA_Analysis(self.data, self.data_no_impute,
                            boot_iter=self.boot_iter)
    self.HCA = HCA_Analysis(dist_metric=self.dist_metric)
    # load the results from the saved object
    if saved_obj_file:
        self._load_results(saved_obj_file)
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Sun Apr 16 20:13:32 2017 @author: ian """ import glob import pandas as pd from selfregulation.utils.utils import get_behav_data df = get_behav_data(file='demographic_health.csv', full_dataset=True) failed_dataset = sorted(glob.glob('../Data/Failed*'))[0] failed_subjects = get_behav_data(dataset=failed_dataset, file='demographic_health.csv') fmri_dataset = sorted(glob.glob('../Data/Fmri*'))[0] fmri_subjects = get_behav_data(dataset=fmri_dataset, file='demographic_health.csv') fmri_subjects.index = ['fmri_' + i for i in fmri_subjects.index] all_subjects = pd.concat([df, failed_subjects, fmri_subjects]) # total of 662 workers in mturk sample worker_counts = pd.read_json('../Data/Local/worker_counts.json', typ='series') total_workers = len(worker_counts) + len(fmri_subjects) # separate into groups groups = all_subjects.groupby(['HispanicLatino', 'Sex', 'Race']).Age.count()
from selfregulation.utils.utils import get_info,get_behav_data basedir=get_info('base_directory') #dataset=get_info('dataset') if usefull: print('using full dataset') derived_dir=os.path.join(basedir,'Data/Derived_Data/%s'%dataset.replace('Discovery','Combined').replace('Validation','Combined')) else: print('using dataset:',dataset) derived_dir=os.path.join(basedir,'Data/Derived_Data/%s'%dataset) datadir=os.path.join(basedir,'data/%s'%dataset) if not os.path.exists(derived_dir): os.makedirs(derived_dir) print('saving to',derived_dir) data=get_behav_data(file='subject_x_items.csv',full_dataset=usefull) maxnans=5 fixdata=data.copy() dropped={} fixed={} for c in data.columns: f,dropflag=cleanup_item_dist(c,fixdata,verbose=False,minresp=min_freq) fixdata[c]=f u,h=get_respdist(f) if numpy.sum(numpy.isnan(data[c]),0)>maxnans: print('dropping %s due to too many NaNs'%c) dropflag=True if dropflag:
from expanalysis.experiments.psychological_models import fRL_Model
import numpy as np
import pandas as pd
from selfregulation.utils.utils import get_behav_data

# Fit the feature-based RL model to the first few shift-task workers.
data = get_behav_data(file='Individual_Measures/shift_task.csv.gz')
workers = data.worker_id.unique()

# test divergence between hierarchical and flat experts after training
models = []
# only the first 5 workers are fit (presumably to keep runtime manageable)
for worker in workers[0:5]:
    df = data.query("worker_id == '%s'" % worker)
    model = fRL_Model(df, decay_weights=True, verbose=True)
    model.optimize()
    models.append(model)
import matplotlib.pyplot as plt from os import path import pandas as pd import seaborn as sns from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names from selfregulation.utils.utils import get_behav_data, get_demographics, get_info base_dir = get_info('base_directory') ext = 'png' data = get_behav_data() # ************************************************************************* # Successful replications # ************************************************************************ # two_stage two_stage_df = get_behav_data( file='Individual_Measures/two_stage_decision.csv.gz') # subset two subjects who passed quality control successful_two_stage = data.filter(regex='two_stage').dropna(how='any').index two_stage_df = two_stage_df.query('worker_id in %s' % list(successful_two_stage)) two_stage_df = two_stage_df.query('rt_first != -1 and feedback_last in [0,1]') colors = sns.hls_palette(2) plot_df = (1 - two_stage_df.groupby([ 'worker_id', 'stage_transition_last', 'feedback_last' ]).switch.mean()).reset_index() plot_df.feedback_last = plot_df.feedback_last.replace({ 0: 'Unrewarded', 1: 'Rewarded' }) plot_df.stage_transition_last = \
import pandas as pd
import statsmodels.formula.api as smf
from selfregulation.utils.utils import get_behav_data

# Compute per-worker Simon-task interference measures: a sum-coded regression
# contrast, a median-RT contrast, and an accuracy contrast.
data = get_behav_data()
# workers whose simon DVs are missing in the main dataset are skipped below
problem_subj = data.filter(regex='simon').isnull().iloc[:,0]
df = get_behav_data(file='Individual_Measures/simon.csv.gz')
# test-phase trials with a recorded RT ("rt==rt" drops NaN RTs)
df = df.query('exp_stage == "test" and rt==rt')
params = {}
for worker in df.worker_id.unique():
    if problem_subj.loc[worker] == True:
        continue
    subset = df.query('worker_id=="%s"' % worker)
    # accuracy contrast uses all trials, before filtering to correct responses
    acc_contrast = subset.groupby('condition').correct.mean()
    # BUG FIX: this previously read from `rt_contrast`, which is not defined
    # until later in the loop, raising NameError; the accuracy difference must
    # come from acc_contrast.
    acc_diff = acc_contrast['incongruent']-acc_contrast['congruent']
    # RT analyses use correct trials only
    subset = subset.query('correct == True')
    rs = smf.ols(formula = 'rt ~ C(condition, Sum)', data = subset).fit()
    params[worker] = rs.params.tolist()
    # sum coding: rescale the condition coefficient so it reflects the full
    # incongruent-vs-congruent effect -- NOTE(review): confirm coding direction
    params[worker][1]*=-2
    rt_contrast = subset.groupby('condition').rt.median()
    diff = rt_contrast['incongruent']-rt_contrast['congruent']
    params[worker].append(diff)
    params[worker].append(acc_diff)
DVs = pd.DataFrame(params,
                   index=['Intercept','model_diff', 'diff', 'acc_diff']).T
@author: ian """ from os import path from dimensional_structure.utils import transfer_scores from selfregulation.utils.utils import get_behav_data, get_info from selfregulation.utils.result_utils import load_results # get contextualizing results results_dataset = 'Complete_03-29-2018' results = load_results(datafile=results_dataset) # get fmri data fmri_dataset = 'Fmri_Followup_10-22-2018' data = get_behav_data(dataset=fmri_dataset) # remove data where participants are missing more than 20% of the tasks tasks = data.copy() tasks.columns = [i.split('.')[0] for i in tasks.columns] successful_subjects = (tasks.isnull().reset_index().melt(id_vars=['index']) \ .groupby(['index', 'variable']).mean() \ .groupby('index').sum()<12) successful_subjects = successful_subjects[successful_subjects['value']] data = data.loc[successful_subjects.index] task_scores = transfer_scores(data, results['task']) survey_scores = transfer_scores(data, results['survey']) # save the scores basename = 'factorscores_results-%s.csv' % results_dataset task_scores.to_csv(
} dvs['taste_sensitivity'] = { 'value': rs.params['taste_diff'], 'valence': 'Neg' } description = """ Both taste and health sensitivity are calculated based on the decision phase. On each trial the participant indicates whether they would prefer a food option over a reference food. Their choice is regressed on the subjective health and taste difference between that option and the reference item. Positive values indicate that the option's higher health/taste relates to choosing the option more often """ return dvs, description # get data df = get_behav_data(dataset='Discovery_11-20-2016', file='Individual_Measures/dietary_decision.csv.gz') demo = get_behav_data(dataset='Discovery_11-20-2016', file='demographic_targets.csv') # calc DVs DVs, description = calc_dietary_decision_DV(df) for key, val in DVs.items(): for subj_key in val.keys(): val[subj_key] = val[subj_key]['value'] DVs = pandas.DataFrame.from_dict(DVs).T # do it the simpler way DV, valence, description = calc_exp_DVs(df)
def calc_EFA_retest_held_out(results, rotate='oblimin', verbose=True):
    """Evaluate EFA factor-score retest reliability using held-out subjects.

    Refits the EFA on subjects who did NOT complete the retest, then applies
    the resulting factor weights to independently cleaned T1 and T2 data of
    the retest subjects, and summarizes T1-T2 agreement per factor.

    Args:
        results: results object exposing .ID, .data, .dataset, and .EFA
        rotate: factor rotation passed through to the EFA routines
        verbose: unused in this function  # NOTE(review): remove or use

    Returns:
        tuple of (combined T1/T2 score DataFrame,
                  list of T1-T2 correlations, one per factor,
                  list of ICCs, one per factor,
                  (fa, output) from the held-out factor analysis)
    """
    # NOTE(review): `name` is shadowed by the loop variables below and is
    # never used afterwards
    name = results.ID.split('_')[0].title()
    orig_data = results.data
    # recover raw variable names from the skew-transformed column names
    positive_skewed = [
        i.replace('.logTr', '') for i in orig_data.columns if ".logTr" in i
    ]
    negative_skewed = [
        i.replace('.ReflogTr', '') for i in orig_data.columns
        if ".ReflogTr" in i
    ]
    DVs = [
        i.replace('.logTr', '').replace('.ReflogTr', '')
        for i in orig_data.columns
    ]
    orig_scores = results.EFA.get_scores(rotate=rotate)
    # load and clean retest data exactly like original data
    data_raw = get_behav_data(dataset=results.dataset,
                              file='meaningful_variables.csv')
    retest_data_raw = get_behav_data(dataset=results.dataset.replace(
        'Complete', 'Retest'),
                                     file='meaningful_variables.csv')
    # restrict both timepoints to subjects present at T1 and T2
    shared_ids = set(retest_data_raw.index) & set(data_raw.index)
    data_raw = data_raw.loc[shared_ids, :]
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    raw_data = {'T1': data_raw, 'T2': retest_data_raw}
    imputed_data = {}
    # clean each timepoint separately: skew transform, outlier removal,
    # imputation (missForest), then z-scoring
    for name, data in raw_data.items():
        tmp_data = data.loc[:, DVs]
        tmp_data = transform_remove_skew(tmp_data,
                                         positive_skewed=positive_skewed,
                                         negative_skewed=negative_skewed)
        tmp_data = remove_outliers(tmp_data)
        tmp_data_imputed, error = missForest(tmp_data)
        scaled_tmp_data = scale(tmp_data_imputed)
        imputed_data[name] = scaled_tmp_data
    # get subjects not in the retest set
    ind_data = orig_data.loc[set(orig_data.index) - shared_ids]
    # refit the factor solution on the held-out subjects only
    fa, output = psychFA(ind_data,
                         results.EFA.results['num_factors'],
                         method='ml',
                         rotate=rotate)
    weights = get_attr(fa, 'weights')
    scores = {}
    for name, data in imputed_data.items():
        suffix = ''
        if name == 'T2':
            suffix = 'T2'
        # project the cleaned data onto the held-out factor weights;
        # T1 columns keep a trailing space (empty suffix), T2 get ' T2'
        tmp_scores = pd.DataFrame(
            data.dot(weights),
            index=shared_ids,
            columns=[i + ' ' + suffix for i in orig_scores.columns])
        scores[name] = tmp_scores
    combined = pd.concat([scores['T1'], scores['T2']], axis=1)
    # diagonal of the T1 x T2 correlation block: same-factor retest r
    cross_diag = [
        combined.corr().iloc[i, i + len(orig_scores.columns)]
        for i in range(len(orig_scores.columns))
    ]
    # get ICCs
    ICCs = []
    for col in scores['T1'].columns:
        # NOTE(review): the factor name is used as a regex; names containing
        # regex metacharacters (e.g. 'Perc / Resp') may match unexpectedly
        tmp = combined.filter(regex=col)
        out = psych.ICC(tmp)
        ICCs.append(list(out[0][1])[-1])
    return combined, cross_diag, ICCs, (fa, output)
import numpy as np from selfregulation.utils.utils import get_behav_data, get_recent_dataset dataset = get_recent_dataset() items = get_behav_data(dataset=dataset, file='items.csv.gz') subject_items = get_behav_data(dataset=dataset, file='subject_x_items.csv') # get fmri items alone # filter for fmri items grit_items = [ 'New ideas and projects sometimes distract me from previous ones.', 'Setbacks don\'t discourage me.', 'I have been obsessed with a certain idea or project for a short time but later lost interest.', 'I am a hard worker.', 'I often set a goal but later choose to pursue a different one.', 'I have difficulty maintaining my focus on projects that take more than a few months to complete.', 'I finish whatever I begin.', 'I am diligent.' ] brief_items = [ 'I am good at resisting temptation.', 'I have a hard time breaking bad habits.', 'I am lazy.', 'I say inappropriate things.', 'I do certain things that are bad for me, if they are fun.', 'I refuse things that are bad for me.', 'I wish I had more self-discipline.', 'People would say that I have iron self-discipline.', 'Pleasure and fun sometimes keep me from getting work done.', 'I have trouble concentrating.', 'I am able to work effectively toward long-term goals.', 'Sometimes I can\'t stop myself from doing something, even if I know it is wrong.',
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from dimensional_structure.prediction_utils import run_prediction
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.utils import get_recent_dataset, get_behav_data

# Compare externally computed factor scores (read from local CSVs) against
# the scores stored in the most recent dataset's results object.
results = load_results(get_recent_dataset())
data = get_behav_data(file='variables_exhaustive.csv')
# get demographics
full_demog = results['task'].DA.get_scores()
full_demog.columns = ['full_' + c for c in full_demog.columns]
# NOTE(review): hard-coded user-specific path; breaks on any other machine
demog = pd.read_csv('/home/ian/Downloads/demog_fa_scores_t1.csv', index_col=0)
# get predictors
# NOTE(review): hard-coded user-specific path; breaks on any other machine
ddm_factors = pd.read_csv('/home/ian/Downloads/ez_t1_fa_3_scores.csv',
                          index_col=0)
ontology_factors = results['task'].EFA.get_scores()
ontology_ddm_factors = ontology_factors[['Speeded IP', 'Caution',
                                         'Perc / Resp']]
# # compare demographics
# element-wise difference between external and stored demographic scores,
# aligned on the external index
diff = pd.DataFrame(demog.values - full_demog.loc[demog.index].values,
                    index=demog.index,
                    columns=demog.columns)
# cross-correlation block: external scores (rows) vs stored scores (columns)
corr = demog.join(full_demog).corr().iloc[:len(demog.columns),
                                          len(demog.columns):]
# EZ vars
EZ_vars = data.filter(regex='EZ_(non_decision|drift|thresh)$')
hddm_vars = data.filter(regex='hddm_(non_decision|drift|thresh)$')
varNums=m['dataElements'][e]['subscaleVarNums'] else: varNums='' if k=='eating_survey': varNums='custom(%s)'%varNums measure_level_data.append([vname,shortname, m['title'],m['dataElements'][e]['title'], m['measureType'],varNums]) measure_level_df=pandas.DataFrame(measure_level_data, columns=['ExpFactoryName','ShortName','MeasureName','VariableName', 'MeasureType','SubscaleVarNums']) # doublecheck that all meaningful variables are here dataset=get_info('dataset') behavdata=get_behav_data(dataset) measurevars=measure_level_df.ExpFactoryName.tolist() for v in behavdata.columns: assert v in measurevars measure_level_df.to_csv('meaningful_variables_metadata.csv',index=False) #save item level data item_level_df=pandas.DataFrame(item_level_data, columns=['ExpFactoryName','MeasureName','QuestionNumber', 'QuestionText','Scoring','ResponseOptions']) item_level_df.to_csv('item_level_metadata.csv',index=False) outcome_df=pandas.DataFrame(outcome_data, columns=['ExpFactoryName','MeasureName', 'QuestionText','Scoring','ResponseOptions']) outcome_df.to_csv('outcome_metadata.csv',index=False)
ax = plt.gca() ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.text(5, 400, 'Completion Rate: %s' % completion_rate, size=20) ax.text(5, 350, 'Passed QC: %s' % analyzed_rate, size=20) plt.xlabel('Number of Tasks Completed', fontsize=20) plt.savefig(save_dir, dpi=300, bbox_inches='tight') """ # **************************************************************************** # plot psychometric reliability # **************************************************************************** sns.set_context('poster') dataset = get_recent_dataset() meaningful_vars = get_behav_data(dataset=dataset, file='meaningful_variables_imputed.csv').columns meaningful_vars = [i.replace('.logTr','') for i in meaningful_vars] meaningful_vars = [i.replace('.ReflogTr','') for i in meaningful_vars] retest_data = get_retest_data(dataset=dataset.replace('Complete','Retest')) # only select meaningful variables retest_data = retest_data.query('dv in %s' % list(meaningful_vars)) # create reliability dataframe measure_cat = [get_var_category(v).title() for v in retest_data.index] retest_data.loc[:,'Measure Category'] = measure_cat Survey_N = np.sum(retest_data.loc[:, 'Measure Category']=='Survey') Task_N = len(retest_data)-Survey_N def plot_retest_data(retest_data, size=4.6, save_dir=None): colors = [sns.color_palette('Reds_d',3)[0], sns.color_palette('Blues_d',3)[0]] f = plt.figure(figsize=(size,size*.75))
import numpy as np import pandas as pd from sklearn.linear_model import LinearRegression from dimensional_structure.prediction_utils import run_prediction from selfregulation.utils.result_utils import load_results from selfregulation.utils.utils import get_recent_dataset, get_behav_data results = load_results(get_recent_dataset()) data = get_behav_data(file='variables_exhaustive.csv') # get demographics full_demog = results['task'].DA.get_scores() full_demog.columns = ['full_' + c for c in full_demog.columns] demog = pd.read_csv('/home/ian/Downloads/demog_fa_scores_t1.csv', index_col=0) # get predictors ddm_factors = pd.read_csv('/home/ian/Downloads/ez_t1_fa_3_scores.csv', index_col=0) ontology_factors = results['task'].EFA.get_scores() ontology_ddm_factors = ontology_factors[[ 'Speeded IP', 'Caution', 'Perc / Resp' ]] # # compare demographics diff = pd.DataFrame(demog.values - full_demog.loc[demog.index].values, index=demog.index, columns=demog.columns) corr = demog.join(full_demog).corr().iloc[:len(demog.columns), len(demog.columns):]
@author: ian """ import numpy as np from os import makedirs, path import pandas as pd import pickle from sklearn.covariance import GraphLassoCV from sklearn.preprocessing import scale from dimensional_structure.graph_utils import Graph_Analysis from selfregulation.utils.utils import get_behav_data, get_recent_dataset from selfregulation.utils.result_utils import load_results from selfregulation.utils.r_to_py_utils import qgraph_cor dataset = get_recent_dataset() data = get_behav_data(dataset=dataset, file='meaningful_variables_imputed.csv') all_results = load_results(dataset) def get_EFA_HCA(results, EFA): if EFA == False: return results.HCA.results['data'] else: c = results.EFA.results['num_factors'] return results.HCA.results['EFA%s_oblimin' % c] EFA = True survey_HCA = get_EFA_HCA(all_results['survey'], EFA) survey_order = survey_HCA['reorder_vec'] task_HCA = get_EFA_HCA(all_results['task'], EFA)
from expanalysis.experiments.psychological_models import MoE_Model import numpy as np import pandas as pd from selfregulation.utils.utils import get_behav_data data = get_behav_data(file='Individual_Measures/hierarchical_rule.csv.gz') workers = data.worker_id.unique() data = data.query("worker_id == '%s'" % workers[0]) from scipy.optimize import minimize def eval_MoE(fit_args, passed_args): args = { 'kappa': fit_args[0], 'zeta': fit_args[1], 'xi': fit_args[2], 'beta2' : fit_args[3], 'beta3': fit_args[4], 'alphaC': fit_args[5], 'alphaO': fit_args[6], 'alphaS': fit_args[7], 'beta_hierarchy': fit_args[8], 'data': data, } MoE_model = MoE_Model(**args) likelihoods = [] for i, trial in data.iterrows(): if trial.key_press != -1:
# -*- coding: utf-8 -*- from expanalysis.experiments.ddm_utils import get_HDDM_fun, load_model from selfregulation.utils.utils import get_behav_data import hddm import numpy as np # test HDDM calculation from processed task task = 'stroop' df = get_behav_data(file='Individual_Measures/%s.csv.gz' % task) df = df.query('worker_id in %s' % list(df.worker_id.unique()[0:15])) fun = get_HDDM_fun(task, samples=20, burn=10, outfile='/home/ian/tmp/stroop', parallel=True) out = fun(df) acc = df.groupby('worker_id').correct.mean() rt = df.groupby('worker_id').rt.median() for var in ['hddm_drift', 'hddm_thresh', 'hddm_non_decision']: ddm_vars = [out[k][var]['value'] for k in sorted(out.keys())] print(var) print('Correlation with Acc: ', np.corrcoef(acc, ddm_vars)[0, 1]) print('Correlation with RT: ', np.corrcoef(rt, ddm_vars)[0, 1]) samples = 20 burn = 10 thin = 1 response_col = 'correct'
def __init__(self, datafile=None, loading_thresh=None, dist_metric=distcorr,
             boot_iter=1000, name='', filter_regex='.', ID=None,
             results_dir=None, residualize_vars=None,
             saved_obj_file=None):
    """
    Args:
        datafile: name of a directory in "Data"
        loading_thresh: threshold to use for factor analytic result
        dist_metric: distance metric for hierarchical clustering that is
            passed to pdist
        boot_iter: number of bootstrap iterations for the analysis classes
        name: string to append to ID, default to empty string
        filter_regex: regex string passed to data.filter
        ID: specify if a specific ID is desired
        results_dir: where to save results
        residualize_vars: demographic variables to residualize out;
            defaults to ['Age', 'Sex']
        saved_obj_file: previously saved object to initialize from
    """
    # avoid a mutable default argument; preserves the original default
    if residualize_vars is None:
        residualize_vars = ['Age', 'Sex']
    assert datafile is not None or saved_obj_file is not None
    # initialize with the saved object if available
    if saved_obj_file:
        self._load_init(saved_obj_file)
    else:
        # set vars
        self.dataset = datafile
        # BUGFIX: was hard-coded to None, silently discarding the
        # loading_thresh argument
        self.loading_thresh = loading_thresh
        self.dist_metric = dist_metric
        self.boot_iter = boot_iter
        self.residualize_vars = residualize_vars
        if ID is None:
            # random 16-bit tag keeps parallel runs from colliding
            self.ID = '%s_%s' % (name, str(random.getrandbits(16)))
        else:
            self.ID = '%s_%s' % (name, str(ID))
        # set up output files
        self.results_dir = results_dir
        # load data
        self.data = get_behav_data(dataset=datafile,
                                   file='meaningful_variables_imputed.csv',
                                   filter_regex=filter_regex,
                                   verbose=True)
        self.data_no_impute = get_behav_data(
            dataset=datafile,
            file='meaningful_variables_clean.csv',
            filter_regex=filter_regex,
            verbose=True)
        self.demographics = get_demographics()
    # initialize analysis classes
    self.DA = Demographic_Analysis(self.demographics,
                                   residualize_vars=self.residualize_vars,
                                   boot_iter=self.boot_iter)
    self.EFA = EFA_Analysis(self.data, self.data_no_impute,
                            boot_iter=self.boot_iter)
    self.HCA = HCA_Analysis(dist_metric=self.dist_metric)
    # load the results from the saved object
    if saved_obj_file:
        self._load_results(saved_obj_file)
import math import matplotlib.pyplot as plt import numpy as np from os import path import pandas as pd import seaborn as sns from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names from selfregulation.utils.utils import filter_behav_data, get_behav_data, get_demographics, get_info # correlation of ravens and literature # replication of "Intelligence and socioeconomic success: A meta-analytic # review of longitudinal research" base_dir = get_info('base_directory') ext = 'png' data = get_behav_data() demographics = get_demographics() data = data.loc[demographics.index] # get dataframe of intelligence measure (raven's progressive matrices) and demographics) df = pd.concat([data.filter(regex='raven'), demographics], axis=1) # get raven's reliability reliability = get_behav_data(dataset='Retest_02-03-2018', file='bootstrap_merged.csv.gz') raven_reliability = reliability.groupby('dv').icc.mean().filter( regex='raven')[0] # demographic reliabilities demo_reliabilities = [1.0] * demographics.shape[1] # correlations correlations = df.corr().filter(regex='raven').sort_values(
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Mon Jul 10 18:47:50 2017 @author: ian """ import pickle from expanalysis.experiments.processing import calc_exp_DVs import pandas as pd from selfregulation.utils.utils import get_behav_data df = get_behav_data(file='Individual_Measures/two_stage_decision.csv.gz') workers = list(df.worker_id.unique()) df = df.query('worker_id in %s' % workers) DV_tests = [] for repeats in range(100): print(repeats) DVs, valence, description = calc_exp_DVs(df) DVs.columns = [i + '_run%s' % str(repeats) for i in DVs.columns] DV_tests.append(DVs) DV_tests = pd.concat(DV_tests, axis=1) DV_tests.to_pickle('two_stage_tests.pkl') N = len(DV_tests.columns) corr = DV_tests.corr() DV_reliabilities = {} for c in range(5):
# save gelman vals pickle.dump(gelman_vals, open(path.join('hddm_output', 'gelman_vals.pkl'), 'wb')) # ******************************************* # Overview of different ways we can assess HDDM convergence # Useful functions: # PYMC: https://healthyalgorithms.com/2010/10/19/mcmc-in-python-how-to-stick-a-statistical-model-on-a-system-dynamics-model-in-pymc/ # m.gen_stats: get stats matrix # m.print_stats: print stats matrix # m.dic_info: fit indices # ******************************************* task = 'stroop' N = 20 # number of workers full_data = get_behav_data(file='Individual_Measures/%s.csv.gz' % task) # let's only look at a few workers data = full_data.query('worker_id in %s' % list(full_data.worker_id.unique()[0:N])) outputs = run_model(task, data) m=outputs[0] # after the model is made is generates a .db file, a data file and a 'model' file # the data file is a transformation of the data we put in - with different scales for # rt, different columns, and some rows remove data_in = pd.read_csv('%s_data.csv' % task) # Example of extracting stats # get stats for individual drift rates stats= m.gen_stats().filter(regex='a_subj.', axis=0) # the mc error reflects the variance around the estimate stats['mc err']
out = clf.coef_ if len(out.shape) == 1: out = out.reshape(1, -1) out = pd.DataFrame(out, columns=scores.columns) out.index = data.columns return out # do mapping dataset = get_recent_dataset() # load ontology results = load_results(datafile=dataset) task_loadings = results['task'].EFA.get_loading() task_scores = results['task'].EFA.get_scores() # load all DVs all_DVs = get_behav_data(file='variables_exhaustive.csv', dataset=dataset) contrast_loadings = {} for contrast, relation in fmri_ontology_mapping.items(): # if relation is in the direct mapping if relation.lstrip('-') in task_loadings.index: task_loading = task_loadings.loc[relation.lstrip('-')] if relation[0] == '-': task_loading = task_loading * -1 # otherwise, reconstruct! else: unmapped_data = all_DVs.loc[:, relation.lstrip('-')] missing = unmapped_data[unmapped_data.isnull()].index task_loading = run_linear(pd.DataFrame(unmapped_data.drop(missing)), task_scores.drop(missing)).iloc[0, :] contrast_loadings[contrast] = task_loading
import os
import numpy as np
from selfregulation.utils.utils import get_behav_data
import seaborn as sns

# Ensure the output directory for the plots exists
if not os.path.exists('Plots'):
    os.mkdir('Plots')

# DV dataframe: one column per dependent variable, named "<task>.<dv>"
DV_df = get_behav_data()

# Produce one pair plot per task over that task's DVs only
task_names = np.unique(DV_df.columns.map(lambda col: col.split('.')[0]))
for task in task_names:
    task_dvs = DV_df.filter(regex='^%s' % task)
    # drop subjects with no data for this task, then any sparse columns
    task_dvs = task_dvs.dropna(how='all').dropna(axis=1)
    sns.set(font_scale=1.5)
    grid = sns.pairplot(task_dvs, kind='reg', size=5, diag_kws={'bins': 50})
    grid.savefig('Plots/%s_pair_plot.pdf' % task, dpi=300)
@author: ian """ from os import path from dimensional_structure.utils import transfer_scores from selfregulation.utils.utils import get_behav_data, get_info from selfregulation.utils.result_utils import load_results # get contextualizing results results_dataset = 'Complete_03-29-2018' results = load_results(datafile=results_dataset) # get fmri data fmri_dataset= 'Fmri_Followup_10-22-2018' data = get_behav_data(dataset=fmri_dataset) # remove data where participants are missing more than 20% of the tasks tasks = data.copy() tasks.columns = [i.split('.')[0] for i in tasks.columns] successful_subjects = (tasks.isnull().reset_index().melt(id_vars=['index']) \ .groupby(['index', 'variable']).mean() \ .groupby('index').sum()<12) successful_subjects = successful_subjects[successful_subjects['value']] data = data.loc[successful_subjects.index] task_scores = transfer_scores(data, results['task']) survey_scores = transfer_scores(data, results['survey']) # save the scores basename = 'factorscores_results-%s.csv' % results_dataset task_scores.to_csv(path.join(get_info('base_directory'),