def calc_EFA_retest_held_out(results, rotate='oblimin', verbose=True):
    """Compute EFA factor-score retest reliability using held-out subjects.

    Factor weights are re-estimated on subjects *not* in the retest sample,
    then applied to the (identically cleaned) T1 and T2 data of the retest
    subjects.

    Args:
        results: results object exposing .ID, .data, .dataset and .EFA
        rotate: factor rotation label passed through to the EFA routines
        verbose: unused; kept for signature consistency with calc_EFA_retest

    Returns:
        tuple of (combined T1/T2 score DataFrame, list of per-factor T1-T2
        cross correlations, list of per-factor ICCs, (fa, output) from the
        psychFA run on the held-out subjects)
    """
    name = results.ID.split('_')[0].title()
    orig_data = results.data
    # recover the untransformed DV names so the raw files can be cleaned
    # the same way the original data were
    positive_skewed = [i.replace('.logTr', '') for i in orig_data.columns if ".logTr" in i]
    negative_skewed = [i.replace('.ReflogTr', '') for i in orig_data.columns if ".ReflogTr" in i]
    DVs = [i.replace('.logTr','').replace('.ReflogTr','') for i in orig_data.columns]
    orig_scores = results.EFA.get_scores(rotate=rotate)

    # load and clean retest data exactly like original data
    data_raw = get_behav_data(dataset=results.dataset,
                              file='meaningful_variables.csv')
    retest_data_raw = get_behav_data(dataset=results.dataset.replace('Complete','Retest'),
                                     file='meaningful_variables.csv')
    shared_ids = set(retest_data_raw.index) & set(data_raw.index)
    data_raw = data_raw.loc[shared_ids, :]
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    raw_data = {'T1': data_raw, 'T2': retest_data_raw}
    imputed_data = {}
    # BUG FIX: loop variable renamed from "name", which clobbered the
    # dataset title computed above
    for timepoint, data in raw_data.items():
        tmp_data = data.loc[:, DVs]
        tmp_data = transform_remove_skew(tmp_data,
                                         positive_skewed=positive_skewed,
                                         negative_skewed=negative_skewed)
        tmp_data = remove_outliers(tmp_data)
        tmp_data_imputed, error = missForest(tmp_data)
        scaled_tmp_data = scale(tmp_data_imputed)
        imputed_data[timepoint] = scaled_tmp_data

    # re-estimate factor weights on subjects not in the retest set
    ind_data = orig_data.loc[set(orig_data.index)-shared_ids]
    fa, output = psychFA(ind_data, results.EFA.results['num_factors'],
                         method='ml', rotate=rotate)
    weights = get_attr(fa, 'weights')
    scores = {}
    for timepoint, data in imputed_data.items():
        # label only the T2 columns; previously T1 columns ended in a
        # stray trailing space ("Factor " from i+' '+'')
        suffix = ' T2' if timepoint == 'T2' else ''
        tmp_scores = pd.DataFrame(data.dot(weights),
                                  index=shared_ids,
                                  columns=[i+suffix for i in orig_scores.columns])
        scores[timepoint] = tmp_scores
    combined = pd.concat([scores['T1'], scores['T2']], axis=1)
    # each factor correlated with its own T2 score (off-diagonal quadrant)
    cross_diag = [combined.corr().iloc[i, i+len(orig_scores.columns)]
                  for i in range(len(orig_scores.columns))]
    # get ICCs
    ICCs = []
    for col in scores['T1'].columns:
        tmp = combined.filter(regex=col)
        out = psych.ICC(tmp)
        ICCs.append(list(out[0][1])[-1])
    return combined, cross_diag, ICCs, (fa, output)
def calc_EFA_retest(results, rotate='oblimin', verbose=True):
    """Compute retest reliability of EFA factor scores.

    Retest-session raw data are projected onto the original factor solution
    (via transfer_scores) and compared to the original scores for the same
    subjects.

    Args:
        results: results object exposing .ID, .data, .dataset and .EFA
        rotate: factor rotation label passed through to the EFA routines
        verbose: if True, print the average and per-factor T1/retest
            correlations

    Returns:
        tuple of (combined T1/retest score DataFrame, per-factor cross
        correlations, per-factor ICCs)
    """
    # dataset title, used only for verbose printing
    name = results.ID.split('_')[0].title()
    retest_data_raw = get_behav_data(dataset=results.dataset.replace(
        'Complete', 'Retest'),
                                     file='meaningful_variables.csv')
    # subjects present in both the original and the retest samples
    shared_ids = set(retest_data_raw.index) & set(results.data.index)
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    retest_scores = transfer_scores(retest_data_raw, results, rotate=rotate)
    retest_scores.columns = [str(i) + ' Retest' for i in retest_scores.columns]
    # scale and perform the factor score transformation
    EFA = results.EFA
    c = EFA.get_c()
    ref_scores = EFA.get_scores(c=c,
                                rotate=rotate).loc[retest_data_raw.index, :]

    # reorder scores
    if rotate == 'oblimin':
        reorder_vec = EFA.get_factor_reorder(c, rotate=rotate)
        ref_scores = ref_scores.iloc[:, reorder_vec]
        retest_scores = retest_scores.iloc[:, reorder_vec]
    combined = pd.concat([ref_scores, retest_scores], axis=1)
    # diagonal of the retest-vs-original quadrant of the correlation matrix:
    # each factor correlated with its own retest score
    cross_diag = np.diag(combined.corr().iloc[c:, :c])
    # get ICCs (last entry of the summary row psych.ICC returns per pair)
    ICCs = []
    for col in ref_scores.columns:
        tmp = combined.filter(regex=str(col))
        out = psych.ICC(tmp)
        ICCs.append(list(out[0][1])[-1])
    if verbose:
        print('%s, Avg Correlation: %s\n' %
              (name, format_num(np.mean(cross_diag))))
        for factor, num in zip(ref_scores.columns, cross_diag):
            print('%s: %s' % (factor, format_num(num)))
    return combined, cross_diag, ICCs
def calc_EFA_retest(results, rotate='oblimin', verbose=True):
    """Compute retest reliability of EFA factor scores.

    Projects retest-session raw data onto the original factor solution and
    compares the resulting scores against the original subjects' scores.

    Args:
        results: results object exposing .ID, .data, .dataset and .EFA
        rotate: factor rotation label passed to the EFA routines
        verbose: if True, print average and per-factor correlations

    Returns:
        tuple of (combined score DataFrame, per-factor cross correlations,
        per-factor ICCs)
    """
    title = results.ID.split('_')[0].title()
    retest_dataset = results.dataset.replace('Complete', 'Retest')
    t2_raw = get_behav_data(dataset=retest_dataset,
                            file='meaningful_variables.csv')
    # keep only subjects present at both sessions
    overlap = set(t2_raw.index) & set(results.data.index)
    t2_raw = t2_raw.loc[overlap, :]
    t2_scores = transfer_scores(t2_raw, results, rotate=rotate)
    t2_scores.columns = ['%s Retest' % col for col in t2_scores.columns]
    # scale and perform the factor score transformation
    EFA = results.EFA
    c = EFA.get_c()
    t1_scores = EFA.get_scores(c=c, rotate=rotate).loc[t2_raw.index, :]

    # reorder scores
    if rotate == 'oblimin':
        order = EFA.get_factor_reorder(c, rotate=rotate)
        t1_scores = t1_scores.iloc[:, order]
        t2_scores = t2_scores.iloc[:, order]
    combined = pd.concat([t1_scores, t2_scores], axis=1)
    # each factor correlated with its own retest score
    cross_diag = np.diag(combined.corr().iloc[c:, :c])
    # get ICCs
    ICCs = []
    for col in t1_scores.columns:
        paired = combined.filter(regex=str(col))
        icc_out = psych.ICC(paired)
        ICCs.append(list(icc_out[0][1])[-1])
    if verbose:
        print('%s, Avg Correlation: %s\n' % (title,
                                             format_num(np.mean(cross_diag))))
        for factor, val in zip(t1_scores.columns, cross_diag):
            print('%s: %s' % (factor, format_num(val)))
    return combined, cross_diag, ICCs
def assess_convergence(task, reps=5):
    """Fit the model for `task` repeatedly to gauge fit stability.

    Restricts the data to the first 20 unique workers, then runs
    run_model `reps` times on that same subset.

    Args:
        task: task name (matches Individual_Measures/<task>.csv.gz)
        reps: number of repeated model fits

    Returns:
        dict mapping the task name to the list of run_model outputs
    """
    # load data and restrict to the first 20 workers
    raw = get_behav_data(file='Individual_Measures/%s.csv.gz' % task)
    subset_workers = list(raw.worker_id.unique()[0:20])
    raw = raw.query('worker_id in %s' % subset_workers)
    return {task: [run_model(task, raw) for _ in range(reps)]}
# --- Example #5 (extraction artifact: pastebin example separator, score "0") ---
def assess_convergence(task, reps=5):
    """Run the model `reps` times on the same data to gauge fit stability.

    Uses only the first 20 unique workers of the task's raw data.

    Args:
        task: task name (matches Individual_Measures/<task>.csv.gz)
        reps: number of repeated model fits

    Returns:
        dict mapping the task name to the list of run_model outputs
    """
    # load data
    data = get_behav_data(file='Individual_Measures/%s.csv.gz' % task)
    # restrict to the first 20 workers to keep repeated fits tractable
    data = data.query('worker_id in %s' % list(data.worker_id.unique()[0:20]))
    outputs = []
    for _ in range(reps):
        output = run_model(task, data)
        outputs.append(output)
    return {task: outputs}
def get_retest_comparison_data():
    """Assemble one cleaned DataFrame from the three DV subset files.

    Each subset is outlier-filtered and de-skewed; columns already
    collected from an earlier subset are dropped, so the first file
    listed wins on duplicate DVs.

    Returns:
        DataFrame with one column per unique cleaned DV.
    """
    files = ('meaningful_variables_noDDM.csv',
             'meaningful_variables_EZ.csv',
             'meaningful_variables_hddm.csv')
    combined = pd.DataFrame()
    for filename in files:
        cleaned = remove_outliers(get_behav_data(file=filename))
        cleaned = transform_remove_skew(cleaned)
        # columns already contributed by an earlier subset
        duplicates = set(combined) & set(cleaned)
        cleaned = cleaned.drop(labels=duplicates, axis=1)
        combined = pd.concat([combined, cleaned], axis=1)
    return combined
# --- Example #7 (extraction artifact: pastebin example separator, score "0") ---
def get_retest_comparison_data():
    """Concatenate the three DV subset files into one cleaned DataFrame.

    Each subset is outlier-filtered and de-skewed; columns already present
    from an earlier subset are dropped, so the first file listed wins on
    duplicate DVs.

    Returns:
        DataFrame with one column per unique cleaned DV.
    """
    subsets = [
        'meaningful_variables_noDDM.csv', 'meaningful_variables_EZ.csv',
        'meaningful_variables_hddm.csv'
    ]
    dataset = pd.DataFrame()
    for subset in subsets:
        df = get_behav_data(file=subset)
        df_clean = remove_outliers(df)
        df_clean = transform_remove_skew(df_clean)
        # columns already collected from a previous subset
        drop_columns = set(dataset) & set(df_clean)
        df_clean.drop(labels=drop_columns, axis=1, inplace=True)
        dataset = pd.concat([dataset, df_clean], axis=1)
    return dataset
import matplotlib.pyplot as plt
from os import path
import pandas as pd
import seaborn as sns
from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names
from selfregulation.utils.utils import get_behav_data, get_demographics, get_info

base_dir = get_info('base_directory')
ext= 'png'
data = get_behav_data()

# *************************************************************************
# Successful replications
# ************************************************************************
# two_stage
two_stage_df = get_behav_data(file='Individual_Measures/two_stage_decision.csv.gz')
# subset to subjects who passed quality control (no missing two_stage DVs)
successful_two_stage = data.filter(regex='two_stage').dropna(how='any').index
two_stage_df = two_stage_df.query('worker_id in %s' % list(successful_two_stage))
# drop missed first-stage responses and trials without binary feedback
two_stage_df = two_stage_df.query('rt_first != -1 and feedback_last in [0,1]')
colors = sns.hls_palette(2)
# stay probability = 1 - switch rate, per worker x transition x reward cell
plot_df = (1-two_stage_df.groupby(['worker_id','stage_transition_last','feedback_last']).switch.mean()).reset_index()
plot_df.feedback_last = plot_df.feedback_last.replace({0:'Unrewarded', 1:'Rewarded'})
plot_df.stage_transition_last = \
    plot_df.stage_transition_last.replace({'infrequent':'Rare', 'frequent':'Common'})
# shift
shift_df = get_behav_data(file='Individual_Measures/shift_task.csv.gz')
# subset to subjects who passed quality control (no missing shift DVs)
successful_shift = data.filter(regex='shift').dropna(how='any').index
shift_df = shift_df.query('worker_id in %s' % list(successful_shift))
# drop missed responses
shift_df = shift_df.query('rt != -1')
import pandas as pd
import statsmodels.formula.api as smf
from selfregulation.utils.utils import get_behav_data

# Simon task: per-worker congruency effects on RT (model-based coefficient
# and median difference) plus the accuracy difference between conditions.
data = get_behav_data()
# workers whose simon DVs are missing failed QC for this task
problem_subj = data.filter(regex='simon').isnull().iloc[:, 0]
df = get_behav_data(file='Individual_Measures/simon.csv.gz')
# keep test-phase trials with a recorded RT (rt==rt filters out NaNs)
df = df.query('exp_stage == "test" and rt==rt')

params = {}
for worker in df.worker_id.unique():
    if problem_subj.loc[worker] == True:
        continue
    subset = df.query('worker_id=="%s"' % worker)
    acc_contrast = subset.groupby('condition').correct.mean()
    # BUG FIX: the accuracy difference must come from acc_contrast; the
    # original read rt_contrast, which is not defined until further below
    # (NameError on the first worker, stale value afterwards)
    acc_diff = acc_contrast['incongruent'] - acc_contrast['congruent']
    subset = subset.query('correct == True')
    # sum-coded OLS: condition effect on RT
    rs = smf.ols(formula='rt ~ C(condition, Sum)', data=subset).fit()
    params[worker] = rs.params.tolist()
    # convert the sum-coded coefficient to an incongruent-congruent difference
    params[worker][1] *= -2
    rt_contrast = subset.groupby('condition').rt.median()
    diff = rt_contrast['incongruent'] - rt_contrast['congruent']
    params[worker].append(diff)
    params[worker].append(acc_diff)

DVs = pd.DataFrame(params,
                   index=['Intercept', 'model_diff', 'diff', 'acc_diff']).T
        else:
            d=[]
            for i in subscale_data.loc[v]:
                try:
                    d.append(str(int(i)))
                except:
                    pass
            subscale_var_dict[v]=':'.join(d)
    return subscale_var_dict

subscale_var_dict=get_subscale_vars()

# first get variable-level metadata
# based on variable set in meaningful_variables

behavdata=get_behav_data(dataset)
measures={}
for c in list(behavdata.columns):
    c_s=c.split('.')
    m=c_s[0]
    v='.'.join(c_s[1:])
    if not m in measures:
        measures[m]={'dataElements':[]}
    measures[m]['dataElements'].append(v)

metadata={}
# three entries are: class, type, and whether we are looking at beginning
# of string - this lets us find differences
task_vars=[('hddm_drift','DDMDriftRate','rate',True),
            ("hddm_non_decision",'DDMNondecisionTime','seconds',True),
            ('hddm_thresh','DDMThreshold','other',True),
# --- Example #11 (extraction artifact: pastebin example separator, score "0") ---
                    label=name,
                    ax=ax1,
                    scatter_kws={
                        's': 100,
                        'alpha': .4
                    })
    ax1.legend()
    sns.boxplot('split_time', 'value', hue='variable', data=melted, ax=ax2)
    if title:
        plt.suptitle(title, fontsize=18)
    plt.show()


verbose = True
# load data
behav_data = get_behav_data(file='meaningful_variables_imputed.csv')
measures = np.unique([i.split('.')[0] for i in behav_data.columns])
time_effects = {}

for measure_name in measures[0:10]:
    measure = get_behav_data(file='Individual_Measures/%s.csv.gz' %
                             measure_name)
    measure_DVs = behav_data.filter(regex=measure_name)
    measure_DVs.columns = [i.split('.')[1] for i in measure_DVs.columns]
    # scale
    measure_DVs = pd.DataFrame(scale(measure_DVs),
                               index=measure_DVs.index,
                               columns=measure_DVs.columns)

    finishtimes = measure.groupby('worker_id').finishtime.apply(
        lambda x: np.unique(x)[0])
    out = clf.coef_
    if len(out.shape)==1:
        out = out.reshape(1,-1)
    out = pd.DataFrame(out, columns=scores.columns)
    out.index = data.columns
    return out

# do mapping
dataset = get_recent_dataset()
# load ontology
results = load_results(datafile=dataset)
task_loadings = results['task'].EFA.get_loading()
task_scores = results['task'].EFA.get_scores()
# load all DVs
all_DVs = get_behav_data(file='variables_exhaustive.csv', dataset=dataset)



contrast_loadings = {}
for contrast, relation in fmri_ontology_mapping.items():
    # if relation is in the direct mapping
    if relation.lstrip('-') in task_loadings.index:
        task_loading = task_loadings.loc[relation.lstrip('-')]
        if relation[0] == '-':
            task_loading = task_loading*-1
    # otherwise, reconstruct!
    else:
        unmapped_data = all_DVs.loc[:,relation.lstrip('-')]
        missing = unmapped_data[unmapped_data.isnull()].index
        task_loading = run_linear(pd.DataFrame(unmapped_data.drop(missing)), 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 16 20:13:32 2017

@author: ian
"""
import glob
import pandas as pd
from selfregulation.utils.utils import get_behav_data




df = get_behav_data(file='demographic_health.csv', full_dataset=True)

failed_dataset = sorted(glob.glob('../Data/Failed*'))[0]
failed_subjects = get_behav_data(dataset=failed_dataset, file='demographic_health.csv')


fmri_dataset = sorted(glob.glob('../Data/Fmri*'))[0]
fmri_subjects = get_behav_data(dataset=fmri_dataset, file='demographic_health.csv')
fmri_subjects.index = ['fmri_'+i for i in fmri_subjects.index]

all_subjects = pd.concat([df,failed_subjects,fmri_subjects])

# total of 662 workers in mturk sample
worker_counts = pd.read_json('../Data/Local/worker_counts.json', typ='series')
total_workers = len(worker_counts)+len(fmri_subjects)

# separate into groups
import matplotlib.pyplot as plt
import numpy as np
from os import path
import pandas as pd
import seaborn as sns
from selfregulation.utils.plot_utils import beautify_legend
from selfregulation.utils.utils import get_behav_data, get_info

base_dir = get_info('base_directory')
ext = 'png'
data = get_behav_data()

# our data
dpx = get_behav_data(file='Individual_Measures/dot_pattern_expectancy.csv.gz')
dpx = dpx.query('exp_stage != "practice"')
N = len(dpx.worker_id.unique())
acc = 1 - dpx.query('rt!=-1').groupby(['worker_id', 'condition'
                                       ]).correct.mean()
acc_stats = acc.groupby('condition').agg(["mean", "std"])

rt = dpx.groupby(['worker_id', 'condition']).rt.mean()
rt_stats = rt.groupby('condition').agg(["mean", "std"])

# literature data
#replicate "The neural circuitry supporting goal maintenance during cognitive control: a comparison of expectancy AX-CPT and dot probe expectancy paradigms"
literature_data = {
    'acc': {
        'mean': {
            'AX': .98,
            'AY': .84,
            'BX': .92,
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 10 18:47:50 2017

@author: ian
"""
import pickle
from expanalysis.experiments.processing import calc_exp_DVs
import pandas as pd
from selfregulation.utils.utils import get_behav_data

# recompute the two-stage DVs 100 times on the same raw data and correlate
# the runs to measure variability in the DV calculation itself
df = get_behav_data(file = 'Individual_Measures/two_stage_decision.csv.gz')

workers = list(df.worker_id.unique())

df = df.query('worker_id in %s' % workers)

DV_tests = []
for repeats in range(100):
    print(repeats)
    DVs, valence, description = calc_exp_DVs(df)
    # tag each run's columns so the 100 runs can sit side by side
    DVs.columns = [i + '_run%s' % str(repeats) for i in DVs.columns]
    DV_tests.append(DVs)
DV_tests = pd.concat(DV_tests, axis=1)
DV_tests.to_pickle('two_stage_tests.pkl')

N = len(DV_tests.columns)
corr = DV_tests.corr()
DV_reliabilities = {}
for c in range(5):
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 23 14:56:35 2016

@author: ian
"""
from math import floor, ceil
import numpy
import pandas as pd
from selfregulation.utils.utils import get_behav_data



df = get_behav_data(dataset = 'Discovery_11-20-2016', file = 'Individual_Measures/stop_signal.csv.gz')

worker = 0
df = df.query('worker_id == "%s"' % df.worker_id.unique()[worker])

#remove practice
df = df.query('exp_stage not in ["practice","NoSS_practice"]').reset_index(drop = True)

dvs = {}
# Calculate SSRT for both conditions
for c in df.condition.unique():
    c_df = df[df.condition == c]
    
    #SSRT
    go_trials = c_df.query('SS_trial_type == "go"')
    stop_trials = c_df.query('SS_trial_type == "stop"')
    sorted_go = go_trials.query('rt != -1').rt.sort_values(ascending = True)
    prob_stop_failure = (1-stop_trials.stopped.mean())
from selfregulation.utils.utils import get_behav_data, get_info

import bct
import igraph
import numpy as np
from os.path import join, exists
from os import makedirs
import pandas as pd
import seaborn as sns

# generic variables
save_plots = False
plot_dir = join(get_info('base_directory'),'dimensional_structure','Plots')

# get dependent variables
graph_data = get_behav_data(file = 'taskdata_imputed.csv')  



def run_graph_analysis(adj_dict, save_plots=False):
    """
    Takes in a dictionary with two keys: "name" and "adj", specifying
    an adjacency matrix (as a dataframe) and its corresponding name
    """
    def plot_name(name):
        return join(plot_dir,adj_name,name)
        
    adj_name = adj_dict['name']
    adj = adj_dict['adj']
    # if saving plots, make sure directory exists
    if save_plots: 
# --- Example #18 (extraction artifact: pastebin example separator, score "0") ---
from expanalysis.experiments.psychological_models import MoE_Model
import numpy as np
import pandas as pd
from selfregulation.utils.utils import get_behav_data

data = get_behav_data(file='Individual_Measures/hierarchical_rule.csv.gz')
workers = data.worker_id.unique()
data = data.query("worker_id == '%s'" % workers[0])

from scipy.optimize import minimize


def eval_MoE(fit_args, passed_args):
    args = {
        'kappa': fit_args[0],
        'zeta': fit_args[1],
        'xi': fit_args[2],
        'beta2': fit_args[3],
        'beta3': fit_args[4],
        'alphaC': fit_args[5],
        'alphaO': fit_args[6],
        'alphaS': fit_args[7],
        'beta_hierarchy': fit_args[8],
        'data': data,
    }

    MoE_model = MoE_Model(**args)
    likelihoods = []
    for i, trial in data.iterrows():
        if trial.key_press != -1:
            action_probs = MoE_model.get_action_probs(trial)
# --- Example #19 (extraction artifact: pastebin example separator, score "0") ---
                       for _ in range(n - len(subset))]
        subset = list(subset) + new_indices
        subset = pd.unique(subset)
    return list(subset)


def impute(data, method):
    """Impute missing values in `data` with a fancyimpute-style imputer.

    Columns are scaled to unit variance before imputation and rescaled
    afterwards, so the imputer sees comparable column magnitudes.

    Args:
        data: DataFrame possibly containing NaNs
        method: imputer class exposing fit_transform (e.g. fancyimpute.KNN)

    Returns:
        DataFrame with the same index/columns and missing values filled in.
    """
    sigma = data.std()
    # to_numpy() replaces DataFrame.as_matrix(), removed in pandas 1.0
    matrix = (data / sigma).to_numpy()
    complete_matrix = method().fit_transform(matrix) * sigma.tolist()
    return pd.DataFrame(complete_matrix,
                        index=data.index,
                        columns=data.columns)


DV_df = get_behav_data('Discovery_9-26-16', use_EZ=True)
sigma = DV_df.std()

base_matrix = (DV_df / sigma).as_matrix()

# test different imputation methods
methods = [fancyimpute.SoftImpute, fancyimpute.IterativeSVD, fancyimpute.KNN]
correlations = {}
percent_off = {}
for method in methods:
    print('using %s' % method)
    correlations[method] = []
    percent_off[method] = []
    for simulation in range(20):
        indices = get_rand_index(base_matrix, 1000)
        originals = [base_matrix[i] for i in indices]
# --- Example #20 (extraction artifact: pastebin example separator, score "0") ---
from selfregulation.utils.utils import get_behav_data, get_info

import bct
import igraph
import numpy as np
from os.path import join, exists
from os import makedirs
import pandas as pd
import seaborn as sns

# generic variables
save_plots = False
plot_dir = join(get_info('base_directory'), 'dimensional_structure', 'Plots')

# get dependent variables
graph_data = get_behav_data(file='taskdata_imputed.csv')


def run_graph_analysis(adj_dict, save_plots=False):
    """
    Takes in a dictionary with two keys: "name" and "adj", specifying
    an adjacency matrix (as a dataframe) and its corresponding name
    """
    def plot_name(name):
        return join(plot_dir, adj_name, name)

    adj_name = adj_dict['name']
    adj = adj_dict['adj']
    # if saving plots, make sure directory exists
    if save_plots:
        makedirs(join(plot_dir, adj_name), exist_ok=True)
    subset = [(randint(0,M.shape[0]-1), randint(0,M.shape[1]-1)) for _ in range(n)]
    subset = pd.unique(subset)
    while len(subset) < n:
        new_indices = [(randint(0,M.shape[0]-1), randint(0,M.shape[1]-1)) for _ in range(n-len(subset))]
        subset = list(subset) + new_indices
        subset = pd.unique(subset)
    return list(subset)

def impute(data, method):
    """Scale columns to unit variance, impute NaNs with `method`, rescale.

    Args:
        data: DataFrame possibly containing NaNs
        method: imputer class exposing fit_transform (e.g. fancyimpute.KNN)

    Returns:
        DataFrame with the same index/columns and missing values filled in.
    """
    sigma = data.std()
    # to_numpy() replaces DataFrame.as_matrix(), removed in pandas 1.0
    matrix = (data/sigma).to_numpy()
    complete_matrix = method().fit_transform(matrix)*sigma.tolist()
    return pd.DataFrame(complete_matrix, index = data.index, columns = data.columns)
    
   
DV_df = get_behav_data('Discovery_9-26-16', use_EZ = True)
sigma = DV_df.std()

base_matrix = (DV_df/sigma).as_matrix()

# test different imputation methods
methods = [fancyimpute.SoftImpute, fancyimpute.IterativeSVD, fancyimpute.KNN] 
correlations = {}
percent_off = {}
for method in methods:
    print('using %s' % method)
    correlations[method] = []
    percent_off[method] = []
    for simulation in range(20):
        indices = get_rand_index(base_matrix,1000)
        originals = [base_matrix[i] for i in indices]
import os
import numpy as np
from selfregulation.utils.utils import get_behav_data
import seaborn as sns

# Make Plot Directory if it doesn't exist
if not os.path.exists('Plots'):
    os.mkdir('Plots')

# get DV df
DV_df = get_behav_data()
# task names are the prefix before the first '.' in each DV column
tasks = np.unique(DV_df.columns.map(lambda x: x.split('.')[0]))

# save one pairwise-regression plot per task under Plots/
for task in tasks:
    subset = DV_df.filter(regex='^%s' % task)
    # drop subjects with no data for this task, then remaining NaN columns
    subset = subset.dropna(how='all').dropna(axis=1)
    sns.set(font_scale=1.5)
    p = sns.pairplot(subset, kind='reg', size=5, diag_kws={'bins': 50})
    p.savefig('Plots/%s_pair_plot.pdf' % task, dpi=300)
# parse args
run_factors = not args.skip_factors
run_raw = not args.run_raw
classifier = args.classifier
raw_classifier = args.raw_classifier
shuffle_reps = args.shuffle_repeats
EFA_rotation = args.EFA_rotation

results_dir = path.join(get_info('results_directory'), 'ideology_prediction')
makedirs(results_dir, exist_ok=True)

# load data
dataset = get_recent_dataset()
results = load_results(dataset)
ideo_data = get_behav_data(dataset, file='ideology.csv')

# get demographics 
ideo_demographics = get_behav_data(dataset, file='ideology_demographics.csv')
# fill in ideo demographics from demographics if needed
demographics = get_demographics()
# fill gender
missing_gender = ideo_demographics[ideo_demographics['Gender'].isnull()].index
ideo_demographics.loc[missing_gender, 'Gender'] = demographics.loc[missing_gender, 'Sex']
# Age can be off by a year potentially by the time the ideological data was collected
missing_age = ideo_demographics[ideo_demographics['Age'].isnull()].index
ideo_demographics.loc[missing_age, 'Age'] = demographics.loc[missing_age, 'Age']

# reduce dataset to where we have full demographics
ideo_demographics = ideo_demographics[ideo_demographics.isnull().sum(1)==0]
ideo_data = ideo_data.loc[ideo_demographics.index]
    for i,r in data.iterrows():
        itemoptions=eval(r.options)
        item_ids.append('_'.join(itemoptions[0]['id'].split('_')[:-1]))
    data['item_id']=item_ids
    return data

def save_data(data,survey_metadata,
              outdir=os.path.join(outdir,'surveydata')):

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    for k in survey_metadata.keys():
        matchdata=data.query("survey=='%s'"%k)
        unique_items=list(matchdata.item_id.unique())
        surveydata=pandas.DataFrame({'worker':list(matchdata.worker.unique())})
        for i in unique_items:
            matchitem=matchdata.query('item_id=="%s"'%i)
            matchitem=pandas.DataFrame({'worker':matchitem.worker,i:matchitem.coded_response})
            surveydata=surveydata.merge(matchitem,on='worker')
        surveydata.to_csv(os.path.join(outdir,'%s.tsv'%k),sep='\t',index=False)
    return outdir

if __name__=='__main__':
    data=get_behav_data(file='items.csv.gz')
    survey_items=get_survey_items(data)
    survey_metadata,metadatdir=save_metadata(survey_items)
    #data=add_survey_item_labels(data)
    #datadir=save_data(data,survey_metadata)
    pickle.dump(survey_metadata,open(os.path.join(outdir,'survey_metadata.pkl'),'wb'))
import math
import matplotlib.pyplot as plt
import numpy as np
from os import path
import pandas as pd
import seaborn as sns
from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names
from selfregulation.utils.utils import filter_behav_data, get_behav_data, get_demographics, get_info

# correlation of ravens and literature
# replication of "Intelligence and socioeconomic success: A meta-analytic
# review of longitudinal research"

base_dir = get_info('base_directory')
ext= 'png'
data = get_behav_data()     
demographics = get_demographics()                    
data = data.loc[demographics.index]     
# get dataframe of intelligence measure (raven's progressive matrices) and demographics)                              
df = pd.concat([data.filter(regex='raven'), demographics], axis=1)

# get raven's reliability
reliability = get_behav_data(dataset='Retest_02-03-2018', file='bootstrap_merged.csv.gz')
raven_reliability = reliability.groupby('dv').icc.mean().filter(regex='raven')[0]
# demographic reliabilities 
demo_reliabilities = [1.0]*demographics.shape[1]

# correlations
correlations = df.corr().filter(regex='raven').sort_values(by='ravens.score').iloc[:-1]
correlations.insert(0, 'target_reliability', demo_reliabilities)
adjusted = correlations['ravens.score']/(raven_reliability*correlations['target_reliability'])**.5
    :return dv: dictionary of dependent variables
    :return description: descriptor of DVs
    """
    df = df[~ pandas.isnull(df['taste_diff'])].reset_index(drop = True)
    df = df.query('mouse_click != "-1"')
    rs = smf.ols(formula = 'coded_response ~ health_diff + taste_diff', data = df).fit()
    dvs['health_sensitivity'] = {'value':  rs.params['health_diff'], 'valence': 'Pos'} 
    dvs['taste_sensitivity'] = {'value':  rs.params['taste_diff'], 'valence': 'Neg'} 
    description = """
        Both taste and health sensitivity are calculated based on the decision phase.
        On each trial the participant indicates whether they would prefer a food option
        over a reference food. Their choice is regressed on the subjective health and
        taste difference between that option and the reference item. Positive values
        indicate that the option's higher health/taste relates to choosing the option
        more often
    """
    return dvs,description
    
# get data
df = get_behav_data(dataset = 'Discovery_11-20-2016', file = 'Individual_Measures/dietary_decision.csv.gz')
demo = get_behav_data(dataset = 'Discovery_11-20-2016', file = 'demographic_targets.csv')

# calc DVs
DVs, description = calc_dietary_decision_DV(df)
for key,val in DVs.items():
    for subj_key in val.keys():
        val[subj_key]=val[subj_key]['value']
DVs = pandas.DataFrame.from_dict(DVs).T
        
# do it the simpler way     
DV, valence, description = calc_exp_DVs(df)
    return (time.hour-5)%24 # convert from GMT to CST

def plot_time_effects(measure_DVs, melted_DVs, title=None):
    """Plot time-of-day effects for a measure's DVs.

    Left panel: lowess regression of each DV against completion hour.
    Right panel: boxplot of DV values grouped by the split_time column.

    Args:
        measure_DVs: DataFrame whose last two columns are the time columns
            ('hour' plus a split column); all other columns are DVs
        melted_DVs: long-format DV frame with 'split_time', 'variable' and
            'value' columns
        title: optional figure title
    """
    f, (ax1,ax2) = plt.subplots(1, 2, figsize=(16,8))
    for name in measure_DVs.columns[:-2]:
        sns.regplot('hour', name, data=measure_DVs, lowess=True, label=name,
                    ax=ax1, scatter_kws={'s': 100, 'alpha': .4})
    ax1.legend()
    # BUG FIX: was data=melted, a name not defined in this function; the
    # long-format frame is the melted_DVs parameter
    sns.boxplot('split_time', 'value', hue='variable', data=melted_DVs, ax=ax2)
    if title:
        plt.suptitle(title, fontsize=18)
    plt.show()
    
verbose=True
# load data    
behav_data = get_behav_data(file='meaningful_variables_imputed.csv')
measures = np.unique([i.split('.')[0] for i in behav_data.columns])
time_effects = {}

for measure_name in measures[0:10]:
    measure = get_behav_data(file='Individual_Measures/%s.csv.gz' % measure_name)
    measure_DVs = behav_data.filter(regex=measure_name)
    measure_DVs.columns = [i.split('.')[1] for i in measure_DVs.columns]
    # scale
    measure_DVs = pd.DataFrame(scale(measure_DVs), index=measure_DVs.index, columns=measure_DVs.columns)
    
    finishtimes = measure.groupby('worker_id').finishtime.apply(lambda x: np.unique(x)[0])
    daytime = finishtimes.apply(convert_to_time)
    daytime.name='hour'
    measure_DVs = pd.concat([measure_DVs, daytime], axis=1)
    # add on time split in half and melt
# --- Example #28 (extraction artifact: pastebin example separator, score "0") ---
 def __init__(self, 
              datafile=None, 
              loading_thresh=None,
              dist_metric=distcorr,
              boot_iter=1000,
              name='',
              filter_regex='.',
              ID=None,
              results_dir=None,
              residualize_vars=['Age', 'Sex'],
              saved_obj_file=None
              ):
     """
     Args:
         datafile: name of a directory in "Data"
         loading_thresh: threshold to use for factor analytic result
         dist_metric: distance metric for hierarchical clustering that is 
         passed to pdist
         name: string to append to ID, default to empty string
         filter_regex: regex string passed to data.filter
         ID: specify if a specific ID is desired
         results_dir: where to save results
         residualize_vars: demographic variables passed to the demographic
         analysis (NOTE(review): mutable default list is shared across
         instances - confirm no caller mutates it)
         saved_obj_file: previously saved object to initialize from; when
         given, datafile may be omitted
     """
     # need either fresh data or a saved object to start from
     assert datafile is not None or saved_obj_file is not None
     # initialize with the saved object if available
     if saved_obj_file:
         self._load_init(saved_obj_file)
     else:
         # set vars
         self.dataset = datafile
         # NOTE(review): this ignores the loading_thresh argument and always
         # stores None - looks unintentional; confirm before relying on it
         self.loading_thresh = None
         self.dist_metric = dist_metric
         self.boot_iter = boot_iter
         self.residualize_vars = residualize_vars
         # unique ID: name plus either a random 16-bit tag or the given ID
         if ID is None:
             self.ID =  '%s_%s' % (name, str(random.getrandbits(16)))
         else:
             self.ID = '%s_%s' % (name, str(ID))
         # set up output files
         self.results_dir = results_dir
         # load data (imputed for EFA, non-imputed kept alongside)
         self.data = get_behav_data(dataset=datafile, 
                                   file='meaningful_variables_imputed.csv',
                                   filter_regex=filter_regex,
                                   verbose=True)
         self.data_no_impute = get_behav_data(dataset=datafile,
                                              file='meaningful_variables_clean.csv',
                                              filter_regex=filter_regex,
                                              verbose=True)
         self.demographics = get_demographics()
         
     
     # initialize analysis classes
     self.DA = Demographic_Analysis(self.demographics, 
                                    residualize_vars=self.residualize_vars,
                                    boot_iter=self.boot_iter)
     self.EFA = EFA_Analysis(self.data, 
                             self.data_no_impute, 
                             boot_iter=self.boot_iter)
     self.HCA = HCA_Analysis(dist_metric=self.dist_metric)
     
     # load the results from the saved object
     if saved_obj_file:
         self._load_results(saved_obj_file)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 16 20:13:32 2017

@author: ian
"""
import glob
import pandas as pd
from selfregulation.utils.utils import get_behav_data

# compare the main MTurk sample with failed-QC subjects and the fMRI
# sample on demographic/health measures
df = get_behav_data(file='demographic_health.csv', full_dataset=True)

failed_dataset = sorted(glob.glob('../Data/Failed*'))[0]
failed_subjects = get_behav_data(dataset=failed_dataset,
                                 file='demographic_health.csv')

fmri_dataset = sorted(glob.glob('../Data/Fmri*'))[0]
fmri_subjects = get_behav_data(dataset=fmri_dataset,
                               file='demographic_health.csv')
# prefix fMRI subject IDs so they cannot collide with MTurk worker IDs
fmri_subjects.index = ['fmri_' + i for i in fmri_subjects.index]

all_subjects = pd.concat([df, failed_subjects, fmri_subjects])

# total of 662 workers in mturk sample
worker_counts = pd.read_json('../Data/Local/worker_counts.json', typ='series')
total_workers = len(worker_counts) + len(fmri_subjects)

# separate into groups: subject counts per demographic cell
groups = all_subjects.groupby(['HispanicLatino', 'Sex', 'Race']).Age.count()
from selfregulation.utils.utils import get_info,get_behav_data
basedir=get_info('base_directory')
#dataset=get_info('dataset')
if usefull:
    print('using full dataset')
    derived_dir=os.path.join(basedir,'Data/Derived_Data/%s'%dataset.replace('Discovery','Combined').replace('Validation','Combined'))
else:
    print('using dataset:',dataset)
    derived_dir=os.path.join(basedir,'Data/Derived_Data/%s'%dataset)
datadir=os.path.join(basedir,'data/%s'%dataset)

if not os.path.exists(derived_dir):
    os.makedirs(derived_dir)
print('saving to',derived_dir)

data=get_behav_data(file='subject_x_items.csv',full_dataset=usefull)

maxnans=5

fixdata=data.copy()
dropped={}
fixed={}
for c in data.columns:

    f,dropflag=cleanup_item_dist(c,fixdata,verbose=False,minresp=min_freq)
    fixdata[c]=f
    u,h=get_respdist(f)
    if numpy.sum(numpy.isnan(data[c]),0)>maxnans:
        print('dropping %s due to too many NaNs'%c)
        dropflag=True
    if dropflag:
Пример #31
0
from expanalysis.experiments.psychological_models import fRL_Model
import numpy as np
import pandas as pd
from selfregulation.utils.utils import get_behav_data

# trial-level shift-task data for all workers
data = get_behav_data(file='Individual_Measures/shift_task.csv.gz')
workers = data.worker_id.unique()

# test divergence between hierarchical and flat experts after training
# fit the feature-RL model per worker; only the first five workers are
# used because each optimization is slow
models = []
for worker in workers[0:5]:
    df = data.query("worker_id == '%s'" % worker)
    model = fRL_Model(df, decay_weights=True, verbose=True)
    model.optimize()
    models.append(model)


Пример #32
0
import matplotlib.pyplot as plt
from os import path
import pandas as pd
import seaborn as sns
from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names
from selfregulation.utils.utils import get_behav_data, get_demographics, get_info

base_dir = get_info('base_directory')
ext = 'png'
data = get_behav_data()

# *************************************************************************
# Successful replications
# ************************************************************************
# two_stage
two_stage_df = get_behav_data(
    file='Individual_Measures/two_stage_decision.csv.gz')
# subset to subjects who passed quality control (their two_stage DVs are
# non-null in the summary dataframe)
successful_two_stage = data.filter(regex='two_stage').dropna(how='any').index
two_stage_df = two_stage_df.query('worker_id in %s' %
                                  list(successful_two_stage))
# drop missed first-stage responses and trials without a valid last feedback
two_stage_df = two_stage_df.query('rt_first != -1 and feedback_last in [0,1]')
colors = sns.hls_palette(2)
# stay probability = 1 - mean(switch), per worker x transition x feedback cell
plot_df = (1 - two_stage_df.groupby([
    'worker_id', 'stage_transition_last', 'feedback_last'
]).switch.mean()).reset_index()
plot_df.feedback_last = plot_df.feedback_last.replace({
    0: 'Unrewarded',
    1: 'Rewarded'
})
plot_df.stage_transition_last = \
import pandas as pd
import statsmodels.formula.api as smf
from selfregulation.utils.utils import get_behav_data

# Simon task: per-worker congruency effects estimated two ways — a
# sum-coded regression on RT and simple condition contrasts.
data = get_behav_data()
# subjects whose simon summary variables are missing failed QC for this task
problem_subj = data.filter(regex='simon').isnull().iloc[:, 0]
df = get_behav_data(file='Individual_Measures/simon.csv.gz')
df = df.query('exp_stage == "test" and rt==rt')  # rt==rt drops NaN RTs

params = {}
for worker in df.worker_id.unique():
    if problem_subj.loc[worker] == True:
        continue
    subset = df.query('worker_id=="%s"' % worker)
    # accuracy contrast uses all test trials (correct and incorrect)
    acc_contrast = subset.groupby('condition').correct.mean()
    # BUG FIX: this was computed from rt_contrast, which is not defined
    # until later in the loop body (NameError on the first worker); the
    # accuracy difference must come from acc_contrast
    acc_diff = acc_contrast['incongruent'] - acc_contrast['congruent']
    # RT-based measures use correct trials only
    subset = subset.query('correct == True')
    rs = smf.ols(formula='rt ~ C(condition, Sum)', data=subset).fit()
    params[worker] = rs.params.tolist()
    # sum coding estimates half the condition gap with flipped sign;
    # rescale so the model coefficient matches the raw contrast direction
    params[worker][1] *= -2
    rt_contrast = subset.groupby('condition').rt.median()
    diff = rt_contrast['incongruent'] - rt_contrast['congruent']
    params[worker].append(diff)
    params[worker].append(acc_diff)


DVs = pd.DataFrame(params, index=['Intercept', 'model_diff', 'diff', 'acc_diff']).T


@author: ian
"""

from os import path
from dimensional_structure.utils import transfer_scores
from selfregulation.utils.utils import get_behav_data, get_info
from selfregulation.utils.result_utils import load_results

# get contextualizing results
results_dataset = 'Complete_03-29-2018'
results = load_results(datafile=results_dataset)

# get fmri data
fmri_dataset = 'Fmri_Followup_10-22-2018'
data = get_behav_data(dataset=fmri_dataset)
# remove data where participants are missing more than 20% of the tasks
tasks = data.copy()
tasks.columns = [i.split('.')[0] for i in tasks.columns]
successful_subjects = (tasks.isnull().reset_index().melt(id_vars=['index']) \
                       .groupby(['index', 'variable']).mean() \
                       .groupby('index').sum()<12)
successful_subjects = successful_subjects[successful_subjects['value']]
data = data.loc[successful_subjects.index]

task_scores = transfer_scores(data, results['task'])
survey_scores = transfer_scores(data, results['survey'])

# save the scores
basename = 'factorscores_results-%s.csv' % results_dataset
task_scores.to_csv(
Пример #35
0
    }
    dvs['taste_sensitivity'] = {
        'value': rs.params['taste_diff'],
        'valence': 'Neg'
    }
    description = """
        Both taste and health sensitivity are calculated based on the decision phase.
        On each trial the participant indicates whether they would prefer a food option
        over a reference food. Their choice is regressed on the subjective health and
        taste difference between that option and the reference item. Positive values
        indicate that the option's higher health/taste relates to choosing the option
        more often
    """
    return dvs, description


# get data
df = get_behav_data(dataset='Discovery_11-20-2016',
                    file='Individual_Measures/dietary_decision.csv.gz')
demo = get_behav_data(dataset='Discovery_11-20-2016',
                      file='demographic_targets.csv')

# calc DVs
DVs, description = calc_dietary_decision_DV(df)
# flatten the nested per-subject {'value': ..., ...} dicts so each cell
# holds the scalar DV value
for key, val in DVs.items():
    for subj_key in val.keys():
        val[subj_key] = val[subj_key]['value']
DVs = pandas.DataFrame.from_dict(DVs).T

# do it the simpler way: the generic experiment-DV pipeline
DV, valence, description = calc_exp_DVs(df)
def calc_EFA_retest_held_out(results, rotate='oblimin', verbose=True):
    """Assess EFA factor-score test-retest reliability on held-out subjects.

    Refits the EFA using only subjects WITHOUT retest data, then projects
    the retest subjects' independently cleaned T1 and T2 data onto those
    factor weights and compares the resulting scores.

    Args:
        results: results object exposing .ID, .data, .dataset and .EFA
        rotate: factor rotation passed to get_scores/psychFA
        verbose: accepted but never read  # NOTE(review): unused parameter

    Returns:
        (combined, cross_diag, ICCs, (fa, output)) where combined holds
        T1 and T2 factor scores side by side, cross_diag the per-factor
        T1-T2 correlations, ICCs the per-factor ICC values, and
        (fa, output) the psychFA fit on the held-out subjects.
    """
    # NOTE(review): 'name' is immediately shadowed by the loop variables
    # below and this title-cased ID prefix is never used
    name = results.ID.split('_')[0].title()
    orig_data = results.data
    # variables that were log-transformed (positive skew) or reflected then
    # log-transformed (negative skew) during the original cleaning
    positive_skewed = [
        i.replace('.logTr', '') for i in orig_data.columns if ".logTr" in i
    ]
    negative_skewed = [
        i.replace('.ReflogTr', '') for i in orig_data.columns
        if ".ReflogTr" in i
    ]
    # raw (untransformed) names of every DV in the original analysis
    DVs = [
        i.replace('.logTr', '').replace('.ReflogTr', '')
        for i in orig_data.columns
    ]
    orig_scores = results.EFA.get_scores(rotate=rotate)

    # load and clean retest data exactly like original data
    data_raw = get_behav_data(dataset=results.dataset,
                              file='meaningful_variables.csv')
    retest_data_raw = get_behav_data(dataset=results.dataset.replace(
        'Complete', 'Retest'),
                                     file='meaningful_variables.csv')
    # subjects present at both time points
    shared_ids = set(retest_data_raw.index) & set(data_raw.index)
    data_raw = data_raw.loc[shared_ids, :]
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    raw_data = {'T1': data_raw, 'T2': retest_data_raw}
    imputed_data = {}
    # run the same skew-transform / outlier-removal / imputation / scaling
    # pipeline on each time point independently
    for name, data in raw_data.items():
        tmp_data = data.loc[:, DVs]
        tmp_data = transform_remove_skew(tmp_data,
                                         positive_skewed=positive_skewed,
                                         negative_skewed=negative_skewed)
        tmp_data = remove_outliers(tmp_data)
        tmp_data_imputed, error = missForest(tmp_data)
        scaled_tmp_data = scale(tmp_data_imputed)
        imputed_data[name] = scaled_tmp_data

    # get subjects not in the retest set
    ind_data = orig_data.loc[set(orig_data.index) - shared_ids]
    fa, output = psychFA(ind_data,
                         results.EFA.results['num_factors'],
                         method='ml',
                         rotate=rotate)
    weights = get_attr(fa, 'weights')
    scores = {}
    for name, data in imputed_data.items():
        suffix = ''
        if name == 'T2': suffix = 'T2'
        # NOTE(review): shared_ids is a set; this relies on its iteration
        # order matching the row order produced by .loc above — confirm
        tmp_scores = pd.DataFrame(
            data.dot(weights),
            index=shared_ids,
            columns=[i + ' ' + suffix for i in orig_scores.columns])
        scores[name] = tmp_scores
    combined = pd.concat([scores['T1'], scores['T2']], axis=1)
    # T1-T2 correlation of matching factors (offset by the factor count
    # because T1 columns come first in 'combined')
    cross_diag = [
        combined.corr().iloc[i, i + len(orig_scores.columns)]
        for i in range(len(orig_scores.columns))
    ]
    # get ICCs
    ICCs = []
    for col in scores['T1'].columns:
        # T1 name ends in a space, so this regex also matches the T2 column
        tmp = combined.filter(regex=col)
        out = psych.ICC(tmp)
        ICCs.append(list(out[0][1])[-1])
    return combined, cross_diag, ICCs, (fa, output)
Пример #37
0
import numpy as np
from selfregulation.utils.utils import get_behav_data, get_recent_dataset

dataset = get_recent_dataset()
items = get_behav_data(dataset=dataset, file='items.csv.gz')
subject_items = get_behav_data(dataset=dataset, file='subject_x_items.csv')

# get fmri items alone
# filter for fmri items
grit_items = [
    'New ideas and projects sometimes distract me from previous ones.',
    'Setbacks don\'t discourage me.',
    'I have been obsessed with a certain idea or project for a short time but later lost interest.',
    'I am a hard worker.',
    'I often set a goal but later choose to pursue a different one.',
    'I have difficulty maintaining my focus on projects that take more than a few months to complete.',
    'I finish whatever I begin.', 'I am diligent.'
]

brief_items = [
    'I am good at resisting temptation.',
    'I have a hard time breaking bad habits.', 'I am lazy.',
    'I say inappropriate things.',
    'I do certain things that are bad for me, if they are fun.',
    'I refuse things that are bad for me.',
    'I wish I had more self-discipline.',
    'People would say that I have iron self-discipline.',
    'Pleasure and fun sometimes keep me from getting work done.',
    'I have trouble concentrating.',
    'I am able to work effectively toward long-term goals.',
    'Sometimes I can\'t stop myself from doing something, even if I know it is wrong.',
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from dimensional_structure.prediction_utils import run_prediction
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.utils import get_recent_dataset, get_behav_data

results = load_results(get_recent_dataset())
data = get_behav_data(file='variables_exhaustive.csv')

# get demographics
full_demog = results['task'].DA.get_scores()
full_demog.columns = ['full_' + c for c in full_demog.columns]
# externally-computed T1 demographic factor scores (local file)
demog = pd.read_csv('/home/ian/Downloads/demog_fa_scores_t1.csv', index_col=0)

# get predictors
# externally-computed EZ-DDM factor scores (local file)
ddm_factors = pd.read_csv('/home/ian/Downloads/ez_t1_fa_3_scores.csv', index_col=0)
ontology_factors = results['task'].EFA.get_scores()
ontology_ddm_factors = ontology_factors[['Speeded IP', 'Caution', 'Perc / Resp']]

#
# compare demographics
# element-wise difference, then the cross-correlation block between the
# external and the full-sample demographic scores
diff = pd.DataFrame(demog.values - full_demog.loc[demog.index].values,
                    index=demog.index, columns=demog.columns)
corr = demog.join(full_demog).corr().iloc[:len(demog.columns), 
                                         len(demog.columns):]

# EZ vars 
EZ_vars = data.filter(regex='EZ_(non_decision|drift|thresh)$')
hddm_vars = data.filter(regex='hddm_(non_decision|drift|thresh)$')
                varNums=m['dataElements'][e]['subscaleVarNums']
            else:
                varNums=''
            if k=='eating_survey':
                varNums='custom(%s)'%varNums
            measure_level_data.append([vname,shortname,
                                    m['title'],m['dataElements'][e]['title'],
                                    m['measureType'],varNums])


measure_level_df=pandas.DataFrame(measure_level_data,
    columns=['ExpFactoryName','ShortName','MeasureName','VariableName',
                'MeasureType','SubscaleVarNums'])
# doublecheck that all meaningful variables are here
dataset=get_info('dataset')
behavdata=get_behav_data(dataset)
measurevars=measure_level_df.ExpFactoryName.tolist()
for v in behavdata.columns:
    assert v in measurevars
measure_level_df.to_csv('meaningful_variables_metadata.csv',index=False)

#save item level data
item_level_df=pandas.DataFrame(item_level_data,
    columns=['ExpFactoryName','MeasureName','QuestionNumber',
    'QuestionText','Scoring','ResponseOptions'])
item_level_df.to_csv('item_level_metadata.csv',index=False)

outcome_df=pandas.DataFrame(outcome_data,
    columns=['ExpFactoryName','MeasureName',
    'QuestionText','Scoring','ResponseOptions'])
outcome_df.to_csv('outcome_metadata.csv',index=False)
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.text(5, 400, 'Completion Rate: %s' % completion_rate, size=20)
    ax.text(5, 350, 'Passed QC: %s' % analyzed_rate, size=20)
    plt.xlabel('Number of Tasks Completed', fontsize=20)
plt.savefig(save_dir, dpi=300, bbox_inches='tight')
"""

# ****************************************************************************
# plot psychometric reliability
# ****************************************************************************

sns.set_context('poster')
dataset = get_recent_dataset()
# columns of the imputed dataset define which variables are "meaningful";
# strip the transform suffixes to match the retest data's dv names
meaningful_vars = get_behav_data(dataset=dataset, 
                                 file='meaningful_variables_imputed.csv').columns
meaningful_vars = [i.replace('.logTr','') for i in meaningful_vars]
meaningful_vars = [i.replace('.ReflogTr','') for i in meaningful_vars]
retest_data = get_retest_data(dataset=dataset.replace('Complete','Retest'))
# only select meaningful variables
retest_data = retest_data.query('dv in %s' % list(meaningful_vars))

# create reliability dataframe
# tag each variable as Survey/Task etc. for plotting and counts below
measure_cat = [get_var_category(v).title() for v in retest_data.index]
retest_data.loc[:,'Measure Category'] = measure_cat
Survey_N = np.sum(retest_data.loc[:, 'Measure Category']=='Survey')
Task_N = len(retest_data)-Survey_N

def plot_retest_data(retest_data, size=4.6, save_dir=None):
    colors = [sns.color_palette('Reds_d',3)[0], sns.color_palette('Blues_d',3)[0]]
    f = plt.figure(figsize=(size,size*.75))
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from dimensional_structure.prediction_utils import run_prediction
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.utils import get_recent_dataset, get_behav_data

results = load_results(get_recent_dataset())
data = get_behav_data(file='variables_exhaustive.csv')

# get demographics
full_demog = results['task'].DA.get_scores()
full_demog.columns = ['full_' + c for c in full_demog.columns]
# externally-computed T1 demographic factor scores (local file)
demog = pd.read_csv('/home/ian/Downloads/demog_fa_scores_t1.csv', index_col=0)

# get predictors
# externally-computed EZ-DDM factor scores (local file)
ddm_factors = pd.read_csv('/home/ian/Downloads/ez_t1_fa_3_scores.csv',
                          index_col=0)
ontology_factors = results['task'].EFA.get_scores()
ontology_ddm_factors = ontology_factors[[
    'Speeded IP', 'Caution', 'Perc / Resp'
]]

#
# compare demographics
# element-wise difference, then the cross-correlation block between the
# external and the full-sample demographic scores
diff = pd.DataFrame(demog.values - full_demog.loc[demog.index].values,
                    index=demog.index,
                    columns=demog.columns)
corr = demog.join(full_demog).corr().iloc[:len(demog.columns),
                                          len(demog.columns):]
@author: ian
"""
import numpy as np
from os import makedirs, path
import pandas as pd
import pickle
from sklearn.covariance import GraphLassoCV
from sklearn.preprocessing import scale

from dimensional_structure.graph_utils import Graph_Analysis
from selfregulation.utils.utils import get_behav_data, get_recent_dataset
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.r_to_py_utils import qgraph_cor
dataset = get_recent_dataset()
data = get_behav_data(dataset=dataset, file='meaningful_variables_imputed.csv')
all_results = load_results(dataset)


def get_EFA_HCA(results, EFA):
    """Pull hierarchical-clustering output from a results object.

    When EFA compares equal to False, return the clustering computed on
    the raw data; otherwise return the clustering computed on the
    oblimin-rotated EFA solution with the model's chosen factor number.
    """
    if EFA == False:
        return results.HCA.results['data']
    n_factors = results.EFA.results['num_factors']
    return results.HCA.results['EFA%s_oblimin' % n_factors]


EFA = True
survey_HCA = get_EFA_HCA(all_results['survey'], EFA)
survey_order = survey_HCA['reorder_vec']
task_HCA = get_EFA_HCA(all_results['task'], EFA)
from expanalysis.experiments.psychological_models import MoE_Model
import numpy as np
import pandas as pd
from selfregulation.utils.utils import get_behav_data

data = get_behav_data(file='Individual_Measures/hierarchical_rule.csv.gz')
workers = data.worker_id.unique()
data = data.query("worker_id == '%s'" % workers[0])

        

from scipy.optimize import minimize
def eval_MoE(fit_args, passed_args):
    args = {
        'kappa': fit_args[0],
        'zeta': fit_args[1],
        'xi': fit_args[2],
        'beta2' : fit_args[3],
        'beta3': fit_args[4],
        'alphaC': fit_args[5],
        'alphaO': fit_args[6],
        'alphaS': fit_args[7],
        'beta_hierarchy': fit_args[8],
        'data': data,
        }
    
    MoE_model = MoE_Model(**args)
    likelihoods = []
    for i, trial in data.iterrows():
        if trial.key_press != -1:
Пример #44
0
# -*- coding: utf-8 -*-

# NOTE(review): load_model and hddm are imported but not used in this
# visible portion of the script
from expanalysis.experiments.ddm_utils import get_HDDM_fun, load_model
from selfregulation.utils.utils import get_behav_data
import hddm
import numpy as np

# test HDDM calculation from processed task
task = 'stroop'
df = get_behav_data(file='Individual_Measures/%s.csv.gz' % task)
# subset to the first 15 workers to keep the sampling fast
df = df.query('worker_id in %s' % list(df.worker_id.unique()[0:15]))

# very short chain (20 samples, 10 burn-in) — smoke test only, not a
# converged fit
fun = get_HDDM_fun(task,
                   samples=20,
                   burn=10,
                   outfile='/home/ian/tmp/stroop',
                   parallel=True)
out = fun(df)

# sanity-check: DDM parameters should correlate with raw accuracy and RT
acc = df.groupby('worker_id').correct.mean()
rt = df.groupby('worker_id').rt.median()
for var in ['hddm_drift', 'hddm_thresh', 'hddm_non_decision']:
    # sorted(out.keys()) keeps worker order aligned with the groupby indices
    ddm_vars = [out[k][var]['value'] for k in sorted(out.keys())]
    print(var)
    print('Correlation with Acc: ', np.corrcoef(acc, ddm_vars)[0, 1])
    print('Correlation with RT: ', np.corrcoef(rt, ddm_vars)[0, 1])

samples = 20
burn = 10
thin = 1
response_col = 'correct'
 def __init__(self,
              datafile=None,
              loading_thresh=None,
              dist_metric=distcorr,
              boot_iter=1000,
              name='',
              filter_regex='.',
              ID=None,
              results_dir=None,
              residualize_vars=None,
              saved_obj_file=None
              ):
     """
     Set up data and analysis objects, either fresh or from a saved object.

     Args:
         datafile: name of a directory in "Data"
         loading_thresh: threshold to use for factor analytic result
         dist_metric: distance metric for hierarchical clustering that is 
         passed to pdist
         boot_iter: number of bootstrap iterations used by the analyses
         name: string to append to ID, default to empty string
         filter_regex: regex string passed to data.filter
         ID: specify if a specific ID is desired
         results_dir: where to save results
         residualize_vars: demographic variables to residualize out;
         defaults to ['Age', 'Sex']
         saved_obj_file: path of a previously saved object; restores the
         initialization state and, at the end, the saved results
     """
     # avoid the mutable-default-argument trap; None means "use defaults"
     if residualize_vars is None:
         residualize_vars = ['Age', 'Sex']
     assert datafile is not None or saved_obj_file is not None
     # initialize with the saved object if available
     if saved_obj_file:
         self._load_init(saved_obj_file)
     else:
         # set vars
         self.dataset = datafile
         # BUG FIX: was hard-coded to None, silently discarding the
         # caller-supplied loading_thresh
         self.loading_thresh = loading_thresh
         self.dist_metric = dist_metric
         self.boot_iter = boot_iter
         self.residualize_vars = residualize_vars
         if ID is None:
             # random 16-bit tag keeps separately-created objects distinct
             self.ID = '%s_%s' % (name, str(random.getrandbits(16)))
         else:
             self.ID = '%s_%s' % (name, str(ID))
         # set up output files
         self.results_dir = results_dir
         # load data
         self.data = get_behav_data(dataset=datafile,
                                    file='meaningful_variables_imputed.csv',
                                    filter_regex=filter_regex,
                                    verbose=True)
         self.data_no_impute = get_behav_data(dataset=datafile,
                                              file='meaningful_variables_clean.csv',
                                              filter_regex=filter_regex,
                                              verbose=True)
         self.demographics = get_demographics()

     # initialize analysis classes
     self.DA = Demographic_Analysis(self.demographics,
                                    residualize_vars=self.residualize_vars,
                                    boot_iter=self.boot_iter)
     self.EFA = EFA_Analysis(self.data,
                             self.data_no_impute,
                             boot_iter=self.boot_iter)
     self.HCA = HCA_Analysis(dist_metric=self.dist_metric)

     # load the results from the saved object
     if saved_obj_file:
         self._load_results(saved_obj_file)
import math
import matplotlib.pyplot as plt
import numpy as np
from os import path
import pandas as pd
import seaborn as sns
from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names
from selfregulation.utils.utils import filter_behav_data, get_behav_data, get_demographics, get_info

# correlation of ravens and literature
# replication of "Intelligence and socioeconomic success: A meta-analytic
# review of longitudinal research"

base_dir = get_info('base_directory')
ext = 'png'
data = get_behav_data()
demographics = get_demographics()
data = data.loc[demographics.index]
# get dataframe of intelligence measure (raven's progressive matrices) and demographics)
df = pd.concat([data.filter(regex='raven'), demographics], axis=1)

# get raven's reliability
reliability = get_behav_data(dataset='Retest_02-03-2018',
                             file='bootstrap_merged.csv.gz')
raven_reliability = reliability.groupby('dv').icc.mean().filter(
    regex='raven')[0]
# demographic reliabilities
demo_reliabilities = [1.0] * demographics.shape[1]

# correlations
correlations = df.corr().filter(regex='raven').sort_values(
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 10 18:47:50 2017

@author: ian
"""
import pickle
from expanalysis.experiments.processing import calc_exp_DVs
import pandas as pd
from selfregulation.utils.utils import get_behav_data

df = get_behav_data(file='Individual_Measures/two_stage_decision.csv.gz')

workers = list(df.worker_id.unique())

df = df.query('worker_id in %s' % workers)

# recompute the DVs 100 times on the same data to measure run-to-run
# variability of the (stochastic) DV fitting procedure
DV_tests = []
for repeats in range(100):
    print(repeats)
    DVs, valence, description = calc_exp_DVs(df)
    # tag columns with the run number so repeats can sit side by side
    DVs.columns = [i + '_run%s' % str(repeats) for i in DVs.columns]
    DV_tests.append(DVs)
DV_tests = pd.concat(DV_tests, axis=1)
DV_tests.to_pickle('two_stage_tests.pkl')

N = len(DV_tests.columns)
corr = DV_tests.corr()
DV_reliabilities = {}
for c in range(5):
# save gelman vals
pickle.dump(gelman_vals, open(path.join('hddm_output', 'gelman_vals.pkl'), 'wb'))



# *******************************************
# Overview of different ways we can assess HDDM convergence
# Useful functions:
    # PYMC: https://healthyalgorithms.com/2010/10/19/mcmc-in-python-how-to-stick-a-statistical-model-on-a-system-dynamics-model-in-pymc/
    # m.gen_stats: get stats matrix
    # m.print_stats: print stats matrix
    # m.dic_info: fit indices
# *******************************************
task = 'stroop'
N = 20 # number of workers
full_data = get_behav_data(file='Individual_Measures/%s.csv.gz' % task)

# let's only look at a few workers
data = full_data.query('worker_id in %s' % list(full_data.worker_id.unique()[0:N]))
outputs = run_model(task, data)
m=outputs[0]
# after the model is made it generates a .db file, a data file and a 'model' file
# the data file is a transformation of the data we put in - with different scales
# for rt, different columns, and some rows removed
data_in = pd.read_csv('%s_data.csv' % task)

# Example of extracting stats
# get per-subject stats
# NOTE(review): the 'a_subj' pattern selects the threshold parameter in
# HDDM naming ('v' is drift) — confirm which parameter was intended
stats= m.gen_stats().filter(regex='a_subj.', axis=0)
# the mc error reflects the variance around the estimate
stats['mc err']
Пример #49
0
    out = clf.coef_
    if len(out.shape) == 1:
        out = out.reshape(1, -1)
    out = pd.DataFrame(out, columns=scores.columns)
    out.index = data.columns
    return out


# do mapping
dataset = get_recent_dataset()
# load ontology
results = load_results(datafile=dataset)
task_loadings = results['task'].EFA.get_loading()
task_scores = results['task'].EFA.get_scores()
# load all DVs
all_DVs = get_behav_data(file='variables_exhaustive.csv', dataset=dataset)

contrast_loadings = {}
for contrast, relation in fmri_ontology_mapping.items():
    # if relation is in the direct mapping
    # a leading '-' on the relation name means "flip the loading's sign"
    if relation.lstrip('-') in task_loadings.index:
        task_loading = task_loadings.loc[relation.lstrip('-')]
        if relation[0] == '-':
            task_loading = task_loading * -1
    # otherwise, reconstruct!
    # regress the unmapped DV on the factor scores (after dropping
    # subjects with missing values) and use the coefficients as a loading
    # NOTE(review): unlike the direct branch, a leading '-' is not applied
    # to the reconstructed loading here — confirm this is intended
    else:
        unmapped_data = all_DVs.loc[:, relation.lstrip('-')]
        missing = unmapped_data[unmapped_data.isnull()].index
        task_loading = run_linear(pd.DataFrame(unmapped_data.drop(missing)),
                                  task_scores.drop(missing)).iloc[0, :]
    contrast_loadings[contrast] = task_loading
import os
import numpy as np
from selfregulation.utils.utils import get_behav_data
import seaborn as sns

# make the plot directory if it doesn't exist
if not os.path.exists('Plots'):
    os.mkdir('Plots')

# get DV df
DV_df = get_behav_data()
# task names are the column prefix before the first '.'
tasks = np.unique(DV_df.columns.map(lambda x: x.split('.')[0]))

# one regression pair-plot per task across that task's DVs
for task in tasks:
    subset = DV_df.filter(regex = '^%s' % task)
    subset = subset.dropna(how = 'all').dropna(axis = 1)
    sns.set(font_scale = 1.5)
    p = sns.pairplot(subset, kind = 'reg', size = 5, diag_kws = {'bins': 50})
    p.savefig('Plots/%s_pair_plot.pdf' % task, dpi = 300)

@author: ian
"""

from os import path
from dimensional_structure.utils import transfer_scores
from selfregulation.utils.utils import get_behav_data, get_info
from selfregulation.utils.result_utils import load_results

# get contextualizing results
results_dataset = 'Complete_03-29-2018'
results = load_results(datafile=results_dataset)

# get fmri data
fmri_dataset= 'Fmri_Followup_10-22-2018'
data = get_behav_data(dataset=fmri_dataset)
# remove data where participants are missing more than 20% of the tasks
# (the <12 threshold counts fully-missing tasks per subject)
tasks = data.copy()
tasks.columns = [i.split('.')[0] for i in tasks.columns]
successful_subjects = (tasks.isnull().reset_index().melt(id_vars=['index']) \
                       .groupby(['index', 'variable']).mean() \
                       .groupby('index').sum()<12)
successful_subjects = successful_subjects[successful_subjects['value']]
data = data.loc[successful_subjects.index]

# project the fMRI subjects onto the previously-derived factor solutions
task_scores = transfer_scores(data, results['task'])
survey_scores = transfer_scores(data, results['survey'])

# save the scores
basename = 'factorscores_results-%s.csv' % results_dataset
task_scores.to_csv(path.join(get_info('base_directory'),