def get_fmri_pay(data):
    """Calculate payment for fMRI follow-up participants.

    Base pay is $100 for anyone who completed the full battery (>= 63
    experiments); otherwise pay is prorated by docking $10 per hour of
    missed task time (estimated from a previous MTurk sample). Bonuses
    from ``get_bonuses`` are added on top.

    Args:
        data: post-processed results DataFrame. Must contain an
            'ontask_time' column (added by "calc_time_taken").

    Returns:
        DataFrame indexed by worker_id with columns 'base', 'bonuses'
        and 'total'.
    """
    assert 'ontask_time' in data.columns, \
        'Task time not found. Must run "calc_time_taken" first.'
    all_exps = data.experiment_exp_id.unique()
    exps_completed = data.groupby('worker_id').experiment_exp_id.unique()
    # experiments each worker still owes; the compensation "experiment"
    # is not a real task, so it never counts as missing
    exps_not_completed = exps_completed.map(lambda x: list(
        set(all_exps) - set(x) - set(['selection_optimization_compensation'])))
    completed = exps_completed[exps_completed.map(lambda x: len(x) >= 63)]
    not_completed = exps_not_completed[exps_not_completed.map(
        lambda x: len(x) > 0)]
    # calculate time taken
    # get time taken for each task from previous mturk sample (minutes)
    time_path = os.path.join(get_info('base_directory'), 'references',
                             'experiment_lengths.json')
    task_time = json.load(open(time_path))
    # hours of task time missed; unknown task lengths default to 3 minutes
    time_missed = exps_not_completed.map(lambda x: np.sum(
        [task_time[i] if task_time[i] is not None else 3 for i in x]) / 60)
    # calculate pay
    completed_pay = pd.Series(data=100, index=completed.index)
    prorate_pay = 100 - time_missed[not_completed.index] * 10
    # remove anyone who was double counted: a worker can reach the
    # 63-experiment threshold while still missing some experiments,
    # which would otherwise put them in both series
    prorate_pay = prorate_pay.drop(completed.index, errors='ignore')
    pay = pd.concat([completed_pay, prorate_pay
                     ]).map(lambda x: round(x, 1)).to_frame(name='base')
    pay['bonuses'] = get_bonuses(data, 15, 10)
    pay['total'] = pay.sum(axis=1)
    return pay
def convert_var_names(to_convert):
    '''Convert array of variable names or columns/index of a dataframe.

    Assumes that all values either come from short or long variable
    names. If a dataframe is passed, variable conversion is done in
    place (returns None); if a list/array is passed, a converted list
    is returned (or None if the first value matches neither lookup).
    '''
    assert isinstance(to_convert, (list, np.ndarray, pd.DataFrame)), \
        'Object to convert must be a list, numpy array or pandas DataFrame'
    reference_location = os.path.join(get_info('base_directory'),
                                      'references',
                                      'variable_name_lookup.csv')
    # pd.Series.from_csv was removed in pandas 1.0; read_csv with
    # header=None / index_col=0 reproduces its behavior
    var_lookup = pd.read_csv(reference_location, header=None,
                             index_col=0).iloc[:, 0]
    inverse_lookup = pd.Series(index=var_lookup.values,
                               data=var_lookup.index)

    def _translate(values):
        # Direction is chosen from the first value, mirroring the
        # original logic; unknown values pass through unchanged.
        if values[0] in var_lookup:
            return [var_lookup.loc[v] if v in var_lookup.index else v
                    for v in values]
        elif values[0] in inverse_lookup:
            return [inverse_lookup.loc[v] if v in inverse_lookup.index
                    else v for v in values]
        return None

    if isinstance(to_convert, pd.DataFrame):
        # convert columns if there are dependent variable names
        new_columns = _translate(to_convert.columns)
        if new_columns is not None:
            to_convert.columns = new_columns
        # convert index if there are dependent variable names
        new_index = _translate(to_convert.index)
        if new_index is not None:
            to_convert.index = new_index
    else:
        return _translate(to_convert)
def get_plot_dir(self):
    """Return (creating it if needed) the plot directory for this analysis.

    Uses ``self.results_dir`` when set; otherwise falls back to the
    configured 'results_directory'. The directory is
    ``<results_dir>/dimensional_structure/<dataset>/Plots/<ID>``.
    """
    base = self.results_dir if self.results_dir is not None \
        else get_info('results_directory')
    plot_dir = path.join(base, 'dimensional_structure', self.dataset,
                         'Plots', self.ID)
    makedirs(plot_dir, exist_ok=True)
    return plot_dir
def get_demographic_model_type(demographics, verbose=False):
    """Query the R helper ``get_vartypes`` for each demographic variable.

    Args:
        demographics: DataFrame of demographic variables (one per column).
        verbose: passed through to the R function.

    Returns:
        DataFrame whose first column is the variable name and whose
        second column is the model-type information returned by R.
    """
    base = get_info('base_directory')
    # source the R utilities so get_vartypes exists in the R session
    R.source(os.path.join(base, 'selfregulation', 'utils', 'utils.R'))
    get_vartypes = robjects.globalenv['get_vartypes']
    raw = get_vartypes(demographics, verbose)
    # R returns a flat vector; 'F' reshapes it column-major into (n, 2)
    model_types = pandas.DataFrame(
        numpy.reshape(numpy.matrix(raw), (-1, 2), 'F'))
    model_types.iloc[:, 0] = demographics.columns
    return model_types
def get_demographic_model_type(demographics, verbose=False):
    """Look up the statistical model type for each demographic variable
    via the R helper ``get_vartypes`` (defined in
    selfregulation/utils/utils.R).

    Args:
        demographics: DataFrame of demographic variables (one per column).
        verbose: passed through to the R function.

    Returns:
        DataFrame with variable names in column 0 and the R-provided
        model-type information in column 1.
    """
    base = get_info('base_directory')
    # source the R utilities so get_vartypes exists in the R session
    R.source(os.path.join(base, 'selfregulation', 'utils', 'utils.R'))
    get_vartypes = robjects.globalenv['get_vartypes']
    out = get_vartypes(demographics, verbose)
    # R returns a flat vector; 'F' reshapes it column-major into (n, 2)
    model_types = pandas.DataFrame(
        numpy.reshape(numpy.matrix(out), (-1, 2), 'F'))
    # first column holds the variable names
    model_types.iloc[:, 0] = demographics.columns
    return model_types
def load_results(datafile, name=None, results_dir=None):
    """Load saved Results objects for a dataset.

    Globs ``dimensional_structure/<datafile>/Output/*results.pkl`` under
    the results directory and builds a dict keyed by the prefix of each
    file name (the part before the first underscore, e.g. 'task').

    Args:
        datafile: dataset identifier used in the results path.
        name: optional substring filter on result file paths.
        results_dir: results root; defaults to the configured
            'results_directory'.

    Returns:
        dict mapping result name -> Results instance.
    """
    if results_dir is None:
        results_dir = get_info('results_directory')
    pattern = os.path.join(
        results_dir, 'dimensional_structure/%s/Output/*results.pkl'
        % (datafile))
    result_files = glob(pattern)
    if name is not None:
        result_files = [f for f in result_files if name in f]
    return {os.path.basename(f).split('_')[0]: Results(saved_obj_file=f)
            for f in result_files}
def convert_var_names(to_convert):
    '''Convert array of variable names or columns/index of a dataframe.

    Assumes that all values either come from short or long variable
    names. If a dataframe is passed, variable conversion is done in
    place (returns None); if a list/array is passed, a converted list
    is returned (or None if the first value matches neither lookup).
    '''
    assert isinstance(to_convert, (list, np.ndarray, pd.DataFrame)), \
        'Object to convert must be a list, numpy array or pandas DataFrame'
    reference_location = os.path.join(get_info('base_directory'),
                                      'references',
                                      'variable_name_lookup.csv')
    # pd.Series.from_csv was removed in pandas 1.0; read_csv with
    # header=None / index_col=0 reproduces its behavior
    var_lookup = pd.read_csv(reference_location, header=None,
                             index_col=0).iloc[:, 0]
    inverse_lookup = pd.Series(index=var_lookup.values,
                               data=var_lookup.index)

    def _translate(values):
        # Direction is chosen from the first value, mirroring the
        # original logic; unknown values pass through unchanged.
        if values[0] in var_lookup:
            return [var_lookup.loc[v] if v in var_lookup.index else v
                    for v in values]
        elif values[0] in inverse_lookup:
            return [inverse_lookup.loc[v] if v in inverse_lookup.index
                    else v for v in values]
        return None

    if isinstance(to_convert, pd.DataFrame):
        # convert columns if there are dependent variable names
        new_columns = _translate(to_convert.columns)
        if new_columns is not None:
            to_convert.columns = new_columns
        # convert index if there are dependent variable names
        new_index = _translate(to_convert.index)
        if new_index is not None:
            to_convert.index = new_index
    else:
        return _translate(to_convert)
def gen_reference_item_text(items_df):
    """Fill the 'Question' and 'Responses' columns of
    references/variable_name_lookup.csv from item-level survey data.

    Args:
        items_df: DataFrame with 'item_ID', 'item_text' and
            'response_text' columns.

    Side effect: rewrites the reference CSV in place.
    """
    base_directory = get_info('base_directory')
    reference_location = os.path.join(base_directory, 'references',
                                      'variable_name_lookup.csv')
    ref = pd.read_csv(reference_location)
    # item text: first (assumed unique) item_text per item_ID
    text_by_item = (items_df.groupby('item_ID').item_text.unique()
                    .apply(lambda x: x[0]).to_dict())
    # response text: all response options joined with ', '
    responses_by_item = (items_df.groupby('item_ID').response_text.unique()
                         .apply(lambda x: ', '.join(x)))
    questions = []
    responses = []
    for var in ref['Variable Name']:
        questions.append(text_by_item.get(var, np.nan))
        if var in responses_by_item.keys():
            # strip embedded newlines so the CSV stays one row per item
            responses.append(responses_by_item[var].replace('\n', ''))
        else:
            responses.append(np.nan)
    ref.loc[:, 'Question'] = questions
    ref.loc[:, 'Responses'] = responses
    ref.to_csv(reference_location, index=False)
def get_fmri_pay(data):
    """Calculate payment for fMRI follow-up participants.

    Base pay is $100 for anyone who completed the full battery (>= 63
    experiments); otherwise pay is prorated by docking $10 per hour of
    missed task time (estimated from a previous MTurk sample). Bonuses
    from ``get_bonuses`` are added on top.

    Args:
        data: post-processed results DataFrame. Must contain an
            'ontask_time' column (added by "calc_time_taken").

    Returns:
        DataFrame indexed by worker_id with columns 'base', 'bonuses'
        and 'total'.
    """
    assert 'ontask_time' in data.columns, \
        'Task time not found. Must run "calc_time_taken" first.'
    all_exps = data.experiment_exp_id.unique()
    exps_completed = data.groupby('worker_id').experiment_exp_id.unique()
    # experiments each worker still owes; the compensation "experiment"
    # is not a real task, so it never counts as missing
    exps_not_completed = exps_completed.map(lambda x: list(
        set(all_exps) - set(x) - set(['selection_optimization_compensation'])))
    completed = exps_completed[exps_completed.map(lambda x: len(x) >= 63)]
    not_completed = exps_not_completed[exps_not_completed.map(
        lambda x: len(x) > 0)]
    # calculate time taken
    # get time taken for each task from previous mturk sample (minutes)
    time_path = os.path.join(get_info('base_directory'), 'references',
                             'experiment_lengths.json')
    task_time = json.load(open(time_path))
    # hours of task time missed; unknown task lengths default to 3 minutes
    time_missed = exps_not_completed.map(lambda x: np.sum(
        [task_time[i] if task_time[i] is not None else 3 for i in x]) / 60)
    # calculate pay
    completed_pay = pd.Series(data=100, index=completed.index)
    prorate_pay = 100 - time_missed[not_completed.index] * 10
    # remove anyone who was double counted: a worker can reach the
    # 63-experiment threshold while still missing some experiments,
    # which would otherwise put them in both series
    prorate_pay = prorate_pay.drop(completed.index, errors='ignore')
    pay = pd.concat([completed_pay, prorate_pay
                     ]).map(lambda x: round(x, 1)).to_frame(name='base')
    pay['bonuses'] = get_bonuses(data, 15, 10)
    pay['total'] = pay.sum(axis=1)
    return pay
def gen_reference_item_text(items_df):
    """Fill the 'Question' and 'Responses' columns of
    references/variable_name_lookup.csv from item-level survey data.

    Args:
        items_df: DataFrame with 'item_ID', 'item_text' and
            'response_text' columns.

    Side effect: rewrites the reference CSV in place.
    """
    base_directory = get_info('base_directory')
    reference_location = os.path.join(base_directory, 'references',
                                      'variable_name_lookup.csv')
    ref = pd.read_csv(reference_location)
    # add item text: first (assumed unique) item_text per item_ID
    item_text_lookup = items_df.groupby('item_ID').item_text.unique().apply(
        lambda x: x[0]).to_dict()
    item_text = [
        item_text_lookup[i] if i in item_text_lookup.keys() else np.nan
        for i in ref['Variable Name']
    ]
    # add response text: join all response options, stripping newlines so
    # the CSV stays one row per item
    response_text_lookup = items_df.groupby(
        'item_ID').response_text.unique().apply(lambda x: ', '.join(x))
    response_text = [
        response_text_lookup[i].replace('\n', '')
        if i in response_text_lookup.keys() else np.nan
        for i in ref['Variable Name']
    ]
    ref.loc[:, 'Question'] = item_text
    ref.loc[:, 'Responses'] = response_text
    ref.to_csv(reference_location, index=False)
# Download setup for the Self Regulation retest battery: configures the
# Expfactory token, the local output directory, and the result filters
# before the download request is made.
from expanalysis.experiments.utils import remove_duplicates, result_filter
from expanalysis.results import get_filters, get_result_fields
from expanalysis.results import Result
from os import path, makedirs
import pickle
from selfregulation.utils.utils import get_info

# set token and data directory
token = get_info(
    'expfactory_token',
    infile=
    '/oak/stanford/groups/russpold/users/zenkavi/Self_Regulation_Ontology/Self_Regulation_Retest_Settings.txt'
)
data_dir = path.join(
    '/oak/stanford/groups/russpold/users/zenkavi/Self_Regulation_Ontology/Data/',
    'Retest_01-23-2018', 'Local')
if not path.exists(data_dir):
    makedirs(data_dir)

# Set up filters: drop verbose metadata columns from downloaded results
filters = get_filters()
drop_columns = ['battery_description', 'experiment_reference',
                'experiment_version', 'experiment_name',
                'experiment_cognitive_atlas_task']
for col in drop_columns:
    filters[col] = {'drop': True}

# Strip token from specified file
f = open(token)
access_token = f.read().strip()

# Set up variables for the download request
battery = 'Self Regulation Retest Battery'
url = 'http://www.expfactory.org/new_api/results/62/'
find_intersection, get_fully_connected_threshold, remove_island_variables from selfregulation.utils.graph_utils import Graph_Analysis, threshold, \ threshold_proportional_sign from selfregulation.utils.utils import get_behav_data, get_info import bct import igraph import numpy as np from os.path import join, exists from os import makedirs import pandas as pd import seaborn as sns # generic variables save_plots = False plot_dir = join(get_info('base_directory'), 'dimensional_structure', 'Plots') # get dependent variables graph_data = get_behav_data(file='taskdata_imputed.csv') def run_graph_analysis(adj_dict, save_plots=False): """ Takes in a dictionary with two keys: "name" and "adj", specifying an adjacency matrix (as a dataframe) and its corresponding name """ def plot_name(name): return join(plot_dir, adj_name, name) adj_name = adj_dict['name'] adj = adj_dict['adj']
default='all') parser.add_argument( '--sample', help= 'Specifies what sample to run. Options: "discovery", "validation", "incomplete").', nargs='+', default=['discovery', 'validation', 'incomplete']) # get options args = parser.parse_args() job = args.job sample = args.sample print('Running Script. Job %s, sample: %s' % (job, sample)) #load Data token = get_info('expfactory_token') try: data_dir = get_info('data_directory') except Exception: data_dir = path.join(get_info('base_directory'), 'Data') if job == 'download' or job == "all": print('Beginning "Download"') #*************************************************** # ********* Load Data ********************** #************************************************** pd.set_option('display.width', 200) figsize = [16, 12] #set up filters filters = get_filters() drop_columns = ['battery_description', 'experiment_reference', 'experiment_version', \
from selfregulation.utils.utils import get_info

# command-line interface: choose which stage of the fMRI pipeline to run
parser = argparse.ArgumentParser(
    description='fMRI Analysis Entrypoint Script.')
parser.add_argument(
    '--job',
    help=
    'Specifies what part of the script to run. Options: download, extras, post, all").',
    default='post')
# get options
args = parser.parse_args()
job = args.job

# load Data
token = get_info('expfactory_token')
try:
    data_dir = get_info('data_directory')
except Exception:
    # fall back to <base_directory>/Data when no data directory is configured
    data_dir = path.join(get_info('base_directory'), 'Data')

if job == 'download' or job == "all":
    #***************************************************
    # ********* Load Data **********************
    #**************************************************
    pd.set_option('display.width', 200)
    figsize = [16, 12]
    # set up filters: drop verbose metadata columns from downloaded results
    filters = get_filters()
    drop_columns = ['battery_description', 'experiment_reference',
                    'experiment_version', 'experiment_name',
                    'experiment_cognitive_atlas_task']
# Download setup for the Self Regulation retest battery: configures the
# Expfactory token, the local output directory, and the result filters.
from expanalysis.experiments.utils import remove_duplicates, result_filter
from expanalysis.results import get_filters, get_result_fields
from expanalysis.results import Result
from os import path, makedirs
import pickle
from selfregulation.utils.utils import get_info

# set token and data directory
token = get_info(
    'expfactory_token',
    infile=
    '/oak/stanford/groups/russpold/users/zenkavi/Self_Regulation_Ontology/Self_Regulation_Retest_Settings.txt'
)
data_dir = path.join(
    '/oak/stanford/groups/russpold/users/zenkavi/Self_Regulation_Ontology/Data/',
    'Retest_01-23-2018', 'Local')
if not path.exists(data_dir):
    makedirs(data_dir)

# Set up filters: drop verbose metadata columns from downloaded results
filters = get_filters()
drop_columns = ['battery_description', 'experiment_reference',
                'experiment_version', 'experiment_name',
                'experiment_cognitive_atlas_task']
for col in drop_columns:
    filters[col] = {'drop': True}

# Strip token from specified file
f = open(token)
# remaining CLI options for the prediction run
parser.add_argument('-j', "--n_jobs", help="number of processors",
                    type=int, default=2)
parser.add_argument('-w', "--workdir", help="working directory")
parser.add_argument('-r', "--resultsdir", help="results directory")
parser.add_argument("--singlevar", nargs='*',
                    help="run with single variables")
parser.add_argument('--imputer', help='imputer to use',
                    default='SimpleFill')
parser.add_argument("--smote_threshold",
                    help="threshold for applying smote (distance from 0.5)",
                    type=float, default=0.05)
args = parser.parse_args()

# parameters to set: results go under <resultsdir>/prediction_outputs
if args.resultsdir is None:
    try:
        output_base = get_info('results_directory')
    except:
        # NOTE(review): bare except silently falls back to the cwd when
        # the results directory cannot be read from config
        output_base = '.'
else:
    output_base = args.resultsdir
output_dir = os.path.join(output_base, 'prediction_outputs')
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

#assert args.dataset in ['survey','mirt','task','all','baseline']
assert args.classifier in ['lasso', 'rf']

# don't regress out baseline vars for baseline model
if args.dataset == 'baseline' or args.no_baseline_vars:
    baselinevars = False
    if args.verbose:
        print("turning off inclusion of baseline vars")
from dimensional_structure.utils import transfer_scores
from selfregulation.utils.utils import get_behav_data, get_info
from selfregulation.utils.result_utils import load_results

# get contextualizing results (factor solutions from the main MTurk sample)
results_dataset = 'Complete_03-29-2018'
results = load_results(datafile=results_dataset)

# get fmri data
fmri_dataset = 'Fmri_Followup_10-22-2018'
data = get_behav_data(dataset=fmri_dataset)

# remove data where participants are missing more than 20% of the tasks
tasks = data.copy()
tasks.columns = [i.split('.')[0] for i in tasks.columns]
# count missing tasks per subject; keep subjects missing fewer than 12
successful_subjects = (tasks.isnull().reset_index().melt(id_vars=['index'])
                       .groupby(['index', 'variable']).mean()
                       .groupby('index').sum() < 12)
successful_subjects = successful_subjects[successful_subjects['value']]
data = data.loc[successful_subjects.index]

# project the fMRI sample onto the MTurk factor solutions
task_scores = transfer_scores(data, results['task'])
survey_scores = transfer_scores(data, results['survey'])

# save the scores
basename = 'factorscores_results-%s.csv' % results_dataset
task_scores.to_csv(
    path.join(get_info('base_directory'), 'Data', fmri_dataset,
              'task_' + basename))
survey_scores.to_csv(
    path.join(get_info('base_directory'), 'Data', fmri_dataset,
              'survey_' + basename))
find_intersection, get_fully_connected_threshold, remove_island_variables from selfregulation.utils.graph_utils import Graph_Analysis, threshold, \ threshold_proportional_sign from selfregulation.utils.utils import get_behav_data, get_info import bct import igraph import numpy as np from os.path import join, exists from os import makedirs import pandas as pd import seaborn as sns # generic variables save_plots = False plot_dir = join(get_info('base_directory'),'dimensional_structure','Plots') # get dependent variables graph_data = get_behav_data(file = 'taskdata_imputed.csv') def run_graph_analysis(adj_dict, save_plots=False): """ Takes in a dictionary with two keys: "name" and "adj", specifying an adjacency matrix (as a dataframe) and its corresponding name """ def plot_name(name): return join(plot_dir,adj_name,name) adj_name = adj_dict['name']
from selfregulation.utils.utils import get_info, get_recent_dataset, get_retest_data
from selfregulation.utils.result_utils import load_results

# In[ ]:

# notebook magic: render matplotlib figures inline
get_ipython().run_line_magic('matplotlib', 'inline')

# import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import seaborn as sns

# In[ ]:

dataset = get_recent_dataset()
results_dir = get_info('results_directory')
# reconstruction outputs live under
# <results_dir>/ontology_reconstruction/<dataset>/*/oblimin
ontology_results_dir = path.join(results_dir, 'ontology_reconstruction',
                                 dataset, '*', 'oblimin')
retest_data = get_retest_data(dataset.replace('Complete', 'Retest'))
# take the first matching Plots directory from the glob
plot_dir = glob(path.join(ontology_results_dir, 'Plots'))[0]
save = True

# In[ ]:

results = load_results(dataset)['task']
c = results.EFA.get_c()

# # Load Reconstructions

# In[ ]:
#!/usr/bin/env python3 import argparse from expanalysis.experiments.processing import get_exp_DVs from glob import glob from os import path import pandas as pd from selfregulation.utils.utils import get_info try: data_dir=get_info('data_directory') except Exception: data_dir=path.join(get_info('base_directory'),'Data') # parse arguments parser = argparse.ArgumentParser() parser.add_argument('exp_id') parser.add_argument('data') parser.add_argument('--no_group', action='store_false') # HDDM params parser.add_argument('--out_dir', default=data_dir) parser.add_argument('--hddm_samples', default=None, type=int) parser.add_argument('--hddm_burn', default=None, type=int) parser.add_argument('--hddm_thin', default=None, type=int) parser.add_argument('--no_parallel', action='store_false') parser.add_argument('--num_cores', default=None, type=int) parser.add_argument('--mode', default=None, type=str) args = parser.parse_args()
# Plotting entrypoint for the ideology-prediction analysis: loads the
# pickled predictions and sets up output directories.
from collections import OrderedDict as odict
from os import makedirs, path
import numpy as np
import pandas as pd
import pickle
from ideological_prediction.plot_utils import (plot_RSA,
                                               plot_outcome_ontological_similarity,
                                               plot_prediction,
                                               plot_prediction_scatter,
                                               importance_bar_plots,
                                               importance_polar_plots,
                                               plot_predictors_comparison)
from selfregulation.utils.utils import get_info

results_dir = path.join(get_info('results_directory'), 'ideology_prediction')
plot_dir = path.join(results_dir, 'Plots')
makedirs(plot_dir, exist_ok=True)

# load predictions saved by the ideology-prediction run
rotate = 'oblimin'
ext = 'pdf'
data = pickle.load(open(path.join(results_dir, 'ideo_predictions.pkl'), 'rb'))
all_predictions = data['all_predictions']
all_shuffled_predictions = data['all_shuffled_predictions']
predictors = data['predictors']
targets = data['targets']
RSA = {}

# plot RSA for ideological variables
ideological_correlations = {}
from os import path
import pandas as pd
from selfregulation.utils.data_preparation_utils import calc_trial_order, \
    convert_date, convert_fmri_ids, download_data, get_bonuses, get_fmri_pay, \
    quality_check_correction
from selfregulation.utils.utils import get_info

# command-line interface: choose which stage of the fMRI pipeline to run
parser = argparse.ArgumentParser(description='fMRI Analysis Entrypoint Script.')
parser.add_argument('--job',
                    help='Specifies what part of the script to run. Options: download, extras, post, all").',
                    default='post')
# get options
args = parser.parse_args()
job = args.job

# load Data
token = get_info('expfactory_token')
try:
    data_dir = get_info('data_directory')
except Exception:
    # fall back to <base_directory>/Data when no data directory is configured
    data_dir = path.join(get_info('base_directory'), 'Data')

if job == 'download' or job == "all":
    #***************************************************
    # ********* Load Data **********************
    #**************************************************
    pd.set_option('display.width', 200)
    figsize = [16, 12]
    # set up filters: drop verbose metadata columns from downloaded results
    filters = get_filters()
    drop_columns = ['battery_description', 'experiment_reference',
                    'experiment_version', 'experiment_name',
                    'experiment_cognitive_atlas_task']
file in the same folder as these metadata files. """ from metadata_validator import validate_exp from collections import OrderedDict import pandas,unicodedata import os,pickle,sys,math import json from selfregulation.utils.utils import get_info,get_behav_data,get_item_metadata,get_var_category from measure_dictionaries import measure_longnames,measure_termurls,measure_sobcurls basedir=get_info('base_directory') dataset=get_info('dataset') outdir=os.path.join(basedir,'Data/Derived_Data/%s'%dataset) def get_subscale_vars(): subscale_data=pandas.read_csv('../references/survey_subscale_reference.csv', index_col=0) subscale_var_dict={} for v in subscale_data.index: if subscale_data.loc[v].iloc[2]=='sum items': subscale_var_dict[v]='SumAll' elif subscale_data.loc[v].iloc[2]=='mean items': subscale_var_dict[v]='MeanAll' else: d=[] for i in subscale_data.loc[v]:
varNums = '' if k == 'eating_survey': varNums = 'custom(%s)' % varNums measure_level_data.append([ vname, shortname, m['title'], m['dataElements'][e]['title'], m['measureType'], varNums ]) measure_level_df = pandas.DataFrame(measure_level_data, columns=[ 'ExpFactoryName', 'ShortName', 'MeasureName', 'VariableName', 'MeasureType', 'SubscaleVarNums' ]) # doublecheck that all meaningful variables are here dataset = get_info('dataset') behavdata = get_behav_data(dataset) measurevars = measure_level_df.ExpFactoryName.tolist() for v in behavdata.columns: assert v in measurevars measure_level_df.to_csv('meaningful_variables_metadata.csv', index=False) #save item level data item_level_df = pandas.DataFrame(item_level_data, columns=[ 'ExpFactoryName', 'MeasureName', 'QuestionNumber', 'QuestionText', 'Scoring', 'ResponseOptions' ]) item_level_df.to_csv('item_level_metadata.csv', index=False)
#!/usr/bin/env python3
"""Concatenate the discovery and validation MTurk datasets into one
'complete' dataset, then split it into two worker subsets for
memory-intensive analyses (DDM)."""
from os import path
import pandas as pd
from selfregulation.utils.utils import get_info

try:
    data_dir = get_info('data_directory')
except Exception:
    # fall back to <base_directory>/Data when no data directory is configured
    data_dir = path.join(get_info('base_directory'), 'Data')

complete = None
# concatenate discovery and validation data into one complete dataset
discovery_path = path.join(data_dir, 'mturk_discovery_data_post.pkl')
validation_path = path.join(data_dir, 'mturk_validation_data_post.pkl')
complete_path = path.join(data_dir, 'mturk_complete_data_post.pkl')
if path.exists(discovery_path) and path.exists(validation_path):
    discovery = pd.read_pickle(discovery_path)
    validation = pd.read_pickle(validation_path)
    complete = pd.concat([discovery, validation])
    complete.to_pickle(complete_path)

# separate complete into two data subsets for particularly memory
# intensive analyses (DDM)
if path.exists(complete_path):
    # BUG FIX: "if not complete:" raises ValueError on a DataFrame
    # ("truth value of a DataFrame is ambiguous"); test for None instead
    if complete is None:
        complete = pd.read_pickle(complete_path)
    # split workers in half so each subset fits in memory
    workers = complete.worker_id.unique()
    mid = len(workers) // 2
    subset1 = complete.query('worker_id in %s' % list(workers)[:mid])
    subset2 = complete.query('worker_id in %s' % list(workers)[mid:])
    subset1.to_pickle(
        path.join(data_dir, 'mturk_complete_subset1_data_post.pkl'))
# Setup for the ontology-reconstruction MDS/clustering analysis: loads
# the task results and selects a subset of task variables to work with.
import matplotlib.pyplot as plt
import numpy as np
from os import makedirs, path
import pandas as pd
from scipy.spatial.distance import squareform
from sklearn.manifold import MDS
import seaborn as sns

from dimensional_structure.HCA_plots import abs_pdist
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.utils import get_info, get_recent_dataset

# get dataset of interest
basedir = get_info('base_directory')
dataset = get_recent_dataset()
dataset = path.join(basedir, 'Data', dataset)
datafile = dataset.split(path.sep)[-1]

# load data
results = load_results(datafile)
data = results['task'].data
out = results['task'].EFA.get_loading()
nfactors = out.shape[1]
# focus on choice-RT variables plus a slice of stop-signal variables
task_subset = pd.concat([
    out.filter(regex='choice_reaction_time', axis=0),
    out.filter(regex='^stop_signal\.(hddm|SSRT)', axis=0)[1:5]
])
task_subset_data = data.loc[:, task_subset.index]
task_variables = list(task_subset.index)
plot_dir = output_dir = path.join(get_info('results_directory'),
                                  'ontology_reconstruction',
                                  results['task'].ID, 'Plots')
# Setup for dataset description plots: styling, output format, and a
# disabled reference block for regenerating raw-data plots.
import matplotlib.pyplot as plt
import numpy as np
from os import path
import pandas as pd
import seaborn as sns
from shutil import copyfile

from selfregulation.utils.utils import (get_behav_data, get_info,
                                        get_recent_dataset, get_retest_data,
                                        get_var_category)
from selfregulation.utils.plot_utils import format_num

sns.set_palette("Set1", 8, .75)
base_dir = get_info('base_directory')
ext = 'pdf'
dpi = 300

# Raw Data Plots
# NOTE(review): the block below is disabled via a bare string literal;
# kept as reference code for regenerating the raw-data plots.
"""
# Load data if plots need to be regenerated
post_process_data_loc = ''
data = pd.load_pickle(post_process_data_loc)
# plt total time on tasks
(data.groupby('worker_id').ontask_time.sum()/3600).hist(bins=40,
                                                        grid=False,
                                                        density=True,
                                                        figsize=(12,8))
plt.xlabel('Time (Hours)')
plt.title('Total Time on Tasks', weight='bold')
#!/usr/bin/env python3 """ export metdata to csv for Mackinnon group """ import os,pickle,sys import json from selfregulation.utils.utils import get_info,get_behav_data basedir=get_info('base_directory') dataset=get_info('dataset') print('using dataset:',dataset) datadir=os.path.join(basedir,'Data/%s'%dataset) outdir=os.path.join(basedir,'Data/Derived_Data/%s'%dataset) metadata=pickle.load(open(os.path.join(outdir,'survey_metadata.pkl'),'rb')) surveys=list(metadata.keys()) surveys.sort() with open(os.path.join(outdir,'survey_metadata.tsv'),'w') as f: for s in surveys: items=list(metadata[s].keys()) items.sort() items.remove('MeasurementToolMetadata') for i in items: print(metadata[s][i]) levels=list(metadata[s][i]['Levels'].keys()) levels.sort() options='\t'.join(['%s:%s'%(k,metadata[s][i]['Levels'][k]) for k in levels]) f.write('%s\t%s\t%s\n'%(i, metadata[s][i]['Description'], options))
if 'subscaleVarNums' in m['dataElements'][e]: varNums=m['dataElements'][e]['subscaleVarNums'] else: varNums='' if k=='eating_survey': varNums='custom(%s)'%varNums measure_level_data.append([vname,shortname, m['title'],m['dataElements'][e]['title'], m['measureType'],varNums]) measure_level_df=pandas.DataFrame(measure_level_data, columns=['ExpFactoryName','ShortName','MeasureName','VariableName', 'MeasureType','SubscaleVarNums']) # doublecheck that all meaningful variables are here dataset=get_info('dataset') behavdata=get_behav_data(dataset) measurevars=measure_level_df.ExpFactoryName.tolist() for v in behavdata.columns: assert v in measurevars measure_level_df.to_csv('meaningful_variables_metadata.csv',index=False) #save item level data item_level_df=pandas.DataFrame(item_level_data, columns=['ExpFactoryName','MeasureName','QuestionNumber', 'QuestionText','Scoring','ResponseOptions']) item_level_df.to_csv('item_level_metadata.csv',index=False) outcome_df=pandas.DataFrame(outcome_data, columns=['ExpFactoryName','MeasureName', 'QuestionText','Scoring','ResponseOptions'])
else: dataset = get_recent_dataset() # In[ ]: # additional setup np.random.seed(12412) results = load_results(dataset)['task'] c = results.EFA.results['num_factors'] classifiers = {'Ridge': Ridge(fit_intercept=False), 'LR': LinearRegression(fit_intercept=False)} # get output dir to store results output_dir = path.join(get_info('results_directory'), 'ontology_reconstruction', results.ID) makedirs(output_dir, exist_ok=True) # In[ ]: # get a random subset of variables to perform the calculation on if n_vars is set if n_vars is not None: var_list = np.random.choice(results.data.columns, n_vars, replace=False) else: var_list = results.data.columns # Run simulation for every variable at different population sizes.
# Setup for the ontology-reconstruction MDS/clustering analysis: loads
# the task results and selects a subset of task variables to work with.
import matplotlib.pyplot as plt
import numpy as np
from os import makedirs, path
import pandas as pd
from scipy.spatial.distance import squareform
from sklearn.manifold import MDS
import seaborn as sns

from dimensional_structure.HCA_plots import abs_pdist
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.utils import get_info, get_recent_dataset

# get dataset of interest
basedir = get_info('base_directory')
dataset = get_recent_dataset()
dataset = path.join(basedir, 'Data', dataset)
datafile = dataset.split(path.sep)[-1]

# load data
results = load_results(datafile)
data = results['task'].data
out = results['task'].EFA.get_loading()
nfactors = out.shape[1]
# focus on choice-RT variables plus a slice of stop-signal variables
task_subset = pd.concat([
    out.filter(regex='choice_reaction_time', axis=0),
    out.filter(regex='^stop_signal\.(hddm|SSRT)', axis=0)[1:5]])
task_subset_data = data.loc[:, task_subset.index]
task_variables = list(task_subset.index)
plot_dir = output_dir = path.join(get_info('results_directory'),
                                  'ontology_reconstruction',
                                  results['task'].ID, 'Plots')
makedirs(plot_dir, exist_ok=True)
# ## Additional Setup # In[ ]: # Load dataset np.random.seed(12412) results = load_results(dataset)[result_subset] c = results.EFA.get_c() # Define classifiers classifiers = { 'Ridge': Ridge(fit_intercept=False), 'LR': LinearRegression(fit_intercept=False) } # get output dir to store results output_dir = path.join(get_info('results_directory'), 'ontology_reconstruction', dataset, results.ID, EFA_rotation) makedirs(output_dir, exist_ok=True) # get plot dir to store plots plot_dir = path.join(output_dir, 'Plots') makedirs(plot_dir, exist_ok=True) # In[ ]: # get a random subset of variables to perform the calculation on if n_vars is set measures = np.unique([i.split('.')[0] for i in results.data.columns]) if n_measures is not None: measure_list = np.random.choice(measures, n_measures, replace=False) else: measure_list = measures
# Setup for a column-shuffling permutation analysis of the imputed
# behavioral data.
import sys, os
import random
import pickle
import pandas, numpy
from selfregulation.utils.utils import get_info, get_behav_data
import fancyimpute
import matplotlib.pyplot as plt

dataset = 'Complete_10-27-2017'
basedir = get_info('base_directory')

# command line arguments: number of shuffle runs, output dir, data file
nruns = int(sys.argv[1])
outdir = sys.argv[2]
datafile = sys.argv[3]
if not os.path.exists(outdir):
    os.mkdir(outdir)

#datafile=os.path.join(basedir,
#    'Data/Derived_Data/%s/behavdata_imputed_cleaned.csv'%dataset)
df = pandas.read_csv(datafile)


# shuffle, recompute, and store maximum for each run
def col_shuffle(df, test=False):
    """
    shuffle data within each column
    """
    # test mode: return the data unshuffled (continues past this view)
    if test:
        return (df)
    df_shuf = df.copy()
from selfregulation.utils.result_utils import load_results # get contextualizing results results_dataset = 'Complete_03-29-2018' results = load_results(datafile=results_dataset) # get fmri data fmri_dataset= 'Fmri_Followup_10-22-2018' data = get_behav_data(dataset=fmri_dataset) # remove data where participants are missing more than 20% of the tasks tasks = data.copy() tasks.columns = [i.split('.')[0] for i in tasks.columns] successful_subjects = (tasks.isnull().reset_index().melt(id_vars=['index']) \ .groupby(['index', 'variable']).mean() \ .groupby('index').sum()<12) successful_subjects = successful_subjects[successful_subjects['value']] data = data.loc[successful_subjects.index] task_scores = transfer_scores(data, results['task']) survey_scores = transfer_scores(data, results['survey']) # save the scores basename = 'factorscores_results-%s.csv' % results_dataset task_scores.to_csv(path.join(get_info('base_directory'), 'Data', fmri_dataset, 'task_'+basename)) survey_scores.to_csv(path.join(get_info('base_directory'), 'Data', fmri_dataset, 'survey_'+basename))