def extract_row(row, clean = True, apply_post = True, drop_columns = None):
    '''Expands the data held in a single results row into a trial dataframe

    :row: one row of a Results data dataframe
    :param clean: boolean, if true call clean_data on the extracted data
    :param apply_post: boolean forwarded to clean_data for rows that have not
        been post processed (post processed rows always forward False)
    :param drop_columns: list of columns to pass to clean_data
    :return df: dataframe containing the extracted experiment, indexed by
        "<experiment id>_<zero padded trial number>"
    '''
    exp_id = row['experiment_exp_id']
    already_post = row.get('process_stage') == 'post'
    if already_post:
        # Stored post-processed data is split into values, column names and
        # index labels; rebuild the frame from those three pieces.
        width = len(str(len(row['data']['index'])))
        df = pandas.DataFrame(row['data']['trialdata'])
        df.columns = row['data']['columns']
        padded_labels = []
        for label in row['data']['index']:
            *prefix, number = label.split('_')
            # zero pad the trailing trial number so that a plain string
            # sort of the index follows trial order
            padded_labels.append('_'.join(prefix) + '_' + number.zfill(width))
        df.index = padded_labels
        df.sort_index(inplace = True)
    else:
        exp_data = get_data(row)
        # annotate every trial (in place) with the identifiers of its row
        for trial in exp_data:
            for field in ('battery_name', 'experiment_exp_id', 'worker_id', 'finishtime'):
                trial[field] = row[field]
        df = pandas.DataFrame(exp_data)
        width = len(str(len(exp_data)))
        df.index = ["%s_%s" % (exp_id, str(number).zfill(width))
                    for number in range(len(exp_data))]
    if clean == True:
        # post processed data must not have post processing applied again
        post_flag = False if already_post else apply_post
        df = clean_data(df, row['experiment_exp_id'], post_flag, drop_columns)
    return df
def extract_row(row, clean = True, apply_post = True, drop_columns = None):
    '''Returns a dataframe that has expanded the data of one row of a results object

    :row: one row of a Results data dataframe
    :param clean: boolean, if true call clean_data on the data
    :param apply_post: boolean passed to clean_data for rows that have not
        been post processed (post processed rows always pass False)
    :param drop_columns: list of columns to pass to clean_data
    :return df: dataframe containing the extracted experiment, indexed by
        "<experiment id>_<trial number>" and ordered by trial number
    '''
    exp_id = row['experiment_exp_id']
    if row.get('process_stage') == 'post':
        df = pandas.DataFrame(row['data'])
        # Order rows by the numeric suffix of their index label. The order is
        # computed outside the frame: a temporary 'sort' column would
        # overwrite (and then drop) a genuine data column named 'sort'.
        trial_numbers = [int(label.split('_')[-1]) for label in df.index]
        order = sorted(range(len(trial_numbers)), key=trial_numbers.__getitem__)
        # copy so later in-place edits never touch row['data']
        df = df.iloc[order].copy()
        if clean:
            # already post processed, so never apply post processing again
            df = clean_data(df, exp_id, False, drop_columns)
    else:
        exp_data = get_data(row)
        # annotate every trial (in place) with the identifiers of its row
        for trial in exp_data:
            trial['battery_name'] = row['battery_name']
            trial['experiment_exp_id'] = row['experiment_exp_id']
            trial['worker_id'] = row['worker_id']
            trial['finishtime'] = row['finishtime']
        df = pandas.DataFrame(exp_data)
        df.index = ["%s_%s" % (exp_id, x) for x in range(len(exp_data))]
        if clean:
            df = clean_data(df, exp_id, apply_post, drop_columns)
    return df
def extract_row(row, clean=True, apply_post=True, drop_columns=None):
    '''Returns a dataframe that has expanded the data of one row of a results object

    :row: one row of a Results data dataframe
    :param clean: boolean, if true call clean_data on the data
    :param apply_post: boolean passed to clean_data for rows that have not
        been post processed (post processed rows always pass False)
    :param drop_columns: list of columns to pass to clean_data
    :return df: dataframe containing the extracted experiment, indexed by
        "<experiment id>_<trial number>" and ordered by trial number
    '''
    exp_id = row['experiment_exp_id']
    if row.get('process_stage') == 'post':
        df = pandas.DataFrame(row['data'])
        # Sort positionally by the trailing trial number of each label
        # instead of adding a scratch 'sort' column, which would clobber and
        # then delete a real data column that happens to be called 'sort'.
        suffixes = [int(label.rsplit('_', 1)[-1]) for label in df.index]
        positions = sorted(range(len(suffixes)), key=suffixes.__getitem__)
        # copy so the result is independent of the frame stored in the row
        df = df.iloc[positions].copy()
        if clean:
            # data is already post processed: do not post process it again
            df = clean_data(df, exp_id, False, drop_columns)
    else:
        exp_data = get_data(row)
        # tag each trial (in place) with the identifiers of the parent row
        for trial in exp_data:
            trial['battery_name'] = row['battery_name']
            trial['experiment_exp_id'] = row['experiment_exp_id']
            trial['worker_id'] = row['worker_id']
            trial['finishtime'] = row['finishtime']
        df = pandas.DataFrame(exp_data)
        df.index = ["%s_%s" % (exp_id, x) for x in range(len(exp_data))]
        if clean:
            df = clean_data(df, exp_id, apply_post, drop_columns)
    return df
def calc_time_taken(data):
    '''Adds experiment timing columns to a results dataframe (in place)

    For every row whose 'experiment_template' is 'jspsych' the trial data is
    loaded with get_data and two durations are computed from the trials'
    'time_elapsed' values (divided by 1000 - presumably milliseconds to
    seconds, confirm against the data source). All other rows get NaN.

    Three columns are written to data: 'total_time', 'instruct_time' and
    'ontask_time' (total_time - instruct_time). Nothing is returned.

    :data: dataframe of results with an 'experiment_template' column
    :raises AssertionError: if a jspsych dataset is empty or its last trial
        has no "time_elapsed" entry
    '''
    instruction_lengths = []
    exp_lengths = []
    for _, row in data.iterrows():
        if row['experiment_template'] != 'jspsych':
            instruction_lengths.append(numpy.nan)
            exp_lengths.append(numpy.nan)
            continue
        exp_data = get_data(row)
        # ensure there is a time elapsed variable; raised explicitly (rather
        # than with an assert statement) so the check survives "python -O"
        if len(exp_data) == 0 or 'time_elapsed' not in exp_data[-1]:
            raise AssertionError(
                '"time_elapsed" not found for at least one dataset in these results')
        # sum time taken on instruction trials
        instruction_length = numpy.sum([
            trial['time_elapsed'] for trial in exp_data
            if lookup_val(trial.get('trial_id')) == 'instruction'
        ])
        # set the length of the experiment to the time elapsed on the last
        # jsPsych trial
        experiment_length = exp_data[-1]['time_elapsed']
        instruction_lengths.append(instruction_length / 1000.0)
        exp_lengths.append(experiment_length / 1000.0)
    data.loc[:, 'total_time'] = exp_lengths
    data.loc[:, 'instruct_time'] = instruction_lengths
    data.loc[:, 'ontask_time'] = data['total_time'] - data['instruct_time']
def get_post_task_responses(data):
    '''Adds a 'post_task_responses' column to a results dataframe (in place)

    For every row whose 'experiment_template' is 'jspsych', the second to
    last trial returned by get_data is inspected; if its 'trial_id' is
    'post task questions' and it has a 'responses' entry, that entry is
    stored for the row. Every other row is left as NaN. Nothing is returned.

    :data: dataframe of results with an 'experiment_template' column
    '''
    question_responses = [numpy.nan] * len(data)
    # enumerate gives the row *position*; the index label from iterrows is
    # not a valid list offset unless the index happens to be 0..n-1
    for position, (_, row) in enumerate(data.iterrows()):
        if row['experiment_template'] != 'jspsych':
            continue
        row_data = get_data(row)
        # datasets with fewer than two trials have no second to last trial
        if len(row_data) < 2:
            continue
        candidate = row_data[-2]
        if candidate.get('trial_id') == 'post task questions' and \
                'responses' in candidate:
            question_responses[position] = candidate['responses']
    data.loc[:, 'post_task_responses'] = question_responses
def extract_experiment(data, exp_id, clean = True, apply_post = True, drop_columns = None, return_reject = False, clean_fun = clean_data):
    '''Returns a dataframe that has expanded the data column of the results
    object for the specified experiment. Each row of this new dataframe is a
    data row for the specified experiment.

    :data: the data from an expanalysis Result object
    :exp_id: a string identifying one experiment
    :param clean: boolean, if true clean the extracted data
    :param apply_post: boolean passed to clean_fun for data that has not
        been post processed
    :param drop_columns: list of columns to pass to the clean function
    :param return_reject: bool, default false. If true also returns a
        dataframe with the rejected (flagged) datasets; this is empty when
        data has no 'flagged' column
    :param clean_fun: an alternative "clean" function. Must return a
        dataframe of the cleaned data. Only used for data that has not been
        post processed (post processed rows are cleaned by extract_row)
    :return df: dataframe containing the extracted experiment, or the tuple
        (df, df_reject) if return_reject is true. NOTE: when every dataset
        is flagged the tuple (df, df_reject) is returned regardless of
        return_reject
    :raises AssertionError: if a battery/experiment/worker combination has
        more than one dataset
    :raises ValueError: if the selected datasets mix process stages
    '''
    trial_index = []
    df = select_experiment(data, exp_id)
    if 'flagged' in df.columns:
        df_reject = df.query('flagged == True')
        df = df.query('flagged == False')
        if len(df) == 0:
            print('All %s datasets were flagged' % exp_id)
            return df, df_reject
    else:
        # without a 'flagged' column nothing is rejected; keep df_reject
        # defined so return_reject never hits an unbound name
        df_reject = df.iloc[0:0]
    # ensure there is only one dataset for each battery/experiment/worker
    # combination; raised explicitly so the check survives "python -O"
    duplicated = df.groupby(['battery_name', 'experiment_exp_id', 'worker_id']).size() > 1
    if duplicated.any():
        raise AssertionError(
            "More than one dataset found for at least one battery/experiment/worker combination")
    stages = numpy.unique(df.get('process_stage'))
    if stages.size > 1:
        raise ValueError(
            "Datasets for %s mix different 'process_stage' values" % exp_id)
    if stages.size == 1 and stages[0] == 'post':
        frames = []
        for i, row in df.iterrows():
            tmp_df = extract_row(row, clean, False, drop_columns)
            if len(tmp_df) == 0:
                continue
            frames.append(tmp_df)
            # insert the results row label before the trailing trial number:
            # "<exp>_<trial>" becomes "<exp>_<row>_<trial>"
            insert_i = tmp_df.index[0].rfind('_')
            trial_index += [x[:insert_i] + '_%s' % i + x[insert_i:] for x in tmp_df.index]
        # concatenate once instead of growing a frame inside the loop
        df = pandas.concat(frames) if frames else pandas.DataFrame()
        df.index = trial_index
        # sort by (row label, trial number) positionally; a scratch 'sort'
        # column would clobber a real data column of the same name
        sort_keys = [(int(parts[-2]), int(parts[-1]))
                     for parts in (label.split('_') for label in trial_index)]
        order = sorted(range(len(sort_keys)), key=sort_keys.__getitem__)
        df = df.iloc[order].copy()
    else:
        trial_list = []
        for i, row in df.iterrows():
            exp_data = get_data(row)
            # annotate every trial (in place) with the identifiers of its row
            for trial in exp_data:
                trial['battery_name'] = row['battery_name']
                trial['experiment_exp_id'] = row['experiment_exp_id']
                trial['worker_id'] = row['worker_id']
                trial['finishtime'] = row['finishtime']
            trial_list += exp_data
            trial_index += ["%s_%s_%s" % (exp_id, i, x) for x in range(len(exp_data))]
        df = pandas.DataFrame(trial_list)
        df.index = trial_index
        if clean:
            df = clean_fun(df, exp_id, apply_post, drop_columns)
    if return_reject:
        return df, df_reject
    else:
        return df
def get_post_task_responses(data):
    '''Stores each dataset's post task question responses on the dataframe

    Writes a 'post_task_responses' column to data in place and returns
    nothing. A row receives the 'responses' entry of its second to last
    trial (as returned by get_data) when the row's 'experiment_template' is
    'jspsych' and that trial's 'trial_id' is 'post task questions'; all other
    rows receive NaN.

    :data: dataframe of results with an 'experiment_template' column
    '''
    question_responses = [numpy.nan] * len(data)
    row_number = 0
    for _, row in data.iterrows():
        # track the positional offset ourselves: the iterrows label is an
        # index label and only equals the position for a default RangeIndex
        slot = row_number
        row_number += 1
        if row['experiment_template'] != 'jspsych':
            continue
        row_data = get_data(row)
        # a second to last trial only exists with at least two trials
        if len(row_data) < 2:
            continue
        second_to_last = row_data[-2]
        if second_to_last.get('trial_id') == 'post task questions' and \
                'responses' in second_to_last:
            question_responses[slot] = second_to_last['responses']
    data.loc[:,'post_task_responses'] = question_responses
def extract_row(row, clean=True, apply_post=True, drop_columns=None):
    '''Builds the trial level dataframe for one row of a results object

    :row: one row of a Results data dataframe
    :param clean: boolean, if true call clean_data on the extracted data
    :param apply_post: boolean handed to clean_data for rows that are not
        yet post processed (post processed rows always hand over False)
    :param drop_columns: list of columns to pass to clean_data
    :return df: dataframe containing the extracted experiment; its index is
        "<experiment id>_<zero padded trial number>"
    '''
    exp_id = row['experiment_exp_id']

    if row.get('process_stage') != 'post':
        # raw data: load the trials and stamp each one (in place) with the
        # identifying fields of the row it came from
        exp_data = get_data(row)
        for trial in exp_data:
            trial['battery_name'] = row['battery_name']
            trial['experiment_exp_id'] = row['experiment_exp_id']
            trial['worker_id'] = row['worker_id']
            trial['finishtime'] = row['finishtime']
        df = pandas.DataFrame(exp_data)
        n_trials = len(exp_data)
        pad = len(str(n_trials))
        df.index = ["%s_%s" % (exp_id, str(k).zfill(pad)) for k in range(n_trials)]
        if clean == True:
            df = clean_data(df, row['experiment_exp_id'], apply_post, drop_columns)
        return df

    # post processed data: values, column names and index labels are stored
    # separately and are reassembled here
    pad = len(str(len(row['data']['index'])))
    df = pandas.DataFrame(row['data']['trialdata'])
    df.columns = row['data']['columns']
    new_index = []
    for label in row['data']['index']:
        head, _, tail = label.rpartition('_')
        # pad the trailing trial number so a string sort follows trial order
        new_index.append(head + '_' + tail.zfill(pad))
    df.index = new_index
    df.sort_index(inplace=True)
    if clean == True:
        # never re-apply post processing to post processed data
        df = clean_data(df, row['experiment_exp_id'], False, drop_columns)
    return df
def calc_time_taken(data):
    '''Computes how long each dataset took and stores it on the dataframe

    Writes three columns to data in place and returns nothing:
    'total_time' (the 'time_elapsed' of the last trial), 'instruct_time'
    (the summed 'time_elapsed' of trials whose looked-up trial_id is
    'instruction') and 'ontask_time' (their difference). Values are divided
    by 1000 (presumably converting milliseconds to seconds - confirm against
    the data source). Rows whose 'experiment_template' is not 'jspsych'
    receive NaN in all three columns.

    :data: dataframe of results with an 'experiment_template' column
    :raises AssertionError: if a jspsych dataset is empty or its last trial
        has no "time_elapsed" entry
    '''
    instruction_lengths = []
    exp_lengths = []
    for _, row in data.iterrows():
        if row['experiment_template'] == 'jspsych':
            exp_data = get_data(row)
            #ensure there is a time elapsed variable. An explicit raise is
            #used because assert statements are removed under "python -O"
            if len(exp_data) == 0 or 'time_elapsed' not in exp_data[-1]:
                raise AssertionError(
                    '"time_elapsed" not found for at least one dataset in these results')
            #sum time taken on instruction trials
            instruction_length = numpy.sum([trial['time_elapsed'] for trial in exp_data
                if lookup_val(trial.get('trial_id')) == 'instruction'])
            #Set the length of the experiment to the time elapsed on the last
            #jsPsych trial
            experiment_length = exp_data[-1]['time_elapsed']
            instruction_lengths.append(instruction_length/1000.0)
            exp_lengths.append(experiment_length/1000.0)
        else:
            instruction_lengths.append(numpy.nan)
            exp_lengths.append(numpy.nan)
    data.loc[:,'total_time'] = exp_lengths
    data.loc[:,'instruct_time'] = instruction_lengths
    data.loc[:,'ontask_time'] = data['total_time'] - data['instruct_time']
def extract_experiment(data, exp_id, clean=True, apply_post=True, drop_columns=None, return_reject=False, clean_fun=clean_data):
    '''Returns a dataframe that has expanded the data column of the results
    object for the specified experiment. Each row of this new dataframe is a
    data row for the specified experiment.

    :data: the data from an expanalysis Result object
    :exp_id: a string identifying one experiment
    :param clean: boolean, if true clean the extracted data
    :param apply_post: boolean passed to clean_fun for data that has not
        been post processed
    :param drop_columns: list of columns to pass to the clean function
    :param return_reject: bool, default false. If true also returns a
        dataframe with the rejected (flagged) datasets; this is empty when
        data has no 'flagged' column
    :param clean_fun: an alternative "clean" function. Must return a
        dataframe of the cleaned data. Only used for data that has not been
        post processed (post processed rows are cleaned by extract_row)
    :return df: dataframe containing the extracted experiment, or the tuple
        (df, df_reject) if return_reject is true. NOTE: when every dataset
        is flagged the tuple (df, df_reject) is returned regardless of
        return_reject
    :raises ValueError: if the selected datasets mix process stages
    '''
    trial_index = []
    df = select_experiment(data, exp_id)
    if 'flagged' in df.columns:
        df_reject = df.query('flagged == True')
        df = df.query('flagged == False')
        if len(df) == 0:
            print('All %s datasets were flagged' % exp_id)
            return df, df_reject
    else:
        # no 'flagged' column means nothing was rejected; define df_reject
        # anyway so that return_reject=True cannot hit an unbound name
        df_reject = df.iloc[0:0]
    #report if there is more than one dataset for any battery/experiment/worker combination
    dataset_counts = df.groupby(['battery_name', 'experiment_exp_id', 'worker_id']).size()
    if (dataset_counts > 1).any():
        print(
            "More than one dataset found for at least one battery/worker/%s combination"
            % exp_id)
    stages = numpy.unique(df.get('process_stage'))
    if stages.size > 1:
        raise ValueError(
            "Datasets for %s mix different 'process_stage' values" % exp_id)
    if stages.size == 1 and stages[0] == 'post':
        frames = []
        for i, row in df.iterrows():
            tmp_df = extract_row(row, clean, False, drop_columns)
            if len(tmp_df) == 0:
                continue
            frames.append(tmp_df)
            # insert the zero padded results row label before the trailing
            # trial number: "<exp>_<trial>" becomes "<exp>_s<row>_<trial>"
            insert_i = tmp_df.index[0].rfind('_')
            trial_index += [
                x[:insert_i] + '_s%s' % str(i).zfill(3) + x[insert_i:]
                for x in tmp_df.index
            ]
        # concatenate once instead of growing a frame inside the loop
        df = pandas.concat(frames) if frames else pandas.DataFrame()
        df.index = trial_index
        df.sort_index(inplace=True)
    else:
        trial_list = []
        for i, row in df.iterrows():
            exp_data = get_data(row)
            # annotate every trial (in place) with the identifiers of its row
            for trial in exp_data:
                trial['battery_name'] = row['battery_name']
                trial['experiment_exp_id'] = row['experiment_exp_id']
                trial['worker_id'] = row['worker_id']
                trial['finishtime'] = row['finishtime']
            trial_list += exp_data
            trial_index += [
                "%s_%s_%s" % (exp_id, str(i).zfill(3), str(x).zfill(3))
                for x in range(len(exp_data))
            ]
        df = pandas.DataFrame(trial_list)
        df.index = trial_index
        if clean:
            df = clean_fun(df, exp_id, apply_post, drop_columns)
    if return_reject:
        return df, df_reject
    else:
        return df