示例#1
0
def extract_row(row, clean = True, apply_post = True, drop_columns = None):
    '''Expand the data held by one results row into a trial-level dataframe.

    Rows whose 'process_stage' is 'post' are rebuilt from the stored
    'trialdata' / 'columns' / 'index' entries of row['data']; any other row
    is expanded through get_data and tagged with the row's identifying fields.
    In both cases the trial number at the end of each index label is
    zero-padded so that sorting the labels as strings keeps trial order.

    :param row: one row of a Results data dataframe
    :param clean: boolean, if True call clean_data on the extracted data
    :param apply_post: boolean passed to clean_data for rows that are not at
        the 'post' stage ('post' rows always pass False)
    :param drop_columns: list of columns to pass to clean_data
    :return df: dataframe containing the extracted experiment
    '''
    experiment = row['experiment_exp_id']
    if row.get('process_stage') == 'post':
        stored = row['data']
        # pad width = number of digits in the trial count
        width = len(str(len(stored['index'])))
        df = pandas.DataFrame(stored['trialdata'])
        df.columns = stored['columns']
        padded_labels = []
        for label in stored['index']:
            # split off the trailing trial number; everything before the
            # last underscore is kept verbatim
            prefix, _, number = label.rpartition('_')
            padded_labels.append(prefix + '_' + number.zfill(width))
        df.index = padded_labels
        df = df.sort_index()
        post_flag = False
    else:
        trials = get_data(row)
        row_fields = ('battery_name', 'experiment_exp_id', 'worker_id',
                      'finishtime')
        for trial in trials:
            for field in row_fields:
                trial[field] = row[field]
        df = pandas.DataFrame(trials)
        n_trials = len(trials)
        width = len(str(n_trials))
        df.index = ["%s_%s" % (experiment, str(number).zfill(width))
                    for number in range(n_trials)]
        post_flag = apply_post
    if clean == True:
        df = clean_data(df, row['experiment_exp_id'], post_flag, drop_columns)
    return df
def extract_row(row, clean = True, apply_post = True, drop_columns = None):
    '''Returns a dataframe that has expanded the data of one row of a results object
    :param row:  one row of a Results data dataframe
    :param clean: boolean, if true call clean_data on the data
    :param apply_post: boolean passed to clean_data for rows that are not at
        the 'post' process stage ('post' rows always pass False)
    :param drop_columns: list of columns to pass to clean_data
    :return df: dataframe containing the extracted experiment
    '''
    exp_id = row['experiment_exp_id']
    if row.get('process_stage') == 'post':
        df = pandas.DataFrame(row['data'])
        # Order trials by the integer after the last underscore of each index
        # label. The ordering is computed outside the frame so that a real
        # data column called 'sort' is neither overwritten nor dropped.
        trial_numbers = [int(label.split('_')[-1]) for label in df.index]
        order = sorted(range(len(trial_numbers)),
                       key=trial_numbers.__getitem__)
        df = df.iloc[order]
        if clean:
            df = clean_data(df, exp_id, False, drop_columns)
    else:
        exp_data = get_data(row)
        # tag every trial with the fields that identify the source row
        for trial in exp_data:
            trial['battery_name'] = row['battery_name']
            trial['experiment_exp_id'] = row['experiment_exp_id']
            trial['worker_id'] = row['worker_id']
            trial['finishtime'] = row['finishtime']
        df = pandas.DataFrame(exp_data)
        df.index = ["%s_%s" % (exp_id, x) for x in range(len(exp_data))]
        if clean:
            df = clean_data(df, exp_id, apply_post, drop_columns)
    return df
def extract_row(row, clean=True, apply_post=True, drop_columns=None):
    '''Returns a dataframe that has expanded the data of one row of a results object
    :param row:  one row of a Results data dataframe
    :param clean: boolean, if true call clean_data on the data
    :param apply_post: boolean passed to clean_data for rows that are not at
        the 'post' process stage ('post' rows always pass False)
    :param drop_columns: list of columns to pass to clean_data
    :return df: dataframe containing the extracted experiment
    '''
    exp_id = row['experiment_exp_id']
    if row.get('process_stage') == 'post':
        df = pandas.DataFrame(row['data'])
        # Sort rows by the trailing integer of each index label. The row
        # order is derived separately instead of through a temporary 'sort'
        # column, which would clobber a data column of the same name.
        suffixes = [int(label.split('_')[-1]) for label in df.index]
        positions = sorted(range(len(suffixes)), key=suffixes.__getitem__)
        df = df.iloc[positions]
        if clean:
            df = clean_data(df, exp_id, False, drop_columns)
    else:
        exp_data = get_data(row)
        # copy the row-level identifiers onto every trial
        for trial in exp_data:
            trial['battery_name'] = row['battery_name']
            trial['experiment_exp_id'] = row['experiment_exp_id']
            trial['worker_id'] = row['worker_id']
            trial['finishtime'] = row['finishtime']
        df = pandas.DataFrame(exp_data)
        df.index = ["%s_%s" % (exp_id, x) for x in range(len(exp_data))]
        if clean:
            df = clean_data(df, exp_id, apply_post, drop_columns)
    return df
示例#4
0
def calc_time_taken(data):
    '''Add total_time, instruct_time and ontask_time columns to data in place.

    For rows whose 'experiment_template' is 'jspsych' the trials are read
    with get_data; total_time is the 'time_elapsed' of the last trial and
    instruct_time is the sum of 'time_elapsed' over trials whose trial_id
    maps to 'instruction' through lookup_val. Both are divided by 1000.0
    (presumably milliseconds to seconds -- confirm against the data source).
    All other rows receive NaN. ontask_time is total_time - instruct_time.

    :param data: results dataframe; modified in place, nothing is returned
    '''
    totals = []
    instruction_totals = []
    for _, row in data.iterrows():
        if row['experiment_template'] != 'jspsych':
            instruction_totals.append(numpy.nan)
            totals.append(numpy.nan)
            continue
        trials = get_data(row)
        final_trial = trials[-1]
        # only the last trial is checked for a time elapsed variable
        assert 'time_elapsed' in final_trial.keys(), \
            '"time_elapsed" not found for at least one dataset in these results'
        # collect the elapsed times recorded on instruction trials
        instruction_times = []
        for trial in trials:
            if lookup_val(trial.get('trial_id')) == 'instruction':
                instruction_times.append(trial['time_elapsed'])
        instruction_total = numpy.sum(instruction_times)
        # the experiment length is the time elapsed on the last jsPsych trial
        experiment_total = final_trial['time_elapsed']
        instruction_totals.append(instruction_total / 1000.0)
        totals.append(experiment_total / 1000.0)
    data.loc[:, 'total_time'] = totals
    data.loc[:, 'instruct_time'] = instruction_totals
    data.loc[:, 'ontask_time'] = data['total_time'] - data['instruct_time']
示例#5
0
def get_post_task_responses(data):
    '''Add a post_task_responses column to data in place.

    For each row whose 'experiment_template' is 'jspsych', the second to
    last trial returned by get_data is inspected; if its trial_id is
    'post task questions' and it has a 'responses' entry, that entry is
    stored for the row. Every other row is left as NaN.

    :param data: results dataframe; modified in place, nothing is returned
    '''
    question_responses = [numpy.nan] * len(data)
    # iterrows yields index *labels*; enumerate supplies the list position so
    # dataframes with a filtered or non-integer index are handled correctly
    for position, (_, row) in enumerate(data.iterrows()):
        if row['experiment_template'] != 'jspsych':
            continue
        row_data = get_data(row)
        # a dataset with fewer than two trials has no second-to-last trial
        if len(row_data) < 2:
            continue
        candidate = row_data[-2]
        if candidate.get('trial_id') == 'post task questions' and \
                'responses' in candidate:
            question_responses[position] = candidate['responses']
    data.loc[:, 'post_task_responses'] = question_responses
def extract_experiment(data, exp_id, clean = True, apply_post = True, drop_columns = None, return_reject = False, clean_fun = clean_data):
    '''Returns a dataframe that has expanded the data column of the results object for the specified experiment.
    Each row of this new dataframe is a data row for the specified experiment.
    :param data: the data from an expanalysis Result object
    :param exp_id: a string identifying one experiment
    :param clean: boolean, if true clean the extracted data
    :param apply_post: boolean passed to clean_fun for data that is not at the
        'post' process stage ('post' data is extracted with apply_post False)
    :param drop_columns: list of columns to pass to the clean function
    :param return_reject: bool, default false. If true returns a dataframe with rejected experiments
    :param clean_fun: an alternative "clean" function. Must return a dataframe of the cleaned data.
        Only used for data that is not at the 'post' stage; 'post' data is cleaned inside extract_row
    :return df: dataframe containing the extracted experiment (a (df, df_reject) tuple
        if return_reject is true or if every dataset was flagged)
    '''
    trial_index = []
    df = select_experiment(data, exp_id)
    # Start from an empty reject frame so return_reject also works when the
    # data carries no 'flagged' column.
    df_reject = df.iloc[0:0]
    if 'flagged' in df.columns:
        df_reject = df.query('flagged == True')
        df = df.query('flagged == False')
        if len(df) == 0:
            print('All %s datasets were flagged' % exp_id)
            # NOTE(review): this early exit returns a tuple even when
            # return_reject is False; kept as-is for backward compatibility.
            return df,df_reject
    #ensure there is only one dataset for each battery/experiment/worker combination
    assert sum(df.groupby(['battery_name', 'experiment_exp_id', 'worker_id']).size()>1)==0, \
        "More than one dataset found for at least one battery/experiment/worker combination"
    if numpy.unique(df.get('process_stage'))=='post':
        # Collect the per-row frames and concatenate once instead of growing
        # a dataframe inside the loop.
        frames = []
        for i,row in df.iterrows():
            tmp_df = extract_row(row, clean, False, drop_columns)
            frames.append(tmp_df)
            # splice the results-row label in front of the trial number
            insert_i = tmp_df.index[0].rfind('_')
            trial_index += [x[:insert_i] + '_%s' % i + x[insert_i:] for x in tmp_df.index]
        df = pandas.concat(frames) if frames else pandas.DataFrame()
        df.index = trial_index
        # Sort by (results-row label, trial number). The order is computed
        # outside the frame so a data column named 'sort' is left untouched.
        sort_keys = [(int(parts[-2]), int(parts[-1]))
                     for parts in (label.split('_') for label in df.index)]
        order = sorted(range(len(sort_keys)), key=sort_keys.__getitem__)
        df = df.iloc[order]
    else:
        trial_list = []
        for i,row in df.iterrows():
            exp_data = get_data(row)
            # tag every trial with the fields that identify the source row
            for trial in exp_data:
                trial['battery_name'] = row['battery_name']
                trial['experiment_exp_id'] = row['experiment_exp_id']
                trial['worker_id'] = row['worker_id']
                trial['finishtime'] = row['finishtime']
            trial_list += exp_data
            trial_index += ["%s_%s_%s" % (exp_id,i,x) for x in range(len(exp_data))]
        df = pandas.DataFrame(trial_list)
        df.index = trial_index
        if clean:
            df = clean_fun(df, exp_id, apply_post, drop_columns)
    if return_reject:
        return df, df_reject
    return df
def get_post_task_responses(data):
    '''Add a post_task_responses column to data in place.

    For each row whose 'experiment_template' is 'jspsych', the second to
    last trial returned by get_data is inspected; if its trial_id is
    'post task questions' and it has a 'responses' entry, that entry is
    stored for the row. Every other row is left as NaN.

    :param data: results dataframe; modified in place, nothing is returned
    '''
    question_responses = [numpy.nan] * len(data)
    # The label yielded by iterrows is not a list position, so count rows
    # explicitly; this keeps filtered or string-indexed frames working.
    for position, (_, row) in enumerate(data.iterrows()):
        if row['experiment_template'] == 'jspsych':
            row_data = get_data(row)
            # guard against datasets too short to have a second-to-last trial
            if len(row_data) >= 2:
                penultimate = row_data[-2]
                if penultimate.get('trial_id') == 'post task questions' and \
                        'responses' in penultimate:
                    question_responses[position] = penultimate['responses']
    data.loc[:,'post_task_responses'] = question_responses

    


    
    
    
示例#8
0
def extract_row(row, clean=True, apply_post=True, drop_columns=None):
    '''Build a trial-level dataframe from a single row of a results object.

    A row at the 'post' process stage is reconstructed from the 'trialdata',
    'columns' and 'index' entries stored under row['data']; any other row is
    expanded with get_data and each trial is tagged with the row's
    identifying fields. Trial numbers in the index labels are zero-padded so
    a plain string sort keeps them in trial order.

    :param row: one row of a Results data dataframe
    :param clean: boolean, if True call clean_data on the extracted data
    :param apply_post: boolean passed to clean_data for rows that are not at
        the 'post' stage ('post' rows always pass False)
    :param drop_columns: list of columns to pass to clean_data
    :return df: dataframe containing the extracted experiment
    '''

    def pad_label(label, width):
        # zero-pad whatever follows the last underscore of the label
        head, _, trial_number = label.rpartition('_')
        return head + '_' + trial_number.zfill(width)

    exp_name = row['experiment_exp_id']
    already_post = row.get('process_stage') == 'post'
    if already_post:
        payload = row['data']
        digits = len(str(len(payload['index'])))
        df = pandas.DataFrame(payload['trialdata'])
        df.columns = payload['columns']
        df.index = [pad_label(label, digits) for label in payload['index']]
        df = df.sort_index()
    else:
        trial_records = get_data(row)
        for record in trial_records:
            record['battery_name'] = row['battery_name']
            record['experiment_exp_id'] = row['experiment_exp_id']
            record['worker_id'] = row['worker_id']
            record['finishtime'] = row['finishtime']
        df = pandas.DataFrame(trial_records)
        count = len(trial_records)
        digits = len(str(count))
        labels = []
        for number in range(count):
            labels.append("%s_%s" % (exp_name, str(number).zfill(digits)))
        df.index = labels
    if clean == True:
        # 'post' rows never re-apply post processing
        post_arg = False if already_post else apply_post
        df = clean_data(df, row['experiment_exp_id'], post_arg, drop_columns)
    return df
def calc_time_taken(data):
    '''Add total_time, instruct_time and ontask_time columns to data in place.

    Rows whose 'experiment_template' is 'jspsych' are expanded with get_data:
    total_time is the last trial's 'time_elapsed', instruct_time is the sum
    of 'time_elapsed' over trials whose trial_id maps to 'instruction'
    through lookup_val, and both are divided by 1000.0 (presumably
    milliseconds to seconds -- confirm against the data source). Other rows
    get NaN. ontask_time is total_time minus instruct_time.

    :param data: results dataframe; modified in place, nothing is returned
    '''

    def is_instruction(trial):
        return lookup_val(trial.get('trial_id')) == 'instruction'

    exp_seconds = []
    instruction_seconds = []
    for _, row in data.iterrows():
        if row['experiment_template'] == 'jspsych':
            trials = get_data(row)
            last_trial = trials[-1]
            # ensure the last trial carries a time elapsed variable
            assert 'time_elapsed' in last_trial.keys(), \
                '"time_elapsed" not found for at least one dataset in these results'
            # sum the elapsed times recorded on instruction trials
            elapsed_on_instructions = [
                trial['time_elapsed'] for trial in trials
                if is_instruction(trial)
            ]
            instruction_sum = numpy.sum(elapsed_on_instructions)
            # experiment length is the time elapsed on the last jsPsych trial
            last_elapsed = last_trial['time_elapsed']
            instruction_seconds.append(instruction_sum/1000.0)
            exp_seconds.append(last_elapsed/1000.0)
        else:
            instruction_seconds.append(numpy.nan)
            exp_seconds.append(numpy.nan)
    data.loc[:,'total_time'] = exp_seconds
    data.loc[:,'instruct_time'] = instruction_seconds
    data.loc[:,'ontask_time'] = data['total_time'] - data['instruct_time']
示例#10
0
def extract_experiment(data,
                       exp_id,
                       clean=True,
                       apply_post=True,
                       drop_columns=None,
                       return_reject=False,
                       clean_fun=clean_data):
    '''Returns a dataframe that has expanded the data column of the results object for the specified experiment.
    Each row of this new dataframe is a data row for the specified experiment.
    :param data: the data from an expanalysis Result object
    :param exp_id: a string identifying one experiment
    :param clean: boolean, if true clean the extracted data
    :param apply_post: boolean passed to clean_fun for data that is not at the
        'post' process stage ('post' data is extracted with apply_post False)
    :param drop_columns: list of columns to pass to the clean function
    :param return_reject: bool, default false. If true returns a dataframe with rejected experiments
    :param clean_fun: an alternative "clean" function. Must return a dataframe of the cleaned data.
        Only used for data that is not at the 'post' stage; 'post' data is cleaned inside extract_row
    :return df: dataframe containing the extracted experiment (a (df, df_reject) tuple
        if return_reject is true or if every dataset was flagged)
    '''
    trial_index = []
    df = select_experiment(data, exp_id)
    # Start from an empty reject frame so return_reject also works when the
    # data carries no 'flagged' column.
    df_reject = df.iloc[0:0]
    if 'flagged' in df.columns:
        df_reject = df.query('flagged == True')
        df = df.query('flagged == False')
        if len(df) == 0:
            print('All %s datasets were flagged' % exp_id)
            # NOTE(review): this early exit returns a tuple even when
            # return_reject is False; kept as-is for backward compatibility.
            return df, df_reject
    #report if more than one dataset exists for any battery/experiment/worker combination
    dataset_counts = df.groupby(
        ['battery_name', 'experiment_exp_id', 'worker_id']).size()
    if sum(dataset_counts > 1) != 0:
        print(
            "More than one dataset found for at least one battery/worker/%s combination"
            % exp_id)
    if numpy.unique(df.get('process_stage')) == 'post':
        # Collect the per-row frames and concatenate once instead of growing
        # a dataframe inside the loop.
        frames = []
        for i, row in df.iterrows():
            tmp_df = extract_row(row, clean, False, drop_columns)
            frames.append(tmp_df)
            # splice a zero-padded 's<row label>' tag in front of the trial
            # number so the combined index sorts by dataset, then by trial
            insert_i = tmp_df.index[0].rfind('_')
            trial_index += [
                x[:insert_i] + '_s%s' % str(i).zfill(3) + x[insert_i:]
                for x in tmp_df.index
            ]
        df = pandas.concat(frames) if frames else pandas.DataFrame()
        df.index = trial_index
        df.sort_index(inplace=True)
    else:
        trial_list = []
        for i, row in df.iterrows():
            exp_data = get_data(row)
            # tag every trial with the fields that identify the source row
            for trial in exp_data:
                trial['battery_name'] = row['battery_name']
                trial['experiment_exp_id'] = row['experiment_exp_id']
                trial['worker_id'] = row['worker_id']
                trial['finishtime'] = row['finishtime']
            trial_list += exp_data
            trial_index += [
                "%s_%s_%s" % (exp_id, str(i).zfill(3), str(x).zfill(3))
                for x in range(len(exp_data))
            ]
        df = pandas.DataFrame(trial_list)
        df.index = trial_index
        if clean:
            df = clean_fun(df, exp_id, apply_post, drop_columns)
    if return_reject:
        return df, df_reject
    return df