Example #1
0
def prepare_public_data(dest):
    """Export anonymized experiment data (answers and feedback) as CSV files.

    Reads raw answers through the project cache together with the local
    ``flashcards.csv``, ``ratings.csv`` and ``ip_address.csv`` files,
    anonymizes IP addresses (replaced by a country code and a synthetic
    numeric id), attaches readable names for asked/answered terms,
    normalizes term types, and writes ``answers.csv`` and ``feedback.csv``
    into the ``dest`` directory.

    :param dest: destination directory; created if it does not exist
    """
    if not os.path.exists(dest):
        os.makedirs(dest)
    answers = pa.get_raw_data('answers', load_data, 'experiment_cache', answer_limit=1)[[
        'id', 'time', 'response_time', 'item_answered_id', 'item_asked_id', 'user_id',
        'guess', 'metainfo_id', 'direction', 'experiment_setup_name', 'context_name', 'session_id'
    ]]
    flashcards = pandas.read_csv('./flashcards.csv', index_col=False)
    feedback = pandas.read_csv('./ratings.csv', index_col=False)
    # NOTE: 'sesion_id' (sic) is the column name actually present in the CSV.
    session_ip = pandas.read_csv('./ip_address.csv', index_col=False)[['sesion_id', 'ip_address']]

    # Fix the misspelled column name, then anonymize IP addresses: each
    # unique address is replaced by its country and a synthetic numeric id.
    session_ip = session_ip.rename(columns={'sesion_id': 'session_id'})
    session_ip['ip_country'] = session_ip['ip_address'].apply(get_country)
    ips = session_ip['ip_address'].unique()
    ip_ids = dict(zip(ips, range(1, len(ips) + 1)))
    ip_ids[numpy.nan] = numpy.nan  # keep missing addresses missing
    session_ip['ip_id'] = session_ip['ip_address'].apply(lambda i: ip_ids[i])
    session_ip = session_ip[['session_id', 'ip_country', 'ip_id']]
    answers = pandas.merge(answers, session_ip, on='session_id', how='inner')

    # guess == 0 means an open question (no options); otherwise the number of
    # options is the reciprocal of the guess probability.
    answers['options'] = answers['guess'].apply(lambda g: 0 if g == 0 else int(round(1 / g)))
    answers['reference'] = answers['metainfo_id'] == 1
    answers['condition'] = answers['experiment_setup_name']
    # Attach human-readable names for both the asked and the answered term.
    answers = pandas.merge(answers, flashcards[['item_id', 'term_name']], left_on='item_asked_id', right_on='item_id', how='inner')
    answers = answers.rename(columns={'term_name': 'term_asked_name'})
    answers = pandas.merge(answers, flashcards[['item_id', 'term_name']], left_on='item_answered_id', right_on='item_id', how='inner')
    answers = answers.rename(columns={'term_name': 'term_answered_name'})

    # Collapse country-specific administrative-unit types into 'region'.
    term_type_trans = {
        'region_cz': 'region',
        'bundesland': 'region',
        'region_it': 'region',
        'autonomous_Comunity': 'region',
        'province': 'region'
    }

    term_type_dict = flashcards.set_index('item_id')['term_type'].to_dict()
    # US states are also treated as regions; use a set for O(1) membership
    # tests instead of scanning a numpy array once per flashcard.
    us_state_items = set(flashcards[(flashcards['context_name'] == 'United States') & (flashcards['term_type'] == 'state')]['item_id'].unique())
    flashcards['term_type'] = flashcards['item_id'].apply(
        lambda i: 'region' if i in us_state_items else term_type_trans.get(term_type_dict[i], term_type_dict[i])
    )
    answers = pandas.merge(answers, flashcards[['item_id', 'term_type']], left_on='item_asked_id', right_on='item_id', how='inner')

    answers = answers[[
        'id', 'time', 'response_time', 'item_answered_id', 'item_asked_id', 'user_id',
        'options', 'reference', 'direction', 'condition', 'context_name',
        'term_type', 'term_asked_name', 'term_answered_name', 'ip_country', 'ip_id'
        ]]
    answers.to_csv('{}/answers.csv'.format(dest), index=False)

    # Export only feedback from users who actually appear in the answers;
    # copy the filtered frame to avoid pandas' SettingWithCopyWarning on
    # the subsequent column assignment.
    feedback = feedback[feedback['user_id'].isin(answers['user_id'].unique())].copy()
    feedback_values = {1: 'easy', 2: 'appropriate', 3: 'difficult'}
    feedback['value'] = feedback['value'].apply(lambda f: feedback_values[f])
    feedback.to_csv('{}/feedback.csv'.format(dest), index=False)
def compute_experiment_data(term_type=None, term_name=None, context_name=None, answer_limit=10, curve_length=5, progress_length=60, filter_invalid_tests=True, with_confidence=False, keys=None, contexts=False, filter_invalid_response_time=True, school=None, bootstrap_samples=100, density_length=100):
    """Compute per-condition experiment statistics from the raw answer data.

    Loads answers through the project cache, optionally filters them by term
    type, term name, context name and school, and aggregates a dictionary of
    statistics (learning curves, response-time curves, histograms, progress,
    returning rates, ...) grouped by ``experiment_setup_name``.

    :param term_type: term type (or list of them) to keep, or None for all
    :param term_name: term name (or list of them) to keep, or None for all
    :param context_name: context name (or list of them) to keep, or None for all
    :param answer_limit: minimal number of answers per user (passed to loader)
    :param curve_length: number of points on the learning/response-time curves
    :param progress_length: number of points on the progress-style plots
    :param filter_invalid_tests: drop users with invalid test data
    :param with_confidence: compute confidence intervals for milestone progress
    :param keys: iterable of result keys to compute, or None for all of them
    :param contexts: also compute the statistics per (context, term type)
    :param filter_invalid_response_time: drop answers with invalid response time
    :param school: if given, keep only answers with this 'school' flag
    :param bootstrap_samples: number of bootstrap samples for curve fitting
    :param density_length: number of points for density/Weibull estimates
    :return: dict with key 'all' (and optionally 'contexts')
    """
    # Rolling success is expensive; compute it only when some requested key
    # actually needs it.
    compute_rolling_success = keys is None or len(set(keys) & {'output_rolling_success', 'stay_on_rolling_success'}) != 0
    data = pa.get_raw_data('answers', datalib.load_data, 'experiment_cache',
        answer_limit=answer_limit, filter_invalid_tests=filter_invalid_tests,
        filter_invalid_response_time=filter_invalid_response_time, rolling_success=compute_rolling_success
    )
    if term_type is not None:
        if not isinstance(term_type, list):
            term_type = [term_type]
        data = data[data['term_type'].isin(term_type)]
    if term_name is not None:
        if not isinstance(term_name, list):
            term_name = [term_name]
        data = data[data['term_name'].isin(term_name)]
    if context_name is not None:
        if not isinstance(context_name, list):
            context_name = [context_name]
        data = data[data['context_name'].isin(context_name)]
    if school is not None:
        data = data[data['school'] == school]

    def _group_experiment_data(data_all, extended=False):
        # Reference answers (metainfo_id == 1) are used for the learning and
        # response-time curves; the full data for the global statistics.
        data = data_all[data_all['metainfo_id'] == 1]
        groupped = data.groupby('experiment_setup_name')
        groupped_all = data_all.groupby('experiment_setup_name')
        result = {
            'meta': groupped_all.apply(meta).to_dict(),
            'meta_all': meta(data_all),
        }
        # Each statistic is computed only when requested (keys is None means
        # "compute everything").
        if keys is None or 'learning_points' in keys:
            result['learning_points'] = groupped.apply(lambda g: learning_points(g, length=curve_length)).to_dict()
        if keys is None or 'error_hist' in keys:
            result['error_hist'] = groupped_all.apply(
                lambda group: _histogram(group.groupby('user_id').apply(lambda g: (g['item_asked_id'] != g['item_answered_id']).mean()).to_dict().values(), bins=numpy.arange(0, 1.1, step=0.1))
            ).to_dict()
        if keys is None or 'learning_points_all' in keys:
            result['learning_points_all'] = learning_points(data, length=curve_length)
        if keys is None or 'learning_curve_all' in keys:
            result['learning_curve_all'] = groupped.apply(lambda g: learning_curve(g, length=curve_length)).to_dict()
        if keys is None or 'learning_curve_all_reverse' in keys:
            result['learning_curve_all_reverse'] = groupped.apply(lambda g: learning_curve(g, length=curve_length, reverse=True)).to_dict()
        if keys is None or 'learning_curve' in keys:
            result['learning_curve'] = groupped.apply(lambda g: learning_curve(g, length=curve_length, user_length=curve_length)).to_dict()
        if keys is None or 'learning_curve_reverse' in keys:
            result['learning_curve_reverse'] = groupped.apply(lambda g: learning_curve(g, length=curve_length, user_length=curve_length, reverse=True)).to_dict()
        if keys is None or 'response_time_curve_correct_all' in keys:
            result['response_time_curve_correct_all'] = groupped.apply(lambda g: response_time_curve(g, length=curve_length)).to_dict()
        if keys is None or 'response_time_curve_correct' in keys:
            result['response_time_curve_correct'] = groupped.apply(lambda g: response_time_curve(g, length=curve_length, user_length=curve_length)).to_dict()
        if keys is None or 'response_time_curve_wrong_all' in keys:
            result['response_time_curve_wrong_all'] = groupped.apply(lambda g: response_time_curve(g, length=curve_length, correctness=False)).to_dict()
        if keys is None or 'response_time_curve_wrong' in keys:
            result['response_time_curve_wrong'] = groupped.apply(lambda g: response_time_curve(g, length=curve_length, user_length=curve_length, correctness=False)).to_dict()
        if keys is None or 'test_questions_hist' in keys:
            result['test_questions_hist'] = groupped_all.apply(lambda g: test_questions(g, length=progress_length)).to_dict()
        if keys is None or 'attrition_bias' in keys:
            result['attrition_bias'] = groupped.apply(lambda g: attrition_bias(g, curve_length)).to_dict()
        if keys is None or 'learning_curve_fit_all' in keys:
            result['learning_curve_fit_all'] = groupped.apply(lambda g: fit_learning_curve(g, length=curve_length, bootstrap_samples=bootstrap_samples)).to_dict()
        if keys is None or 'learning_curve_fit' in keys:
            result['learning_curve_fit'] = groupped.apply(lambda g: fit_learning_curve(g, length=curve_length, user_length=curve_length, bootstrap_samples=bootstrap_samples)).to_dict()

        # The extended statistics are computed only for the whole data set,
        # not for the per-context breakdown.
        if extended:
            if keys is None or 'response_time_curve_global' in keys:
                result['response_time_curve_global'] = groupped_all.apply(lambda g: response_time_curve(g, length=progress_length, correctness=None)).to_dict()
            if keys is None or 'learning_curve_global' in keys:
                result['learning_curve_global'] = groupped_all.apply(lambda g: learning_curve(g, length=progress_length)).to_dict()
            if keys is None or 'learning_curve_fit_reverse' in keys:
                result['learning_curve_fit_reverse'] = groupped.apply(lambda g: fit_learning_curve(g, length=curve_length, user_length=curve_length, reverse=True, bootstrap_samples=bootstrap_samples)).to_dict()
            if keys is None or 'progress' in keys:
                result['progress'] = groupped_all.apply(lambda g: progress(g, length=progress_length)).to_dict()
            if keys is None or 'progress_milestones' in keys:
                # Floor division keeps the length an int on Python 3 as well
                # (plain '/' would silently turn it into a float there).
                result['progress_milestones'] = groupped_all.apply(lambda g: milestone_progress(g, length=progress_length // 10, with_confidence=with_confidence)).to_dict()
            if keys is None or 'returning' in keys:
                result['returning'] = groupped_all.apply(lambda g: returning(g)).to_dict()
            if keys is None or 'stay_on_rolling_success' in keys:
                result['stay_on_rolling_success'] = groupped_all.apply(lambda g: stay_on_rolling_success(g)).to_dict()
            if keys is None or 'output_rolling_success' in keys:
                result['output_rolling_success'] = groupped_all.apply(lambda g: output_rolling_success(g)).to_dict()

            if keys is None or 'weibull' in keys:
                result['weibull'] = groupped_all.apply(lambda g: fit_weibull(g, density_length)).to_dict()
            if keys is None or 'answers_density' in keys:
                result['answers_density'] = groupped_all.apply(lambda g: answers_density(g, density_length)).to_dict()
        return result
    result = {
        'all': _group_experiment_data(data, extended=True),
    }
    if contexts:
        # .items() (instead of the Python-2-only .iteritems()) keeps this
        # working on both Python 2 and Python 3.
        result['contexts'] = {
            '{}, {}'.format(context_name, term_type): value
            for ((context_name, term_type), value) in data.groupby(['context_name', 'term_type']).apply(_group_experiment_data).to_dict().items()
        }
    return result