def prepare_public_data(dest):
    """Export an anonymized public data set (answers + feedback) to *dest*.

    Reads the cached raw answers plus three local CSV files
    (./flashcards.csv, ./ratings.csv, ./ip_address.csv), replaces IP
    addresses by country + numeric id, normalizes term types, and writes
    ``<dest>/answers.csv`` and ``<dest>/feedback.csv``.
    """
    if not os.path.exists(dest):
        os.makedirs(dest)
    # Raw answers restricted to the columns that are safe to publish.
    answers = pa.get_raw_data('answers', load_data, 'experiment_cache', answer_limit=1)[[
        'id', 'time', 'response_time', 'item_answered_id', 'item_asked_id',
        'user_id', 'guess', 'metainfo_id', 'direction', 'experiment_setup_name',
        'context_name', 'session_id'
    ]]
    flashcards = pandas.read_csv('./flashcards.csv', index_col=False)
    feedback = pandas.read_csv('./ratings.csv', index_col=False)
    # NOTE(review): 'sesion_id' (sic) is presumably the actual column name in
    # ip_address.csv — confirm against the export that produces the file.
    session_ip = pandas.read_csv('./ip_address.csv', index_col=False)[['sesion_id', 'ip_address']]
    session_ip['session_id'] = session_ip['sesion_id']
    session_ip = session_ip[['session_id', 'ip_address']]
    session_ip['ip_country'] = session_ip['ip_address'].apply(get_country)
    # Anonymize: map each distinct IP address to a small integer id (1..n);
    # missing addresses stay NaN.
    ips = session_ip['ip_address'].unique()
    ip_ids = dict(zip(ips, range(1, len(ips) + 1)))
    ip_ids[numpy.nan] = numpy.nan
    session_ip['ip_id'] = session_ip['ip_address'].apply(lambda i: ip_ids[i])
    session_ip = session_ip[['session_id', 'ip_country', 'ip_id']]
    answers = pandas.merge(answers, session_ip, on='session_id', how='inner')
    # guess == 1/number_of_options for multiple-choice questions, 0 for open
    # answers; invert it back to the option count (0 means "no options shown").
    answers['options'] = answers['guess'].apply(lambda g: 0 if g == 0 else int(round(1 / g)))
    # metainfo_id == 1 marks reference (test) answers.
    answers['reference'] = answers['metainfo_id'] == 1
    answers['condition'] = answers['experiment_setup_name']
    # Attach human-readable names of the asked and the answered item.
    answers = pandas.merge(answers, flashcards[['item_id', 'term_name']], left_on='item_asked_id', right_on='item_id', how='inner')
    answers['term_asked_name'] = answers['term_name']
    del answers['term_name']
    answers = pandas.merge(answers, flashcards[['item_id', 'term_name']], left_on='item_answered_id', right_on='item_id', how='inner')
    answers['term_answered_name'] = answers['term_name']
    del answers['term_name']
    # Collapse country-specific administrative-unit types into a single
    # 'region' type ('autonomous_Comunity' spelling matches the source data).
    term_type_trans = {
        'region_cz': 'region',
        'bundesland': 'region',
        'region_it': 'region',
        'autonomous_Comunity': 'region',
        'province': 'region'
    }
    term_type_dict = flashcards.set_index('item_id')['term_type'].to_dict()
    # US states are treated as regions as well.
    us_state_items = flashcards[(flashcards['context_name'] == 'United States') & (flashcards['term_type'] ==
        'state')]['item_id'].unique()
    flashcards['term_type'] = flashcards['item_id'].apply(
        lambda i: term_type_trans.get(term_type_dict[i], term_type_dict[i]) if i not in us_state_items else 'region'
    )
    answers = pandas.merge(answers, flashcards[['item_id', 'term_type']], left_on='item_asked_id', right_on='item_id', how='inner')
    # Final public column selection (drops guess, metainfo_id, session_id, ...).
    answers = answers[[
        'id', 'time', 'response_time', 'item_answered_id', 'item_asked_id',
        'user_id', 'options', 'reference', 'direction', 'condition',
        'context_name', 'term_type', 'term_asked_name', 'term_answered_name',
        'ip_country', 'ip_id'
    ]]
    answers.to_csv('{}/answers.csv'.format(dest), index=False)
    # Keep only feedback from users that appear in the published answers and
    # translate the numeric difficulty rating to a label.
    feedback = feedback[feedback['user_id'].isin(answers['user_id'].unique())]
    feedback_values = {1: 'easy', 2: 'appropriate', 3: 'difficult'}
    feedback['value'] = feedback['value'].apply(lambda f: feedback_values[f])
    feedback.to_csv('{}/feedback.csv'.format(dest), index=False)
def compute_experiment_data(term_type=None, term_name=None, context_name=None, answer_limit=10,
                            curve_length=5, progress_length=60, filter_invalid_tests=True,
                            with_confidence=False, keys=None, contexts=False,
                            filter_invalid_response_time=True, school=None,
                            bootstrap_samples=100, density_length=100):
    """Load cached experiment answers and compute per-condition statistics.

    The raw answers are loaded through ``pa.get_raw_data`` and optionally
    filtered by term type/name, context name and school.  Each statistic is
    computed separately per ``experiment_setup_name`` (the A/B condition).

    Args:
        term_type: term type (or list of types) used to filter the answers.
        term_name: term name (or list of names) used to filter the answers.
        context_name: context name (or list of names) used to filter the answers.
        answer_limit: minimal number of answers per user (passed to the loader).
        curve_length: number of points of the learning/response-time curves.
        progress_length: number of points of the progress-based statistics.
        filter_invalid_tests: drop data from invalid test sessions (loader flag).
        with_confidence: compute confidence intervals for milestone progress.
        keys: if given, only statistics whose name is in this collection are
            computed; ``None`` means "compute everything".
        contexts: also compute statistics per (context, term type) pair.
        filter_invalid_response_time: drop answers with invalid response time
            (loader flag).
        school: if given, restrict the data to rows with this 'school' value.
        bootstrap_samples: number of bootstrap samples for curve fitting.
        density_length: number of points for weibull/answer-density statistics.

    Returns:
        dict with key 'all' (statistics over the whole filtered data set) and,
        when ``contexts`` is True, key 'contexts' mapping
        "<context name>, <term type>" to the same statistics restricted to
        that group.
    """
    def _wanted(key):
        # A statistic is computed when there is no key filter or it is requested.
        return keys is None or key in keys

    # Rolling-success statistics are expensive, so ask the loader for them
    # only when they are actually requested.
    compute_rolling_success = _wanted('output_rolling_success') or _wanted('stay_on_rolling_success')
    data = pa.get_raw_data(
        'answers', datalib.load_data, 'experiment_cache',
        answer_limit=answer_limit,
        filter_invalid_tests=filter_invalid_tests,
        filter_invalid_response_time=filter_invalid_response_time,
        rolling_success=compute_rolling_success
    )

    def _to_list(value):
        # Allow both a scalar and a list for the filter parameters.
        return value if isinstance(value, list) else [value]

    if term_type is not None:
        data = data[data['term_type'].isin(_to_list(term_type))]
    if term_name is not None:
        data = data[data['term_name'].isin(_to_list(term_name))]
    if context_name is not None:
        data = data[data['context_name'].isin(_to_list(context_name))]
    if school is not None:
        data = data[data['school'] == school]

    def _group_experiment_data(data_all, extended=False):
        # metainfo_id == 1 marks reference answers; the curve statistics use
        # only those, the *_all groupings use every answer.
        data = data_all[data_all['metainfo_id'] == 1]
        groupped = data.groupby('experiment_setup_name')
        groupped_all = data_all.groupby('experiment_setup_name')
        result = {
            'meta': groupped_all.apply(meta).to_dict(),
            'meta_all': meta(data_all),
        }
        if _wanted('learning_points'):
            result['learning_points'] = groupped.apply(
                lambda g: learning_points(g, length=curve_length)).to_dict()
        if _wanted('error_hist'):
            # Histogram of per-user error rates; list(...) keeps this working
            # on Python 3, where dict.values() is a view, not a sequence.
            result['error_hist'] = groupped_all.apply(
                lambda group: _histogram(
                    list(group.groupby('user_id').apply(
                        lambda g: (g['item_asked_id'] != g['item_answered_id']).mean()
                    ).to_dict().values()),
                    bins=numpy.arange(0, 1.1, step=0.1))).to_dict()
        if _wanted('learning_points_all'):
            result['learning_points_all'] = learning_points(data, length=curve_length)
        if _wanted('learning_curve_all'):
            result['learning_curve_all'] = groupped.apply(
                lambda g: learning_curve(g, length=curve_length)).to_dict()
        if _wanted('learning_curve_all_reverse'):
            result['learning_curve_all_reverse'] = groupped.apply(
                lambda g: learning_curve(g, length=curve_length, reverse=True)).to_dict()
        if _wanted('learning_curve'):
            result['learning_curve'] = groupped.apply(
                lambda g: learning_curve(g, length=curve_length, user_length=curve_length)).to_dict()
        if _wanted('learning_curve_reverse'):
            result['learning_curve_reverse'] = groupped.apply(
                lambda g: learning_curve(g, length=curve_length, user_length=curve_length, reverse=True)).to_dict()
        if _wanted('response_time_curve_correct_all'):
            result['response_time_curve_correct_all'] = groupped.apply(
                lambda g: response_time_curve(g, length=curve_length)).to_dict()
        if _wanted('response_time_curve_correct'):
            result['response_time_curve_correct'] = groupped.apply(
                lambda g: response_time_curve(g, length=curve_length, user_length=curve_length)).to_dict()
        if _wanted('response_time_curve_wrong_all'):
            result['response_time_curve_wrong_all'] = groupped.apply(
                lambda g: response_time_curve(g, length=curve_length, correctness=False)).to_dict()
        if _wanted('response_time_curve_wrong'):
            result['response_time_curve_wrong'] = groupped.apply(
                lambda g: response_time_curve(g, length=curve_length, user_length=curve_length, correctness=False)).to_dict()
        if _wanted('test_questions_hist'):
            result['test_questions_hist'] = groupped_all.apply(
                lambda g: test_questions(g, length=progress_length)).to_dict()
        if _wanted('attrition_bias'):
            result['attrition_bias'] = groupped.apply(
                lambda g: attrition_bias(g, curve_length)).to_dict()
        if _wanted('learning_curve_fit_all'):
            result['learning_curve_fit_all'] = groupped.apply(
                lambda g: fit_learning_curve(g, length=curve_length, bootstrap_samples=bootstrap_samples)).to_dict()
        if _wanted('learning_curve_fit'):
            result['learning_curve_fit'] = groupped.apply(
                lambda g: fit_learning_curve(g, length=curve_length, user_length=curve_length, bootstrap_samples=bootstrap_samples)).to_dict()
        if extended:
            # NOTE(review): the statistics below are computed only for the
            # top-level 'all' aggregation, not for per-context groups.
            if _wanted('response_time_curve_global'):
                result['response_time_curve_global'] = groupped_all.apply(
                    lambda g: response_time_curve(g, length=progress_length, correctness=None)).to_dict()
            if _wanted('learning_curve_global'):
                result['learning_curve_global'] = groupped_all.apply(
                    lambda g: learning_curve(g, length=progress_length)).to_dict()
            if _wanted('learning_curve_fit_reverse'):
                result['learning_curve_fit_reverse'] = groupped.apply(
                    lambda g: fit_learning_curve(g, length=curve_length, user_length=curve_length, reverse=True, bootstrap_samples=bootstrap_samples)).to_dict()
            if _wanted('progress'):
                result['progress'] = groupped_all.apply(
                    lambda g: progress(g, length=progress_length)).to_dict()
            if _wanted('progress_milestones'):
                # // keeps the milestone length integral; the original "/"
                # relied on Python 2 integer division.
                result['progress_milestones'] = groupped_all.apply(
                    lambda g: milestone_progress(g, length=progress_length // 10, with_confidence=with_confidence)).to_dict()
            if _wanted('returning'):
                result['returning'] = groupped_all.apply(lambda g: returning(g)).to_dict()
            if _wanted('stay_on_rolling_success'):
                result['stay_on_rolling_success'] = groupped_all.apply(
                    lambda g: stay_on_rolling_success(g)).to_dict()
            if _wanted('output_rolling_success'):
                result['output_rolling_success'] = groupped_all.apply(
                    lambda g: output_rolling_success(g)).to_dict()
            if _wanted('weibull'):
                result['weibull'] = groupped_all.apply(
                    lambda g: fit_weibull(g, density_length)).to_dict()
            if _wanted('answers_density'):
                result['answers_density'] = groupped_all.apply(
                    lambda g: answers_density(g, density_length)).to_dict()
        return result

    result = {
        'all': _group_experiment_data(data, extended=True),
    }
    if contexts:
        # .items() instead of the Python-2-only .iteritems(); the loop names
        # deliberately avoid shadowing the context_name/term_type parameters.
        result['contexts'] = {
            '{}, {}'.format(ctx, ttype): value
            for (ctx, ttype), value in data.groupby(['context_name', 'term_type']).apply(_group_experiment_data).to_dict().items()
        }
    return result