def test_group_ungroup_does_not_change_data(self):
    """Check that grouping by ``user_id`` and then ungrouping is lossless.

    Every frame in ``test_dataframes`` is passed through
    ``util.group_data`` followed by ``util.ungroup_data``; the result must
    compare equal (``DataFrame.equals``) to the frame that went in.
    """
    failure_template = (
        " Grouped and ungrouped data: {} "
        "should be the same as original data: {} ")
    for original in test_dataframes:
        round_tripped = util.ungroup_data(
            util.group_data(original, 'user_id'))
        unchanged = original.equals(round_tripped)
        self.assertTrue(
            unchanged, failure_template.format(round_tripped, original))
def test_ungroup_group_does_not_change_data(self):
    """Check that ungrouping and then regrouping by ``user_id`` is lossless.

    Every frame in ``grouped_test_dataframes`` is passed through
    ``util.ungroup_data`` followed by ``util.group_data``; the result must
    compare equal (``DataFrame.equals``) to the frame that went in.
    """
    failure_template = (
        " Ungrouped and grouped data: {} "
        "should be the same as original data: {} ")
    for original in grouped_test_dataframes:
        round_tripped = util.group_data(
            util.ungroup_data(original), 'user_id')
        unchanged = original.equals(round_tripped)
        self.assertTrue(
            unchanged, failure_template.format(round_tripped, original))
def to_asc(data, filepath, student_col, skill_col, correct_col):
    """Write ``data`` to ``filepath`` as three text lines per grouped row.

    Every cell of ``data`` is converted with ``str`` and the result is grouped
    with ``utils.group_data(..., student_col)``. For each row of the grouped
    frame three lines are written:

    1. the number of attempts (length of the skill sequence),
    2. the comma-joined entries of ``skill_col``,
    3. the comma-joined entries of ``correct_col``.

    Args:
        data: pandas DataFrame holding the records to export.
        filepath: Path of the file to (over)write.
        student_col: Column name handed to ``utils.group_data`` as the
            grouping key.
        skill_col: Column whose grouped sequence forms the second line.
        correct_col: Column whose grouped sequence forms the third line.

    Raises:
        AssertionError: If the skill and correct sequences of a grouped row
            have different lengths. Rows written before the offending one
            remain in the file.
    """
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
    # old name is deprecated; prefer the new name and fall back for older
    # pandas versions that do not have DataFrame.map yet.
    elementwise = data.map if hasattr(data, 'map') else data.applymap
    # NOTE(review): group_data presumably yields one row per student with
    # list-like cells -- confirm against utils.group_data.
    grouped_data = utils.group_data(elementwise(str), student_col)

    with open(filepath, 'w') as f:
        for skill_ids, corrects in zip(grouped_data[skill_col],
                                       grouped_data[correct_col]):
            n_attempts = len(skill_ids)
            # Raised explicitly (instead of a bare ``assert``) so the check
            # is not stripped when Python runs with -O; the exception type
            # and message are unchanged for existing callers.
            if n_attempts != len(corrects):
                raise AssertionError(
                    "Skill id and correct sequence lengths do not match: "
                    "{} != {}".format(n_attempts, len(corrects)))
            f.write(str(n_attempts) + '\n')
            f.write(','.join(skill_ids) + '\n')
            f.write(','.join(corrects) + '\n')
def fiddle(args):
    """Read a dataset, optionally clean and shuffle it, print stats, save it.

    Steps, in order:

    1. Read the input with ``reader.read_kt_data``.
    2. Verify that all requested columns exist in the input.
    3. If ``args.clean`` is set, keep only the requested columns, drop rows
       containing NaN and run ``util.clean_data``.
    4. If ``args.shuffle`` is set, group by the user column, shuffle the
       grouped rows and ungroup again.
    5. Print statistics via ``show_stats`` and hand the result to
       ``save_data``.

    Args:
        args: Options object. When ``None`` it is built with
            ``arg_conf.Args(arg_conf.converter_args_map)``. Attributes read
            here: ``in_file``, ``in_format``, ``user_col``, ``correct_col``,
            ``skill_col``, ``exercise_col``, ``clean``, ``shuffle`` and
            ``stat_format``; the whole object is also passed to
            ``save_data``.

    Raises:
        ValueError: If any of the requested columns is not present in the
            input data.
    """
    if args is None:
        args = arg_conf.Args(arg_conf.converter_args_map)

    in_data = reader.read_kt_data(args.in_file, args.in_format)

    use_cols = [args.user_col, args.correct_col, args.skill_col]
    # Skip the exercise column when it is already listed (e.g. it is the
    # same as the skill column): selecting a label twice would produce
    # duplicate column names and break the per-column lookups downstream.
    if args.exercise_col is not None and args.exercise_col not in use_cols:
        use_cols.append(args.exercise_col)

    if any(use_col not in in_data.columns for use_col in use_cols):
        raise ValueError(
            " Invalid columns provided: skill col: {} correct col: {} "
            "user col: {} exercise col: {} Found columns: {} ".format(
                args.skill_col, args.correct_col, args.user_col,
                args.exercise_col, ', '.join(in_data.columns.values)))

    if args.clean:
        # Name every column that takes part in dropna(); the previous
        # three-placeholder template silently omitted the exercise column.
        listed_cols = '{} or {}'.format(
            ', '.join(str(col) for col in use_cols[:-1]), use_cols[-1])
        print('Removing rows with nan values in columns {}...'.format(
            listed_cols))
        print('Data rows before dropping nan rows: {}'.format(len(in_data)))
        out_data = in_data[use_cols].dropna()
        print('Data rows after dropping nan rows: {}'.format(len(out_data)))
        print(
            'Categorizing skill_column and ensuring correctness is a binary variable...'
        )
        out_data = util.clean_data(out_data,
                                   user_col=args.user_col,
                                   skill_col=args.skill_col,
                                   correct_col=args.correct_col)
    else:
        out_data = in_data

    if args.shuffle:
        # Shuffle whole grouped rows (sample(frac=1) permutes the rows of
        # the grouped frame) rather than individual records.
        grouped = util.group_data(out_data, args.user_col)
        out_data = util.ungroup_data(
            grouped.sample(frac=1).reset_index(drop=True))

    show_stats(out_data, args.stat_format, args.user_col, args.exercise_col,
               args.skill_col, args.correct_col)
    save_data(args, out_data)
def show_stats(data, format, student_col, exercise_col, skill_col, correct_col):
    """Build a one-row statistics table for ``data``, print it and return it.

    The table has the columns ``Max attempts``, ``Students``, ``Records``,
    ``Correct count``, ``Exercise tags`` and ``Skill tags``. ``Records`` and
    ``Correct count`` are rounded to thousands and stored as strings.
    ``Skill tags`` is ``'-'`` unless a distinct exercise column is given; when
    no exercise column is given, ``Exercise tags`` counts the skill column.

    Args:
        data: pandas DataFrame with the records.
        format: Output format, one of ``'json'``, ``'txt'``, ``'csv'`` or
            ``'tex'``.
        student_col: Column used as the grouping key for ``util.group_data``.
        exercise_col: Exercise column name, or ``None``.
        skill_col: Skill column name.
        correct_col: Column that is summed for ``Correct count``.

    Returns:
        The rendered statistics as a string (also printed to stdout).

    Raises:
        NotImplementedError: If ``format`` is not one of the supported values.
    """
    def in_thousands(value):
        # Round to the nearest thousand, then express as a count of thousands.
        thousands = round(value, -3) / 1000
        return f'{int(thousands)}'

    separate_exercises = exercise_col is not None and exercise_col != skill_col

    grouping_cols = [student_col, skill_col, correct_col]
    if separate_exercises:
        grouping_cols.append(exercise_col)
    per_student = util.group_data(data[grouping_cols].dropna(), student_col)

    max_attempts = per_student[student_col].apply(len).max()
    n_students = len(per_student)
    n_records = in_thousands(len(data))
    n_correct = in_thousands(data[correct_col].sum())

    if exercise_col is not None:
        n_exercise_tags = len(data[exercise_col].unique())
    else:
        n_exercise_tags = len(data[skill_col].unique())

    if separate_exercises:
        n_skill_tags = len(data[skill_col].unique())
    else:
        n_skill_tags = '-'

    stats_dict = {
        'Max attempts': max_attempts,
        'Students': n_students,
        'Records': n_records,
        'Correct count': n_correct,
        'Exercise tags': n_exercise_tags,
        'Skill tags': n_skill_tags,
    }
    stats = pd.DataFrame({label: [value] for label, value in stats_dict.items()})

    # Format name -> zero-argument renderer; checked in this order with ``==``.
    renderers = (
        ('json', lambda: stats.to_json()),
        ('txt', lambda: stats.to_string(index=False)),
        ('csv', lambda: stats.to_csv(index=False)),
        ('tex', lambda: stats.to_latex(index=False)),
    )
    for format_name, render in renderers:
        if format == format_name:
            stats_str = render()
            break
    else:
        raise NotImplementedError(
            f'Statistics format {format} is not implemented.')

    print(stats_str)
    return stats_str
# Paths of the fixture files inside the test data directory.
test_csv_filepath = os.path.join(
    test_data_dir, 'assistments2009-skill-builders-corrected_1000.csv')
test_simple_csv_filepath = os.path.join(test_data_dir, 'simple.csv')
test_asc_filepath = os.path.join(test_data_dir, 'test.asc')

# Ungrouped fixtures: one hand-written frame and one read from the CSV file.
data1 = pd.DataFrame({
    'user_id': [1, 2, 2, 2, 3, 3],
    'skill_id': [1, 1, 2, 1, 3, 4],
    'correct': [0, 1, 0, 0, 1, 0]
})
data2 = read_kt_data(test_csv_filepath, format='csv')
test_dataframes = [data1, data2]

# Grouped fixtures: one hand-written frame with list-valued cells and one
# produced by grouping the ASC fixture file by 'user_id'.
grouped_data1 = pd.DataFrame({
    'user_id': [[1, 1], [2, 2, 2]],
    'skill_id': [[1, 2], [3, 3, 4]],
    'correct': [[1, 1], [0, 0, 0]]
})
grouped_data2 = util.group_data(
    read_kt_data(test_asc_filepath, 'asc', 'user_id', 'skill_id', 'correct'),
    'user_id')
grouped_test_dataframes = [grouped_data1, grouped_data2]


def dict_to_obj(my_dict, name='X'):
    """Convert a (possibly nested) dict into a namedtuple instance.

    Keys become field names and ``name`` becomes the type name; in both,
    every ``'-'`` is replaced by ``'_'``. Values that are themselves dicts
    are converted recursively, using the default type name.

    Args:
        my_dict: Mapping with string keys.
        name: Type name for the generated namedtuple.

    Returns:
        A namedtuple instance holding the dict's values in key order.
    """
    def _dashes_to_underscores(text):
        return text.replace('-', '_')

    field_names = [_dashes_to_underscores(key) for key in my_dict.keys()]
    record_type = namedtuple(_dashes_to_underscores(name), field_names)

    field_values = []
    for value in my_dict.values():
        if isinstance(value, dict):
            value = dict_to_obj(value)
        field_values.append(value)
    return record_type(*field_values)