예제 #1
0
    def test_group_ungroup_does_not_change_data(self):
        for data in test_dataframes:
            grouped = util.group_data(data, 'user_id')
            ungrouped = util.ungroup_data(grouped)
            self.assertTrue(
                data.equals(ungrouped), """
Grouped and ungrouped data:
{}
should be the same as original data:
{}
            """.format(ungrouped, data))
예제 #2
0
    def test_ungroup_group_does_not_change_data(self):
        for data in grouped_test_dataframes:
            ungrouped = util.ungroup_data(data)
            grouped = util.group_data(ungrouped, 'user_id')
            self.assertTrue(
                data.equals(grouped), """
Ungrouped and grouped data:
{}
should be the same as original data:
{}
            """.format(grouped, data))
예제 #3
0
def to_asc(data, filepath, student_col, skill_col, correct_col):
    """Write the skill and correctness sequences of ``data`` to ``filepath``.

    Every cell of ``data`` is converted with ``str`` and the result is grouped
    by ``student_col`` through ``utils.group_data``. For each paired entry of
    the grouped ``skill_col`` and ``correct_col`` columns, three lines are
    written: the sequence length, the comma-joined skill ids and the
    comma-joined correctness values.

    Args:
        data: DataFrame holding at least the three named columns.
        filepath: Destination path; opened in text write mode.
        student_col: Column to group by.
        skill_col: Column whose grouped values form the second line.
        correct_col: Column whose grouped values form the third line.

    Raises:
        AssertionError: If a skill sequence and its correctness sequence
            differ in length.
    """
    sequences = utils.group_data(data.applymap(str), student_col)
    with open(filepath, 'w') as out_file:
        paired = zip(sequences[skill_col], sequences[correct_col])
        for skills, answers in paired:
            n_attempts = len(skills)
            assert n_attempts == len(answers), \
                "Skill id and correct sequence lengths do not match: {} != {}" \
                .format(len(skills), len(answers))
            out_file.write(f'{n_attempts}\n')
            out_file.write(','.join(skills) + '\n')
            out_file.write(','.join(answers) + '\n')
예제 #4
0
def fiddle(args):
    """Read a dataset, optionally clean and shuffle it, print stats and save it.

    Args:
        args: Options object. When ``None``, one is created with
            ``arg_conf.Args(arg_conf.converter_args_map)``. The attributes
            read here are ``in_file``, ``in_format``, ``user_col``,
            ``correct_col``, ``skill_col``, ``exercise_col``, ``clean``,
            ``shuffle`` and ``stat_format``. The object is also passed on to
            ``save_data``.

    Raises:
        ValueError: If any of the selected columns is missing from the data
            that was read.
    """
    if args is None:
        args = arg_conf.Args(arg_conf.converter_args_map)

    in_data = reader.read_kt_data(args.in_file, args.in_format)

    use_cols = [args.user_col, args.correct_col, args.skill_col]
    # Do not select the exercise column a second time when it is the same
    # column as one already chosen (typically the skill column); a repeated
    # label would produce duplicated columns in ``in_data[use_cols]``.
    if args.exercise_col is not None and args.exercise_col not in use_cols:
        use_cols.append(args.exercise_col)

    if any(use_col not in in_data.columns for use_col in use_cols):
        # Column labels are passed through ``str`` so that non-string labels
        # cannot turn this ValueError into a TypeError from ``str.join``.
        raise ValueError("""
Invalid columns provided:
    skill col: {}
    correct col: {}
    user col: {}
    exercise col: {}
Found columns:
{}
""".format(args.skill_col, args.correct_col, args.user_col, args.exercise_col,
           ', '.join(map(str, in_data.columns))))

    if args.clean:
        print(
            'Removing rows with nan values in columns {}, {} or {}...'.format(
                *use_cols))
        print('Data rows before dropping nan rows: {}'.format(len(in_data)))
        out_data = in_data[use_cols].dropna()
        print('Data rows after dropping nan rows: {}'.format(len(out_data)))
        print(
            'Categorizing skill_column and ensuring correctness is a binary variable...'
        )
        out_data = util.clean_data(out_data,
                                   user_col=args.user_col,
                                   skill_col=args.skill_col,
                                   correct_col=args.correct_col)
    else:
        out_data = in_data

    if args.shuffle:
        # Shuffle the rows of the grouped frame (one sample of all rows, in
        # random order) and flatten it again, so rows that were grouped
        # together stay together.
        grouped = util.group_data(out_data, args.user_col)
        out_data = util.ungroup_data(
            grouped.sample(frac=1).reset_index(drop=True))

    show_stats(out_data, args.stat_format, args.user_col, args.exercise_col,
               args.skill_col, args.correct_col)

    save_data(args, out_data)
예제 #5
0
def show_stats(data, format, student_col, exercise_col, skill_col,
               correct_col):
    """Build a one-row statistics table for ``data``, print it and return it.

    The table has the columns ``Max attempts``, ``Students``, ``Records``,
    ``Correct count``, ``Exercise tags`` and ``Skill tags``. ``Records`` and
    ``Correct count`` are reported in thousands (rounded) as strings.
    ``Skill tags`` is ``'-'`` unless a separate exercise column is given.

    Args:
        data: DataFrame holding the named columns.
        format: Output format: ``'json'``, ``'txt'``, ``'csv'`` or ``'tex'``.
        student_col: Column to group by.
        exercise_col: Exercise column, or ``None`` when there is none.
        skill_col: Skill column.
        correct_col: Column that is summed for ``Correct count``.

    Returns:
        The rendered statistics string (also printed).

    Raises:
        NotImplementedError: If ``format`` is not one of the supported values.
    """
    def in_thousands(value):
        # Round to the nearest thousand and render the number of thousands.
        return str(int(round(value, -3) / 1000))

    has_exercise_col = exercise_col is not None

    columns = [student_col, skill_col, correct_col]
    if has_exercise_col and exercise_col != skill_col:
        columns.append(exercise_col)

    # Grouping is done on rows that are complete in the selected columns.
    # Presumably each grouped cell holds one student's sequence -- verify
    # against util.group_data.
    per_student = util.group_data(data[columns].dropna(), student_col)

    max_attempts = per_student[student_col].apply(len).max()
    n_students = len(per_student)
    n_records = in_thousands(len(data))
    n_correct = in_thousands(data[correct_col].sum())
    if has_exercise_col:
        n_exercise_tags = len(data[exercise_col].unique())
    else:
        n_exercise_tags = len(data[skill_col].unique())
    if has_exercise_col and skill_col != exercise_col:
        n_skill_tags = len(data[skill_col].unique())
    else:
        n_skill_tags = '-'

    stats = pd.DataFrame({
        'Max attempts': [max_attempts],
        'Students': [n_students],
        'Records': [n_records],
        'Correct count': [n_correct],
        'Exercise tags': [n_exercise_tags],
        'Skill tags': [n_skill_tags],
    })

    # Ordered (format name, renderer) pairs; matched with ``==`` like an
    # if/elif chain.
    renderers = (
        ('json', lambda: stats.to_json()),
        ('txt', lambda: stats.to_string(index=False)),
        ('csv', lambda: stats.to_csv(index=False)),
        ('tex', lambda: stats.to_latex(index=False)),
    )
    for known_format, render in renderers:
        if format == known_format:
            stats_str = render()
            break
    else:
        raise NotImplementedError(
            f'Statistics format {format} is not implemented.')

    print(stats_str)
    return stats_str
# Paths of the fixture files, all located in ``test_data_dir``.
test_csv_filepath = os.path.join(
    test_data_dir,
    'assistments2009-skill-builders-corrected_1000.csv',
)
test_simple_csv_filepath = os.path.join(test_data_dir, 'simple.csv')
test_asc_filepath = os.path.join(test_data_dir, 'test.asc')

# Flat (one row per attempt) frames: a small hand-written one and one read
# from the CSV fixture.
data1 = pd.DataFrame(dict(
    user_id=[1, 2, 2, 2, 3, 3],
    skill_id=[1, 1, 2, 1, 3, 4],
    correct=[0, 1, 0, 0, 1, 0],
))
data2 = read_kt_data(test_csv_filepath, format='csv')
test_dataframes = [data1, data2]

# Grouped frames (each cell holds a list): a small hand-written one and one
# produced by grouping the .asc fixture by ``user_id``.
grouped_data1 = pd.DataFrame(dict(
    user_id=[[1, 1], [2, 2, 2]],
    skill_id=[[1, 2], [3, 3, 4]],
    correct=[[1, 1], [0, 0, 0]],
))
grouped_data2 = util.group_data(
    read_kt_data(test_asc_filepath, 'asc', 'user_id', 'skill_id', 'correct'),
    'user_id',
)
grouped_test_dataframes = [grouped_data1, grouped_data2]


def dict_to_obj(my_dict, name='X'):
    """Convert ``my_dict`` into a namedtuple instance, recursing into dict values.

    Hyphens in ``name`` and in the keys are replaced with underscores to form
    the type name and the field names. Values that are themselves dicts are
    converted the same way (using the default type name); all other values
    are stored unchanged.

    Args:
        my_dict: Dictionary with string keys.
        name: Name for the generated namedtuple type.

    Returns:
        An instance of the generated namedtuple type, with fields in the
        dictionary's key order.
    """
    type_name = name.replace('-', '_')
    field_names = [key.replace('-', '_') for key in my_dict]
    record_type = namedtuple(type_name, field_names)

    field_values = []
    for value in my_dict.values():
        if isinstance(value, dict):
            value = dict_to_obj(value)
        field_values.append(value)
    return record_type(*field_values)