예제 #1
0
def main():
    """Train one model per course (course1..course8) and write the stacked
    predictions for submission_s1 to a CSV file.

    Side effects: writes
    <project>/data/test_s1/submission_s1_sample_stack.csv.
    """
    result = []  # fixed typo: was misspelled 'reuslt'
    for i in ['1', '2', '3', '4', '5', '6', '7', '8']:
        # Training set: features plus target column 'score'.
        train_X = get_exam_score('exam_score', 'course' + i)
        train_y = train_X['score']
        del train_X['score']

        # Test set for the same course.
        test_X = get_submission_s1('submission_s1', 'course' + i)

        # Align train/test on the sorted union of columns;
        # features missing on one side are filled with 0.
        columns = sorted(set(train_X.columns) | set(test_X.columns))
        test_X = test_X.reindex(columns=columns).fillna(0)
        train_X = train_X.reindex(columns=columns).fillna(0)

        # Accumulate this course's predictions in submission order.
        result.extend(get_model(train_X, train_y, test_X).tolist())

    # NOTE(review): assumes the concatenated per-course predictions line up
    # row-for-row with the raw submission frame — confirm against loader.
    submit = load_data().get_test_s1('submission_s1', 'pd')
    submit['pred'] = result
    submit.to_csv(load_data().get_project_path() +
                  '/data/test_s1/submission_s1_sample_stack.csv',
                  index=None,
                  encoding='utf-8')
예제 #2
0
    def __init__(
        self,
        args,
        path,
        test=False,
        slide=False,
    ):
        """Load the newspaper split and configure character encoding.

        Args:
            args: config namespace; reads `char_len` and `character_encoder`.
            path: dict of resource paths (`ja_chars`, `font`, `newspaper`,
                `char2embedding`).
            test: when True, load the test split instead of train.
            slide: when True, use the sliding (test) processing function.
        """
        super().__init__(path["ja_chars"], path["font"])

        split = "test" if test else "train"
        data = load_data(path["newspaper"][split])
        self.data = data["data"]
        self.num_class = data["num_class"]

        self.char_len = args.char_len
        self.test = test
        self.slide = slide

        if args.character_encoder == "CAE":
            # Pre-trained character embeddings replace rendered glyphs.
            self.char2embedding = load_data(path["char2embedding"])
        else:
            # Make sure a glyph exists for the space character as well.
            space_img = self.char_to_font_img(" ")
            self.font_img_dict[" "] = self.resize_font_img(space_img)

        # Sliding mode shares the test-time processing path.
        self.process = self.test_process if self.slide else self.train_process

        self.character_encoder = args.character_encoder
예제 #3
0
def get_submission_s1(filename, course_id: str, tag='pd', save=True):
    """
    Build the feature frame from submission_s1.csv for a single course.

    The processed frame is cached as an .h5 file; on a cache hit it is
    loaded from disk instead of being rebuilt.

    :param filename: base name of the submission file (without extension)
    :param course_id: course selector, e.g. 'course1'
    :param tag: format tag passed to the loader (default 'pd')
    :param save: when True, cache the processed frame to .h5
    :return: processed pandas DataFrame for the requested course
    """
    # Path of the per-course .h5 cache file.
    save_path = load_data().get_project_path() + '/data/cache/%s_%s.h5' % (filename, course_id)

    if os.path.exists(save_path):
        # Cache hit: load and downcast dtypes to reduce memory.
        submission_s1 = reduce_mem_usage(pd.read_hdf(path_or_buf=save_path, mode='r', key=course_id))
    else:
        submission_s1 = load_data().get_test_s1(filename, tag)

        # Student-level attributes (gender, ...).
        student = get_student('student')
        # Course-level attributes (course_class).
        course = get_course('course')

        # Merge student info.
        submission_s1 = pd.merge(submission_s1, student, how='left', on='student_id')
        # Merge course_class info.
        submission_s1 = pd.merge(submission_s1, course, how='left', on='course')

        # Keep only rows of the selected course.
        submission_s1 = submission_s1[submission_s1['course'] == course_id]

        # Merge section/category/complexity knowledge features.
        submission_s1 = merge_all_knowledge(submission_s1, course_id + '_exams')

        # Exam list for this course, used to index exam_id.
        course_exams = get_course_exams(course_id + '_exams')

        # Map exam_id to its ordinal position.  The lookup dict is built
        # ONCE up front; the original rebuilt it inside the lambda for
        # every row (O(n^2)).
        exam_id_map = {eid: idx for idx, eid in enumerate(course_exams['exam_id'])}
        submission_s1['exam_id'] = submission_s1['exam_id'].map(exam_id_map)

        submission_s1['pred'] = 0

        # Drop columns whose values are all identical.  .loc/.iloc replace
        # the deprecated (and since-removed) DataFrame.ix accessor.
        submission_s1 = submission_s1.loc[:, (submission_s1 != submission_s1.iloc[0]).any()]

        # Persist to the cache.
        if save:
            submission_s1.to_hdf(path_or_buf=save_path, key=course_id)

    return submission_s1
예제 #4
0
def get_course_exams(filename, tag='pd'):
    """
    Load one of the course exam tables (course1_exams.csv .. course8_exams.csv).

    :param filename: base name of the csv file, e.g. 'course1_exams'
    :param tag: format tag for the loader (default 'pd')
    :return: the loaded DataFrame
    """
    return load_data().get_train_s1(filename, tag)
예제 #5
0
def get_student(filename, tag='pd'):
    """
    Load the student.csv table.

    :param filename: base name of the csv file ('student')
    :param tag: format tag for the loader (default 'pd')
    :return: the loaded DataFrame
    """
    return load_data().get_train_s1(filename, tag)
예제 #6
0
def get_course(filename, tag='pd'):
    """
    Load the course.csv table and label-encode the course_class column.

    :param filename: base name of the csv file ('course')
    :param tag: format tag for the loader (default 'pd')
    :return: the loaded DataFrame with course_class label-encoded
    """
    course_df = load_data().get_train_s1(filename, tag)
    return label_encoding(course_df, columns=[u'course_class'])
예제 #7
0
    def __init__(self, chars_path, font_name):
        """Load the character set and pre-render one font image per character.

        Args:
            chars_path: path of the serialized character list.
            font_name: truetype font file used for rendering.
        """
        self.chars = load_data(chars_path)
        self.font_name = font_name
        self.font_size = 64

        # Glyphs are drawn at 90% of the nominal size.
        self.font = ImageFont.truetype(font=font_name,
                                       size=int(self.font_size * 0.9),
                                       encoding="utf-8")

        # Eagerly render and resize every character once, so lookups at
        # training time are plain dict accesses.
        self.font_img_dict = {}
        for char in self.chars:
            raw_img = self.char_to_font_img(char)
            self.font_img_dict[char] = self.resize_font_img(raw_img)
예제 #8
0
def get_all_knowledge(filename, tag='pd'):
    """
    Load the all_knowledge.csv table.

    :param filename: base name of the csv file ('all_knowledge')
    :param tag: format tag for the loader (default 'pd')
    :return: the loaded DataFrame
    """
    # NOTE: v1.0 stripped the 'section:'/'category:' prefixes here;
    # the current pipeline consumes the raw column values.
    return load_data().get_train_s1(filename, tag)
예제 #9
0
    #     reuslt.extend(predictions.tolist())
    #
    # submit = load_data().get_test_s1('submission_s1', 'pd')
    # submit['pred'] = reuslt
    # submit.to_csv(load_data().get_project_path() + '/data/test_s1/submission_s1_sample_mean_median.csv', index=None,
    #               encoding='utf-8')

    ########################################################mean-median-xgb#################################################################
    # data1 = load_data().get_test_s1('submission_s1_sample_xgb', 'pd')
    # data2 = load_data().get_test_s1('submission_s1_sample_baseline', 'pd')
    #
    # data1['pred'] = (data1['pred'] * 0.5 + data2['pred'] * 0.5)
    # data1.to_csv(load_data().get_project_path() + '/data/test_s1/submission_s1_sample_baseline_xgb.csv', index=None,
    #              encoding='utf-8')
    #
    # print(time.clock() - start)

    ########################################################stack-and-baseline#################################################################
    data1 = load_data().get_test_s1('submission_s1_sample_xgb', 'pd')
    data2 = load_data().get_test_s1('submission_s1_sample_baseline_1', 'pd')

    data1['pred'] = (data1['pred'] + 32) * 0.3 + data2['pred'] * 0.7
    # data1['pred'] = [round(i) for i in list(data1['pred'])]

    data1.to_csv(load_data().get_project_path() +
                 '/data/test_s1/submission_s1_sample_stack_baseline_1_xgb.csv',
                 index=None,
                 encoding='utf-8')

    print(time.clock() - start)
예제 #10
0
def get_exam_score(filename, course_id: str, tag='pd', save=True):
    """
    Build the training frame from exam_score.csv for a single course,
    e.g. course_id='course1'.

    The processed frame is cached as an .h5 file; on a cache hit it is
    loaded from disk instead of being rebuilt.

    :param filename: base name of the score file (without extension)
    :param course_id: course selector, e.g. 'course1'
    :param tag: format tag passed to the loader (default 'pd')
    :param save: when True, cache the processed frame to .h5
    :return: processed pandas DataFrame with log1p-transformed 'score'
    """
    # Path of the per-course .h5 cache file.
    save_path = load_data().get_project_path() + '/data/cache/%s_%s.h5' % (filename, course_id)

    if os.path.exists(save_path):
        # Cache hit: load and downcast dtypes to reduce memory.
        exam_score = reduce_mem_usage(pd.read_hdf(path_or_buf=save_path, mode='r', key=course_id))
    else:
        exam_score = load_data().get_train_s1(filename, tag)

        # Student-level attributes (gender, ...).
        student = get_student('student')
        # Course-level attributes (course_class).
        course = get_course('course')

        # Merge student info.
        exam_score = pd.merge(exam_score, student, how='left', on='student_id')
        # Merge course_class info.
        exam_score = pd.merge(exam_score, course, how='left', on='course')

        # Keep only rows of the selected course.
        exam_score = exam_score[exam_score['course'] == course_id]

        # Merge section/category/complexity knowledge features.
        exam_score = merge_all_knowledge(exam_score, course_id + '_exams')

        # Exam list for this course, used to index exam_id.
        course_exams = get_course_exams(course_id + '_exams')

        # Map exam_id to its ordinal position.  The lookup dict is built
        # ONCE; the original rebuilt it inside the lambda per row (O(n^2)).
        exam_id_map = {eid: idx for idx, eid in enumerate(course_exams['exam_id'])}
        exam_score['exam_id'] = exam_score['exam_id'].map(exam_id_map)

        # Per-student mean scores, keyed by student_id
        # (presumably a dict/Series — .map accepts both).
        mean_value = get_mean_value(exam_score)

        # Replace zero scores with the student's mean.  Vectorized; the
        # original groupby/concat loop was O(n*groups) and mutated group
        # copies (SettingWithCopy).
        zero_mask = exam_score['score'] == 0
        exam_score.loc[zero_mask, 'score'] = exam_score.loc[zero_mask, 'student_id'].map(mean_value)

        # Reproduce the original output ordering: the groupby loop emitted
        # groups sorted by student_id, stable within each group.
        exam_score = exam_score.sort_values('student_id', kind='mergesort').reset_index(drop=True)

        # log1p (= log(1+x)) smooths the target towards a Gaussian shape,
        # which helps the downstream model; predictions must be inverted
        # with expm1 before submission.
        exam_score["score"] = np.log1p(exam_score["score"])

        # Drop columns whose values are all identical.  .loc/.iloc replace
        # the deprecated (and since-removed) DataFrame.ix accessor.
        exam_score = exam_score.loc[:, (exam_score != exam_score.iloc[0]).any()]

        # Persist to the cache.
        if save:
            exam_score.to_hdf(path_or_buf=save_path, key=course_id)

    return exam_score
예제 #11
0
    state['{}_acc'.format(mode)].append(correct / len(data_loader.dataset))


if __name__ == '__main__':
    # get arguments
    args = parse()
    ndegree = args.pdegree

    # set seeds
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed_all(args.seed)

    # load dataset
    train_loader, valid_loader, test_loader = \
      load_data(args.batch_size, args.test_batch_size, args.dataset)

    # load model
    model = load_model(args.model, args.dataset)
    if use_cuda:
        model.cuda()

    # count total number of parameters
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print('\nTotal number of parameters: {}\n'.format(params))

    print_model_setting(args)

    # set optimizer
    if args.optimizer == 'Adam':