예제 #1
0
    def plot_all(self):
        """Draw count plots for every original column and every column pair.

        Loads the CSV at the hard-coded path with ``load_samsung`` and then
        asks ``PlotTools`` for:

        * one count plot per column listed in ``origin_cols``;
        * one grouped count plot per ordered pair ``(a_key, b_key)`` of those
          columns, including a column paired with itself.

        A pair whose plot raises is reported on stdout and skipped, so the
        remaining plots are still attempted.
        """
        path = "./data/samsung_contest/data_tansformed.csv"
        df = load_samsung(path)

        origin_cols = [
            'c00_주야',
            'c01_요일',
            'c02_사망자수',
            'c03_사상자수',
            'c04_중상자수',
            'c05_경상자수',
            'c06_부상신고자수',
            'c07_발생지시도',
            'c08_발생지시군구',
            'c09_사고유형_대분류',
            'c10_사고유형_중분류',
            'c11_법규위반',
            'c12_도로형태_대분류',
            'c13_도로형태',
            'c14_당사자종별_1당_대분류',
            'c15_당사자종별_2당_대분류',
        ]

        plot = PlotTools()
        for key in origin_cols:
            plot.count(df, key, title=f'count_{key}')

        for a_key in origin_cols:
            for b_key in origin_cols:
                try:
                    plot.count(df, a_key, b_key, title=f'count_{a_key}_groupby_{b_key}')
                except Exception as e:
                    # Best effort: one failing pair must not stop the batch.
                    # Exception (not BaseException) is caught so that Ctrl-C
                    # and SystemExit still abort this long-running loop.
                    print(a_key, b_key, e)
예제 #2
0
    def test_df(self):
        """Return the prepared test frame, building it on first access.

        When ``self._test_df`` is already set it is returned untouched.
        Otherwise ``test_kor.csv`` (under ``path_head``) is loaded and passed
        through ``add_col_num``, ``self.fill_rand`` and
        ``self.fill_inference_able`` in that order; the result is written to
        ``./test.csv`` and stored on ``self._test_df`` for later calls.
        """
        cached = self._test_df
        if cached is not None:
            return cached

        frame = load_samsung(path_join(path_head, 'test_kor.csv'))
        for step in (add_col_num, self.fill_rand, self.fill_inference_able):
            frame = step(frame)

        # Snapshot of the filled frame, written once when the cache is built.
        save_samsung(frame, './test.csv')
        self._test_df = frame

        return self._test_df
예제 #3
0
    def transform_to_result(self, predict_df):
        """Copy predicted cells into the result template and return it.

        The template is loaded from ``result_kor.csv`` under ``path_head``.
        The columns of ``predict_df`` are mapped, in order, to the letters
        ``'A'``, ``'B'``, ... (at most 26). For every template row ``i``
        (labels ``0 .. len - 1``) the ``'열'`` field, minus 2, is used as the
        row label in ``predict_df`` and the ``'행'`` field is the letter that
        selects the column; the value found there is written to ``'값'``.

        NOTE(review): '열' literally means "column" and '행' means "row", yet
        the code uses them the other way round; presumably this mirrors the
        layout of result_kor.csv. The ``- 2`` offset looks like a conversion
        from 1-based sheet rows with a header line. Confirm both against the
        file.

        :param predict_df: frame holding the predicted values.
        :return: the template frame with its ``'값'`` column filled in.
        """
        result_df = load_samsung(path_join(path_head, 'result_kor.csv'))

        predict_cols = list(predict_df.columns)
        letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')[:len(predict_cols)]
        letter_to_predict_col = {
            letter: col_name for letter, col_name in zip(letters, predict_cols)
        }

        for i in range(len(result_df)):
            entry = result_df.loc[i, :]
            source_row = int(entry['열']) - 2
            source_col = letter_to_predict_col[entry['행']]
            result_df.loc[i, '값'] = predict_df.loc[source_row, source_col]

        return result_df
예제 #4
0
    def fill_rand(self, test_df):
        """Fill a missing day/night, weekday or region cell with a random value.

        Candidate regions are collected from ``data_init.csv`` (under
        ``path_head``). Rows of ``test_df`` are addressed by the labels
        ``0 .. len(test_df) - 1`` and, for each row, the first rule that
        matches is applied:

        1. ``'c00_주야'`` missing: pick ``'주간'`` or ``'야간'``.
        2. ``'c01_요일'`` missing: pick one of the seven weekday names.
        3. city and sub-city both missing: pick a city, then a sub-city that
           occurs together with that city in the training data.
        4. city present, sub-city missing: pick a sub-city of that city.

        NOTE(review): the rules form one ``if``/``elif`` chain, so at most one
        gap per row is filled by this method. Confirm that this is intended.

        :param test_df: frame to fill; it is modified in place.
        :return: the same ``test_df`` object.
        :raises KeyError: in rule 4, when the row's city has no sub-city in
            the training data.
        """
        path = path_join(path_head, 'data_init.csv')
        train_df = load_samsung(path)

        # Only the (sorted) group keys are needed, so size() replaces the
        # former dummy 'id' column plus count().
        city_to_city_sub = {}
        pair_index = train_df.groupby(['c07_발생지시도', 'c08_발생지시군구']).size().index
        for city_name, sub_name in pair_index:
            city_to_city_sub.setdefault(city_name, []).append(sub_name)

        city = list(train_df.groupby('c07_발생지시도').size().index)
        city_sub = list(train_df.groupby('c08_발생지시군구').size().index)

        day = ['월', '화', '수', '목', '금', '토', '일']

        for idx in range(len(test_df)):
            if self.is_nan(test_df.loc[idx, 'c00_주야']):
                test_df.loc[idx, 'c00_주야'] = np.random.choice(['주간', '야간'], 1)[0]

            elif self.is_nan(test_df.loc[idx, 'c01_요일']):
                test_df.loc[idx, 'c01_요일'] = np.random.choice(day, 1)[0]

            elif self.is_nan(test_df.loc[idx, 'c07_발생지시도']) and self.is_nan(test_df.loc[idx, 'c08_발생지시군구']):
                picked_city = np.random.choice(city, 1)[0]
                test_df.loc[idx, 'c07_발생지시도'] = picked_city
                # Draw the sub-city from the chosen city so the pair is one
                # that exists in the training data. A city with no recorded
                # sub-city falls back to the global list.
                candidates = city_to_city_sub.get(picked_city) or city_sub
                test_df.loc[idx, 'c08_발생지시군구'] = np.random.choice(candidates, 1)[0]

            elif not self.is_nan(test_df.loc[idx, 'c07_발생지시도']) and self.is_nan(test_df.loc[idx, 'c08_발생지시군구']):
                choice = city_to_city_sub[test_df.loc[idx, 'c07_발생지시도']]
                test_df.loc[idx, 'c08_발생지시군구'] = np.random.choice(choice, 1)[0]

        return test_df
예제 #5
0
    def predict(self, cache=True):
        """Predict the target cells of every test row and persist the result.

        If ``cache`` is true and ``predict.csv`` already exists under
        ``path_head``, that file is loaded and returned without touching
        ``self.test_df_trans`` or ``self.model``. Otherwise each row of
        ``self.test_df_trans`` (labels ``0 .. len - 1``) is processed:

        * ``self.get_p_type`` yields the columns to predict for the row;
        * those columns are removed from a copy of ``self.x_cols`` to form
          the feature columns;
        * for each target column the classifier stored at
          ``self.model[str(p_type)][y_col]`` predicts from the row's
          features, and the ``'XGBoostClf'`` entry of its output is written
          back into the frame.

        The filled frame is saved to ``predict.csv`` and returned.

        :param cache: reuse an existing ``predict.csv`` instead of predicting.
        :return: the loaded cache frame, or ``self.test_df_trans`` filled in
            place.
        :raises ValueError: if a target column is not among ``self.x_cols``.
        """
        path = path_join(path_head, 'predict.csv')
        # Check the cache before reading test_df_trans / model.
        # NOTE(review): those attributes look lazily built and potentially
        # costly; a cache hit should not pay for them. Confirm.
        if cache and os.path.exists(path):
            print('predict cache found, use cache')
            return load_samsung(path)

        test_df_trans = self.test_df_trans
        clf_dicts = self.model
        clf_name = 'XGBoostClf'

        print('predict')
        for idx in range(len(test_df_trans)):
            p_type = self.get_p_type(test_df_trans, idx)
            p_type_str = str(p_type)
            print(f'predict {idx}, {p_type_str}')

            # Features are every x column that is not itself a target here.
            x_cols = list(self.x_cols)
            for y_col in p_type:
                x_cols.remove(y_col)

            for y_col in p_type:
                print(f'predict {y_col}')

                x_df = DF(test_df_trans.loc[[idx], x_cols])
                x_df = x_df.reset_index(drop=True)

                clf = clf_dicts[p_type_str][y_col]
                print(clf)

                predict = clf.predict(x_df)[clf_name][0][0]
                test_df_trans.loc[idx, y_col] = predict
                print(f'predict = {predict}, at_df:{test_df_trans.loc[idx, y_col]}')

        save_samsung(test_df_trans, path)

        return test_df_trans
예제 #6
0
 def test_bench(self):
     """Convert the saved predictions into the result layout and save them.

     Loads ``predict.csv`` from ``path_head``, passes it through
     ``self.transform_to_result`` and writes the outcome to
     ``result_predict.csv`` in the same directory.
     """
     predictions = load_samsung(path_join(path_head, 'predict.csv'))
     result_frame = self.transform_to_result(predictions)
     out_path = path_join(path_head, 'result_predict.csv')
     save_samsung(result_frame, out_path)