def plot_all(self):
    """Draw count plots for every original column and every column pair.

    Loads the transformed data set, then asks ``PlotTools.count`` for one
    plot per column in ``origin_cols`` and one plot per ordered pair
    ``(a_key, b_key)`` of those columns (including ``a_key == b_key``).

    A failure while plotting a pair is printed and skipped so a single bad
    combination does not abort the remaining plots.
    """
    path = "./data/samsung_contest/data_tansformed.csv"
    df = load_samsung(path)

    origin_cols = [
        'c00_주야',
        'c01_요일',
        'c02_사망자수',
        'c03_사상자수',
        'c04_중상자수',
        'c05_경상자수',
        'c06_부상신고자수',
        'c07_발생지시도',
        'c08_발생지시군구',
        'c09_사고유형_대분류',
        'c10_사고유형_중분류',
        'c11_법규위반',
        'c12_도로형태_대분류',
        'c13_도로형태',
        'c14_당사자종별_1당_대분류',
        'c15_당사자종별_2당_대분류',
    ]

    plot = PlotTools()

    # One count plot per single column.
    for key in origin_cols:
        plot.count(df, key, title=f'count_{key}')

    # One count plot per ordered column pair.
    for a_key in origin_cols:
        for b_key in origin_cols:
            try:
                plot.count(df, a_key, b_key, title=f'count_{a_key}_groupby_{b_key}')
            except Exception as e:
                # Exception (not BaseException) so that Ctrl-C and
                # SystemExit still stop this long-running loop.
                print(a_key, b_key, e)
def test_df(self):
    """Return the prepared test frame, building and caching it on first use.

    On the first call (while ``self._test_df`` is ``None``) the frame is
    loaded from ``test_kor.csv`` under ``path_head``, passed through
    ``add_col_num``, ``self.fill_rand`` and ``self.fill_inference_able`` in
    that order, written to ``./test.csv`` and stored on ``self._test_df``.
    Later calls return the stored frame without redoing any of that work.
    """
    # Already built: hand back the cached frame.
    if self._test_df is not None:
        return self._test_df

    frame = load_samsung(path_join(path_head, 'test_kor.csv'))
    frame = add_col_num(frame)
    frame = self.fill_rand(frame)
    frame = self.fill_inference_able(frame)

    # Keep a copy of the prepared frame on disk.
    save_samsung(frame, './test.csv')

    self._test_df = frame
    return self._test_df
def transform_to_result(self, predict_df):
    """Fill the '값' column of the result template from ``predict_df``.

    The template is loaded from ``result_kor.csv`` under ``path_head``.
    For each template row, the '열' field (converted to int, minus 2) is
    used as the row label in ``predict_df`` and the '행' field is an
    upper-case letter that selects a ``predict_df`` column by position
    ('A' is the first column, 'B' the second, and so on).

    NOTE(review): this looks like spreadsheet-style cell addressing with a
    header row offset -- confirm against the template file.

    :param predict_df: frame holding the predicted values.
    :return: the template frame with '값' filled in.
    """
    result_df = load_samsung(path_join(path_head, 'result_kor.csv'))

    # zip() stops at the shorter input, so only as many letters as there are
    # columns are mapped (and at most 26 columns are addressable).
    letters = 'abcdefghijklmnopqrstuvwxyz'.upper()
    letter_to_column = dict(zip(letters, predict_df.columns))

    for i in range(len(result_df)):
        entry = result_df.loc[i, :]
        source_row = int(entry['열']) - 2
        source_col = letter_to_column[entry['행']]
        result_df.loc[i, '값'] = predict_df.loc[source_row, source_col]

    return result_df
def fill_rand(self, test_df):
    """Fill selected missing categorical fields of ``test_df`` with random values.

    Candidate values for the two location columns are taken from the
    training data in ``data_init.csv`` under ``path_head``. For each row,
    the first rule below whose condition holds is applied (the rules are
    mutually exclusive, so a row receives at most one of them per call):

    1. 'c00_주야' missing            -> random choice of '주간' / '야간'.
    2. 'c01_요일' missing            -> random day of the week.
    3. 'c08_발생지시군구' missing:
       * 'c07_발생지시도' also missing -> random province, then a random
         district belonging to that province.
       * 'c07_발생지시도' present      -> random district belonging to it.

    NOTE(review): because only one rule fires per row, a row missing
    several of these fields keeps the others missing -- confirm that this
    is what the later prediction step expects.

    :param test_df: frame to fill; modified in place. Rows are addressed
        with ``.loc[0 .. len-1]``, so a default integer index is assumed.
    :return: the same ``test_df`` object.
    :raises KeyError: if a row has a province that does not occur in the
        training data and its district is missing.
    """
    train_df = load_samsung(path_join(path_head, 'data_init.csv'))

    # Helper column so groupby(...).count() has something to count; only the
    # resulting group keys are used below.
    train_df['id'] = [0] * len(train_df)

    # province -> list of districts observed together with it in training.
    pair_index = train_df.groupby(['c07_발생지시도', 'c08_발생지시군구'])['id'].count().index
    city_to_city_sub = {}
    for city_name, sub_name in pair_index:
        city_to_city_sub.setdefault(city_name, []).append(sub_name)

    city = list(train_df.groupby('c07_발생지시도')['id'].count().index)
    city_sub = list(train_df.groupby('c08_발생지시군구')['id'].count().index)

    day_night = ['주간', '야간']
    day = ['월', '화', '수', '목', '금', '토', '일']

    for idx in range(len(test_df)):
        if self.is_nan(test_df.loc[idx, 'c00_주야']):
            test_df.loc[idx, 'c00_주야'] = np.random.choice(day_night, 1)[0]

        elif self.is_nan(test_df.loc[idx, 'c01_요일']):
            test_df.loc[idx, 'c01_요일'] = np.random.choice(day, 1)[0]

        elif self.is_nan(test_df.loc[idx, 'c08_발생지시군구']):
            if self.is_nan(test_df.loc[idx, 'c07_발생지시도']):
                # Both location fields missing: draw the province first and
                # then a district of that province, so the pair is one that
                # actually occurs in the training data. Fall back to the
                # full district list if the province has no known district.
                picked_city = np.random.choice(city, 1)[0]
                candidates = city_to_city_sub.get(picked_city, city_sub)
                test_df.loc[idx, 'c07_발생지시도'] = picked_city
                test_df.loc[idx, 'c08_발생지시군구'] = np.random.choice(candidates, 1)[0]
            else:
                # Province known: restrict the draw to its districts.
                candidates = city_to_city_sub[test_df.loc[idx, 'c07_발생지시도']]
                test_df.loc[idx, 'c08_발생지시군구'] = np.random.choice(candidates, 1)[0]

    return test_df
def predict(self, cache=True):
    """Predict the target columns of every test row and save the result.

    If ``predict.csv`` already exists under ``path_head`` and ``cache`` is
    true, that file is loaded and returned without predicting anything.

    Otherwise, for each row ``self.get_p_type`` yields the columns to
    predict. Those columns are removed from a copy of ``self.x_cols`` to
    form the feature set, and for each target column the classifier stored
    at ``self.model[str(p_type)][y_col]`` is applied to the single-row
    feature frame. The predicted value is written back into the test frame,
    which is saved to ``predict.csv`` once all rows are done.

    :param cache: reuse an existing ``predict.csv`` when true.
    :return: the loaded cache frame, or the test frame with predictions
        filled in.
    """
    test_df_trans = self.test_df_trans
    models = self.model
    cache_path = path_join(path_head, 'predict.csv')

    if os.path.exists(cache_path) and cache:
        print('predict cache found, use cache')
        return load_samsung(cache_path)

    print('predict')

    # Key under which the classifier's predict() output holds the values.
    clf_name = 'XGBoostClf'

    for idx in range(len(test_df_trans)):
        p_type = self.get_p_type(test_df_trans, idx)
        p_type_str = str(p_type)
        print(f'predict {idx}, {p_type_str}')

        # Features are all x columns except the ones being predicted.
        feature_cols = list(self.x_cols)
        for y_col in p_type:
            feature_cols.remove(y_col)

        for y_col in p_type:
            print(f'predict {y_col}')

            # Single-row frame, re-indexed from 0 for the classifier.
            row_frame = DF(test_df_trans.loc[[idx], feature_cols]).reset_index(drop=True)

            clf = models[p_type_str][y_col]
            print(clf)

            predict = clf.predict(row_frame)[clf_name][0][0]
            test_df_trans.loc[idx, y_col] = predict
            print(f'predict = {predict}, at_df:{test_df_trans.loc[idx, y_col]}')

    save_samsung(test_df_trans, cache_path)
    return test_df_trans
def test_bench(self):
    """Convert the saved predictions into the result format and save them.

    Loads ``predict.csv`` from ``path_head``, passes it through
    ``self.transform_to_result`` and writes the outcome to
    ``result_predict.csv`` in the same directory.
    """
    predictions = load_samsung(path_join(path_head, 'predict.csv'))
    result = self.transform_to_result(predictions)
    output_path = path_join(path_head, 'result_predict.csv')
    save_samsung(result, output_path)