def test_run_train_test():
    df_train, train_params = initial_processing(train_data, mode='train')
    df_test, _ = initial_processing(test_data, mode='train')

    tf = CatTransformer(train_params['cat_cols'])
    tf.fit(df_train)
    df_train_tf = tf.transform(df_train)
    df_test_tf = tf.transform(df_test)

    assert set(df_train_tf.columns.values) == set(df_test_tf.columns.values)
def run_train_test(ds_name, metric, params, obj):
    path = _DATA_PATH + ds_name

    with Profiler('initial feature selection'):
        x_initial_raw, y_initial, _ = load_data(f'{path}/train.csv', mode='train', sample=_SAMPLE)
        x_initial, ini_params = initial_processing(x_initial_raw, mode='train')

        tf = CatTransformer(ini_params['cat_cols'])
        x_initial_tf = tf.fit_transform(x_initial)

        selected_features, feat_list = ols_selection(x_initial_tf, y_initial, obj)
        hp_params = hyperopt_lgb(x_initial_tf[feat_list], y_initial, params, obj)
        print('selected features=', len(selected_features))

    # Reload the data keeping only the selected features.
    x_train_raw, y_train, _ = load_data(f'{path}/train.csv', mode='train', sample=_SAMPLE,
                                        used_cols=selected_features)
    x_test_raw, _, _ = load_data(f'{path}/test.csv', mode='test')
    y_test = load_test_label(f'{path}/test-target.csv')

    x_train, train_params = initial_processing(x_train_raw, mode='train')
    x_test, _ = initial_processing(x_test_raw, mode='test')

    with Profiler('fit transform cat columns'):
        # Align the test frame to the training column layout before encoding.
        x_test_rein = x_test.reindex(columns=train_params['used_cols'])
        tf = CatTransformer(train_params['cat_cols'])
        tf.fit(x_train)
        x_train_tf = tf.transform(x_train)
        x_test_tf = tf.transform(x_test_rein)

    with Profiler('run train'):
        model = lgb.train(hp_params, lgb.Dataset(x_train_tf, label=y_train), 600)

    with Profiler('predict'):
        y_train_out = model.predict(x_train_tf)
        y_test_out = model.predict(x_test_tf)

    train_err = metric(y_train, y_train_out)
    test_err = metric(y_test, y_test_out)
    return train_err, test_err
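# Hedged usage sketch for run_train_test (not part of the original source). The
# dataset name, the RMSE metric and the base LightGBM parameter dict below are
# illustrative assumptions; only the (ds_name, metric, params, obj) signature is
# taken from the function above.
from sklearn.metrics import mean_squared_error


def example_run_train_test():
    def rmse(y_true, y_pred):
        return mean_squared_error(y_true, y_pred) ** 0.5

    # Placeholder parameters; the real base params live in the training script.
    base_params = {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.05}
    train_err, test_err = run_train_test('dataset_1', metric=rmse,
                                         params=base_params, obj='regression')
    print(f'train rmse={train_err:.4f}, test rmse={test_err:.4f}')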
def test_cat_fit_test():
    _, params = initial_processing(train_data, mode='train')
    cat_cols = params['cat_cols']

    tf = CatTransformer(cat_cols)
    tf.fit(train_data[cat_cols])
    res_df = tf.transform(test_data)

    # Categories seen during fit are encoded by their training frequency.
    assert res_df['string_0'][0] == 2 / 5
    assert res_df['string_0'][1] == 1 / 5
    assert res_df['string_0'][2] == 2 / 5
    # Categories unseen during fit become NaN.
    assert np.isnan(res_df['string_0'][3])
    assert np.isnan(res_df['string_0'][4])
    assert np.isnan(res_df['string_1']).all()
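# Minimal sketch of a frequency-based categorical encoder consistent with the
# expectations in test_cat_fit_test above. This is an illustrative assumption,
# not the project's actual CatTransformer implementation.
import pandas as pd


class FrequencyCatTransformer:
    """Encode categorical values by their relative frequency in the fit data."""

    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        self.freq_maps = {}

    def fit(self, df):
        for col in self.cat_cols:
            # Relative frequency of each category in the training column.
            self.freq_maps[col] = df[col].value_counts(normalize=True)
        return self

    def transform(self, df):
        out = df.copy()
        for col in self.cat_cols:
            # Unseen categories map to NaN, matching the asserts above.
            out[col] = df[col].map(self.freq_maps[col])
        return out

    def fit_transform(self, df):
        return self.fit(df).transform(df)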
def test_date_col_processing():
    df, _ = initial_processing(train_data, mode='train')

    assert df['date_month_datetime_0'][0] == 1
    assert df['date_month_datetime_0'][2] == 2
    assert df['date_month_datetime_0'][3] == 11

    assert df['date_weekday_datetime_0'][1] == 2
    assert df['date_weekday_datetime_0'][3] == 0
    assert df['date_weekday_datetime_0'][4] == 6

    assert df['date_day_datetime_0'][1] == 3
    assert df['date_day_datetime_0'][2] == 14
    assert df['date_day_datetime_0'][4] == 30
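# Illustrative sketch (an assumption, not the actual initial_processing code) of how
# the date_* features checked above could be derived from a datetime column with pandas.
def add_date_features(df, col='datetime_0'):
    dates = pd.to_datetime(df[col])
    df[f'date_month_{col}'] = dates.dt.month      # 1-12
    df[f'date_weekday_{col}'] = dates.dt.weekday  # Monday=0 .. Sunday=6
    df[f'date_day_{col}'] = dates.dt.day          # 1-31
    return df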
"num_leaves": 200, "feature_fraction": 0.70, "bagging_fraction": 0.70, 'bagging_freq': 4, "max_depth": -1, "verbosity": -1, "reg_alpha": 0.3, "reg_lambda": 0.1, "min_child_weight": 10, 'zero_as_missing': True, 'num_threads': 4, 'seed': 1 } with Profiler('load data and perform feature selection'): x_ini_raw, y_ini, _ = load_data(args.train_csv, sample=_SAMPLE) x_initial, ini_params = initial_processing(x_ini_raw, mode='train') tf = CatTransformer(ini_params['cat_cols']) x_initial_tf = tf.fit_transform(x_initial) selected_features, feat_list = ols_selection(x_initial_tf, y_ini, obj) hp_params = hyperopt_lgb(x_initial_tf[feat_list], y_ini, params, obj) print(f'{ len(selected_features)} features selected') df_X_raw, df_y, _ = load_data(args.train_csv, used_cols=selected_features) x_train, train_params = initial_processing(df_X_raw, mode='train') with Profiler('fit transform cat columns'): tf = CatTransformer(train_params['cat_cols']) tf.fit(x_train) x_train_tf = tf.transform(x_train) with Profiler('run train'):
def test_cat_cols_frequency():
    _, params = initial_processing(train_data, mode='train')
    cat_cols = params['cat_cols']
    assert set(cat_cols) == {'id_0', 'string_0', 'string_1'}
parser = argparse.ArgumentParser()
parser.add_argument('--test-csv', required=True)
parser.add_argument('--prediction-csv', type=argparse.FileType('w'), required=True)
parser.add_argument('--model-dir', required=True)
args = parser.parse_args()

start_time = time.time()

# Load the persisted model, categorical transformer and training parameters.
model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
with open(model_config_filename, 'rb') as fin:
    model_config = pickle.load(fin)

model = model_config['model']
tf = model_config['cat_tf']
train_params = model_config['train_params']

x_test_raw, _, df = load_data(args.test_csv, mode='test')
x_test, _ = initial_processing(x_test_raw, mode='test')

with Profiler('transform cat columns'):
    # Align test columns with the training layout before encoding.
    x_test_rein = x_test.reindex(columns=train_params['used_cols'])
    x_test_tf = tf.transform(x_test_rein)

df['prediction'] = model.predict(x_test_tf)
df[['line_id', 'prediction']].to_csv(args.prediction_csv, index=False)

print('Prediction time: {:0.2f}'.format(time.time() - start_time))
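# Example invocation of this prediction script (script name and paths are placeholders):
#   python predict.py --test-csv data/test.csv \
#                     --prediction-csv predictions.csv \
#                     --model-dir models/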