def objective(params):
    """Evaluate one hyper-parameter candidate and report its loss.

    The sampled values in ``params`` are cast to the numeric type each
    LightGBM option expects, merged with a fixed set of options, and stored
    under ``conf.model["clf_params"]``. A fresh ``LightGBM`` wrapper is then
    trained with k-fold CV on the module-level ``train_df`` / ``test_df`` /
    ``feats`` against the ``'TARGET'`` column.

    Args:
        params: Mapping holding the sampled value for every tuned key listed
            in ``tuned_casts`` below.

    Returns:
        dict: ``{'loss': -score, 'status': STATUS_OK}``. The score is negated
        so that a minimiser effectively maximises the CV score.
    """
    # (key, cast) pairs in the exact order they must appear in clf_params.
    tuned_casts = (
        ("learning_rate", float),
        ("max_bin", int),
        ("num_leaves", int),
        ("min_child_samples", int),
        ("colsample_bytree", float),
        ("subsample", float),
        ("min_gain_to_split", float),
        ("reg_alpha", float),
        ("reg_lambda", float),
    )
    # Options that are not searched over.
    fixed_options = {
        "boosting_type": "dart",
        "n_estimators": 10000,
        "max_depth": -1,
        "nthread": -1,
        "scale_pos_weight": 1,
        "is_unbalance": False,
        "silent": -1,
        "verbose": -1,
        "random_state": 0,
    }

    base_model_conf = conf.model
    clf_params = {key: cast(params[key]) for key, cast in tuned_casts}
    clf_params.update(fixed_options)

    conf.model = {**base_model_conf, "clf_params": clf_params}
    pprint(conf.model.clf_params)

    learner = LightGBM()
    cv_score = learner.train_and_predict_kfold(
        train_df, test_df, feats, 'TARGET', conf)
    return {'loss': -1.0 * cv_score, 'status': STATUS_OK}
def train_and_predict_lightgbm(X_train_all, y_train_all, X_test):
    """Run seed-repeated stratified K-fold LightGBM training and write a submission.

    The continuous target is binned into ``SK_NUM`` quantiles purely to drive
    ``StratifiedKFold``; the models themselves are trained on ``log(y + 1)``.
    Test predictions from every (seed, fold) model are averaged, mapped back
    with ``exp(y) - 1`` and written to a timestamped CSV under
    ``./data/output/``.

    Relies on module-level ``SEED``, ``FOLDS``, ``SK_NUM``, ``params``,
    ``config``, ``target_name`` and ``now``.

    Args:
        X_train_all: Training features (indexed positionally via ``.iloc``).
        y_train_all: Training target (indexed positionally via ``.iloc``).
        X_test: Test features passed to every fold model for prediction.

    Returns:
        None. Results are printed, logged and written to disk.
    """
    # Quantile bins of the raw target, used only for stratification.
    strat_bins = pd.qcut(y_train_all, SK_NUM, labels=False)

    # Transform the target with log(y + 1) before training (np.log1p() also works).
    y_train_all = np.log(y_train_all + 1)

    test_predictions = []
    fitted_models = []
    for seed in SEED:
        splitter = StratifiedKFold(
            n_splits=FOLDS, shuffle=True, random_state=seed)
        for trn_idx, val_idx in splitter.split(X_train_all, strat_bins):
            X_train = X_train_all.iloc[trn_idx, :]
            X_valid = X_train_all.iloc[val_idx, :]
            y_train = y_train_all.iloc[trn_idx]
            y_valid = y_train_all.iloc[val_idx]

            # Run LightGBM on this fold.
            learner = LightGBM()
            fold_pred, _valid_pred, fold_model = learner.train_and_predict(
                X_train, X_valid, y_train, y_valid, X_test, params)

            # Keep the results.
            test_predictions.append(fold_pred)
            fitted_models.append(fold_model)

            # Log this fold's best score.
            log_best(fold_model, config['loss'])

    # CV score: mean of each model's best validation metric.
    scores = []
    for fitted in fitted_models:
        scores.append(fitted.best_score['valid_0'][config['loss']])
    score = sum(scores) / len(scores)

    print('===CV scores===')
    print(scores)
    print(score)
    logging.debug('===CV scores===')
    logging.debug(scores)
    logging.debug(score)

    # Build the submission file.
    ID_name = config['ID_name']
    test_ids = pd.read_feather(f'data/interim/test.feather')[ID_name]
    sub = pd.DataFrame(test_ids)

    averaged = sum(test_predictions) / len(test_predictions)

    # Undo the log transform with exp(y) - 1 (np.expm1() also works).
    sub[target_name] = np.exp(averaged) - 1

    out_path = './data/output/sub_{0}_{1:%Y%m%d%H%M%S}_{2}.csv'.format(
        config['model'], now, score)
    sub.to_csv(out_path, index=False)
def classifier_lgbm(features, config, train_mode, **kwargs):
    """Create the ``'light_gbm'`` pipeline ``Step``.

    Args:
        features: In train mode, a ``(features_train, features_valid)`` pair
            of upstream steps; otherwise a single upstream feature step.
        config: Configuration object providing ``light_gbm``,
            ``random_search.light_gbm`` and ``env.cache_dirpath``.
        train_mode: Selects the training wiring (with validation inputs and
            optional random search) or the inference wiring.
        **kwargs: Forwarded unchanged to ``Step``.

    Returns:
        The configured ``Step`` instance.
    """
    cache_dir = config.env.cache_dirpath

    # Inference: only the features are fed to the model.
    if not train_mode:
        predict_adapter = Adapter({'X': E(features.name, 'features')})
        return Step(name='light_gbm',
                    transformer=LightGBM(**config.light_gbm),
                    input_steps=[features],
                    adapter=predict_adapter,
                    cache_dirpath=cache_dir,
                    **kwargs)

    features_train, features_valid = features
    search_cfg = config.random_search.light_gbm

    if search_cfg.n_runs:
        # A truthy run count switches on random hyper-parameter search,
        # scored by ROC AUC on the validation inputs.
        monitors = [
            NeptuneMonitor(**search_cfg.callbacks.neptune_monitor),
            SaveResults(**search_cfg.callbacks.save_results),
        ]
        transformer = RandomSearchOptimizer(
            LightGBM,
            config.light_gbm,
            train_input_keys=[],
            valid_input_keys=['X_valid', 'y_valid'],
            score_func=roc_auc_score,
            maximize=True,
            n_runs=search_cfg.n_runs,
            callbacks=monitors)
    else:
        transformer = LightGBM(**config.light_gbm)

    train_name = features_train.name
    valid_name = features_valid.name
    train_adapter = Adapter({
        'X': E(train_name, 'features'),
        'y': E('input', 'y'),
        'feature_names': E(train_name, 'feature_names'),
        'categorical_features': E(train_name, 'categorical_features'),
        'X_valid': E(valid_name, 'features'),
        'y_valid': E('input', 'y_valid'),
    })
    return Step(name='light_gbm',
                transformer=transformer,
                input_data=['input'],
                input_steps=[features_train, features_valid],
                adapter=train_adapter,
                cache_dirpath=cache_dir,
                **kwargs)
def model_train(df, train_target, params):
    """Train, evaluate and persist a LightGBM model.

    Builds a ``LightGBM`` wrapper with a 0.25 test split and the module-level
    MLflow settings, then runs ``train()``, ``evaluate()`` and
    ``save_model(MODEL_FILE_PATH)`` in that order, logging each stage.

    Args:
        df: Feature data passed as ``X``.
        train_target: Target passed as ``y``.
        params: Model parameters forwarded to the wrapper.

    Returns:
        None.
    """
    wrapper_settings = {
        "X": df,
        "y": train_target,
        "test_size": 0.25,
        "params": params,
        "mlflow_tracking_server_uri": MLFLOW_TRACKING_SERVER_URI,
        "experiment_name": MLFLOW_EXPERIMENT_NAME,
    }
    trainer = LightGBM(**wrapper_settings)

    logger.info("Training......")
    trainer.train()

    logger.info("Evaluating....")
    trainer.evaluate()

    logger.info("Saving model.....")
    trainer.save_model(MODEL_FILE_PATH)
def _build_classifier(seed_num, fold_num):
    """Instantiate the classifier selected by the module-level ``model_name``."""
    if model_name == "lightgbm":
        return LightGBM()
    if model_name == "nn":
        return NeuralNet(seed_num, fold_num)
    if model_name == "cnn1d":
        return CNN1d(seed_num, fold_num)
    if model_name == "logistic_regression":
        return LogisticRegressionClassifier()
    logger.debug("No such model name")
    # ValueError is still an Exception, so existing `except Exception`
    # handlers keep working, but the message now says what went wrong.
    raise ValueError(f"No such model name: {model_name!r}")


def _resample_training_data(X_train, y_train):
    """Over-sample the training fold as requested by ``config["sampling"]``."""
    if "sampling" not in config:
        return X_train, y_train

    method = config["sampling"]
    if method == "SMOTE":
        sampler = SMOTE()
    elif method == "ADASYN":
        sampler = ADASYN()
    elif method == "RandomOverSampler":
        sampler = RandomOverSampler()
    else:
        # The original used a bare `raise` here, which outside an `except`
        # block only produces "RuntimeError: No active exception to reraise".
        raise ValueError(f"Unknown sampling method: {method!r}")
    return sampler.fit_resample(X_train, y_train)


def train_and_predict(X_train_all, y_train_all, X_test, seed_num):
    """Run one seeded stratified K-fold training pass and build predictions.

    For every fold a classifier chosen by the module-level ``model_name`` is
    trained (optionally on an over-sampled training split), its validation
    predictions are stored as out-of-fold rows and its test predictions are
    collected. Test predictions are averaged over folds for the submission.

    Relies on module-level ``model_params``, ``seed``, ``config``,
    ``model_name``, ``config_filename``, ``ID_name`` and ``target_name``.
    Note that ``model_params["seed"]`` is overwritten in place.

    Args:
        X_train_all: Training features (indexed positionally via ``.iloc``).
        y_train_all: Training labels (indexed positionally via ``.iloc``).
        X_test: Test features handed to every fold model.
        seed_num: Offset added to the base ``seed`` for this pass.

    Returns:
        tuple: ``(oof_df, sub)`` where ``oof_df`` holds the out-of-fold class
        predictions (one column per class) and ``sub`` is the submission
        frame with the ID column and column 1 of the averaged predictions.

    Raises:
        ValueError: If ``model_name`` or ``config["sampling"]`` is not one of
            the supported values.
    """
    model_params["seed"] = seed + seed_num

    oof_df = pd.DataFrame(
        index=range(X_train_all.shape[0]),
        columns=range(model_params["num_class"]))
    y_preds = []
    models = []
    auc_scores = []
    acc_scores = []
    logloss_scores = []

    kf = StratifiedKFold(n_splits=config["fold"],
                         shuffle=True,
                         random_state=model_params["seed"])
    for fold_num, (train_index, valid_index) in enumerate(
            kf.split(X_train_all, y_train_all)):
        logger.debug(f"FOLD: {fold_num}")
        X_train = X_train_all.iloc[train_index, :]
        X_valid = X_train_all.iloc[valid_index, :]
        y_train = y_train_all.iloc[train_index]
        y_valid = y_train_all.iloc[valid_index]

        # Train & inference. Only the training split is ever resampled.
        classifier = _build_classifier(seed_num, fold_num)
        X_train, y_train = _resample_training_data(X_train, y_train)

        y_pred, y_valid_pred, model = classifier.train_and_predict(
            X_train, X_valid, y_train, y_valid, X_test, model_params)

        # Store the results.
        y_preds.append(y_pred)
        oof_df.iloc[valid_index, :] = y_valid_pred
        models.append(model)

        # Fold scores. AUC and logloss use column 1 of the predictions,
        # accuracy uses the arg-max class.
        auc_valid = evaluate_score(y_valid, y_valid_pred[:, 1], "auc")
        acc_valid = evaluate_score(
            y_valid, y_valid_pred.argmax(axis=1), "acc")
        logloss_valid = evaluate_score(
            y_valid, y_valid_pred[:, 1], "logloss")
        logger.debug(
            f"\t auc:{auc_valid}, acc: {acc_valid}, logloss: {logloss_valid}")
        auc_scores.append(auc_valid)
        acc_scores.append(acc_valid)
        logloss_scores.append(logloss_valid)

    # For LightGBM, report the fold-averaged feature importance.
    if model_name == "lightgbm":
        feature_imp_np = np.zeros(X_train_all.shape[1])
        for model in models:
            feature_imp_np += model.feature_importance() / len(models)
        feature_imp = pd.DataFrame(
            sorted(zip(feature_imp_np, X_train_all.columns)),
            columns=['Value', 'Feature'])
        logger.debug(feature_imp)

        plt.figure(figsize=(20, 10))
        sns.barplot(x="Value",
                    y="Feature",
                    data=feature_imp.sort_values(by="Value", ascending=False))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        plt.savefig(f'./logs/plots/features_{config_filename}.png')
        # Release the figure; otherwise every call leaves one open in pyplot.
        plt.close()

    # CV scores.
    auc_score = sum(auc_scores) / len(auc_scores)
    acc_score = sum(acc_scores) / len(acc_scores)
    logloss_score = sum(logloss_scores) / len(logloss_scores)
    logger.debug('=== CV scores ===')
    logger.debug(
        f"\t auc:{auc_score}, acc: {acc_score}, logloss: {logloss_score}")

    # Build the submission frame.
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = sum(y_preds) / len(y_preds)
    # The submission carries the column-1 probability, not the arg-max label.
    sub[target_name] = y_sub[:, 1]

    return oof_df, sub
# Accumulators: fold-averaged test prediction and per-fold validation RMSE.
sub_prediction = np.zeros(test_X.shape[0])
oof_scores = []

# LightGBM
lgb_params = dict(
    objective="regression",
    boosting_type="dart",
    metric="rmse",
    num_leaves=15,
    learning_rate=0.1,
    max_depth=7,
    bagging_fraction=0.9,
    feature_fraction=0.9,
    number_boosting_rounds=100,
    early_stopping_rounds=10,
)
light_gbm = LightGBM(**lgb_params)

n_folds = len(folds)
for fold_, (trn_, val_) in enumerate(folds):
    trn_X = train_X.iloc[trn_]
    trn_y = TARGET[trn_]
    val_X = train_X.iloc[val_]
    val_y = TARGET[val_]

    # The same wrapper instance is re-fitted on every fold.
    light_gbm.fit(trn_X, trn_y, val_X, val_y)

    # Out-of-fold predictions; negative values are clipped to zero.
    oof_prediction[val_] = light_gbm.transform(val_X)['prediction']
    negative_oof = oof_prediction < 0
    oof_prediction[negative_oof] = 0

    # Test predictions: clip, map back with expm1, average over folds.
    _preds = light_gbm.transform(test_X)['prediction']
    negative_test = _preds < 0
    _preds[negative_test] = 0
    sub_prediction += np.expm1(_preds) / n_folds

    # RMSE of this fold's clipped out-of-fold predictions.
    fold_mse = mean_squared_error(TARGET[val_], oof_prediction[val_])
    oof_scores.append(fold_mse**0.5)
# external feature gap_ext = GapFeatureEngineerExternal() train_full = gap_ext.transform(train_all, train_all) test_full = gap_ext.transform(test_all, train_all) train_full = train_full.reindex( columns=features['target_feature']+features['id_feature']+\ sorted(features['categorical_feature']+features['numerical_feature'])) test_full = test_full.reindex( columns=features['target_feature']+features['id_feature']+\ sorted(features['categorical_feature']+features['numerical_feature'])) to_pickle(train_full, 'train_full.csv') to_pickle(test_full, 'test_full.csv') # model lgb_clf = LightGBM() lgb_clf.fit(train_full, clf=True) cat_clf = CatBoost() lgb_clf.fit(train_full, clf=True) train_nnd = pd.merge(train_full.nunique().reset_index(), train_full.isna().mean(axis=0).reset_index(), on=['index']).\ merge(train_full.dtypes.reset_index()).\
def _build_model(name):
    """Instantiate the model wrapper registered under ``name``."""
    if name == "LightGBM":
        return LightGBM()
    if name == "LinearRegression":
        return LinearRegressionWrapper()
    if name == "Lasso":
        return LassoWrapper()
    if name == "Ridge":
        return RidgeWrapper()
    if name == "ElasticNet":
        return ElasticNetWrapper()
    if name == "KernelRidge":
        return KernelRidgeWrapper()
    if name == "SVR":
        return SVRWrapper()
    if name == "XGBoost":
        return XGBoost()
    if name == "RandomForest":
        return RandomForestWrapper()
    if name == "GradientBoosting":
        return GradientBoostingRegressorWrapper()
    if name == "CatBoost":
        return CatBoost()
    # Without this, an unknown name either reused the model left over from a
    # previous iteration or failed with an unrelated UnboundLocalError.
    raise ValueError(f"Unknown model name: {name!r}")


def stacking(X_train_all, y_train_all, X_test):
    """Train base models and a meta model (stacking) and write submissions.

    Stage 1 trains every model listed in ``config['base_models']`` with
    seed-repeated stratified K-fold CV on ``log(y + 1)``, collecting
    out-of-fold predictions (averaged over seeds) and averaged test
    predictions. The plain average of the base models' test predictions is
    written to ``./data/output/sub_blend.csv``.

    Stage 2 concatenates the original features with the base-model
    predictions and trains ``config['meta_model']`` on them with the same CV
    scheme; its averaged test prediction is written to a timestamped CSV.

    Relies on module-level ``config``, ``params``, ``SEED``, ``SK_NUM``,
    ``BASE_FOLDS``, ``META_FOLDS``, ``target_name`` and ``now``.

    Args:
        X_train_all: Training features (indexed positionally via ``.iloc``).
        y_train_all: Training target (indexed positionally via ``.iloc``).
        X_test: Test features.

    Returns:
        None. Scores are printed and logged, submissions are written to disk.

    Raises:
        ValueError: If a base or meta model name is not supported.
    """
    # Quantile bins of the raw target, used only to stratify the folds.
    qcut_target = pd.qcut(y_train_all, SK_NUM, labels=False)
    print(qcut_target)

    # Transform the target with log(y + 1) before training (np.log1p() also works).
    y_train_all = np.log(y_train_all + 1)

    # ---- Stage 1: base models -------------------------------------------
    base_models = config['base_models']

    # Empty frames with one row per sample. They reuse the feature frames'
    # own indexes so the later column-wise concat lines rows up even when
    # X_train_all / X_test do not carry a default 0..n-1 index.
    oof_df = pd.DataFrame(index=X_train_all.index)  # becomes meta X_train
    y_preds_df = pd.DataFrame(index=X_test.index)  # becomes meta X_test

    # K-fold training per base model.
    for name, json_name in base_models.items():
        # Context manager so the config file handle is closed promptly.
        with open(f"./configs/{json_name}") as config_file:
            one_config = json.load(config_file)

        oof = np.zeros((X_train_all.shape[0], 1))
        y_preds = []
        scores = []
        for seed in SEED:
            kf = StratifiedKFold(n_splits=BASE_FOLDS,
                                 shuffle=True,
                                 random_state=seed)
            for train_index, valid_index in kf.split(X_train_all,
                                                     qcut_target):
                X_train = X_train_all.iloc[train_index, :]
                X_valid = X_train_all.iloc[valid_index, :]
                y_train = y_train_all.iloc[train_index]
                y_valid = y_train_all.iloc[valid_index]

                model = _build_model(name)
                y_pred, y_valid_pred, _ = model.train_and_predict(
                    X_train, X_valid, y_train, y_valid, X_test,
                    one_config["params"])

                # Every row is validated once per seed, so dividing by the
                # number of seeds yields the seed-averaged OOF prediction.
                oof[valid_index, :] += y_valid_pred.reshape(
                    len(y_valid_pred), 1) / len(SEED)
                y_preds.append(y_pred)

                # Fold score.
                rmse_valid = evaluate_score(y_valid, y_valid_pred,
                                            config['loss'])
                logging.debug(f"\tmodel:{name}, score: {rmse_valid}")
                scores.append(rmse_valid)

        score = sum(scores) / len(scores)
        print('===CV scores===')
        print(f"\tmodel: {name}, scores: {scores}")
        print(f"\tmodel: {name}, score: {score}")
        logging.debug('===CV scores===')
        logging.debug(f"\tmodel: {name}, scores: {scores}")
        logging.debug(f"\tmodel: {name}, score: {score}")

        oof_df[name] = oof.ravel()
        y_preds_df[name] = sum(y_preds) / len(y_preds)

    # Blend submission: plain mean of the base models' test predictions.
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = y_preds_df.mean(axis=1)
    # Undo the log transform with exp(y) - 1 (np.expm1() also works).
    y_sub = np.exp(y_sub) - 1
    # Assign positionally: `sub` has its own index from the feather file.
    sub[target_name] = np.asarray(y_sub)
    sub.to_csv('./data/output/sub_blend.csv', index=False)

    # ---- Stage 2: meta model --------------------------------------------
    # The raw features are used alongside the base predictions
    # (use_features_in_secondary = True).
    oof_df = pd.concat([X_train_all, oof_df], axis=1)
    y_preds_df = pd.concat([X_test, y_preds_df], axis=1)

    meta_name = config['meta_model']
    y_preds = []
    scores = []
    for seed in SEED:
        kf = StratifiedKFold(n_splits=META_FOLDS,
                             shuffle=True,
                             random_state=seed)
        for train_index, valid_index in kf.split(X_train_all, qcut_target):
            X_train = oof_df.iloc[train_index, :]
            X_valid = oof_df.iloc[valid_index, :]
            y_train = y_train_all.iloc[train_index]
            y_valid = y_train_all.iloc[valid_index]

            model = _build_model(meta_name)

            # Train and predict; y_preds_df plays the role of X_test.
            y_pred, y_valid_pred, _ = model.train_and_predict(
                X_train, X_valid, y_train, y_valid, y_preds_df, params)

            # Store the result.
            y_preds.append(y_pred)

            # Fold score.
            rmse_valid = evaluate_score(y_valid, y_valid_pred,
                                        config['loss'])
            logging.debug(f"\tscore: {rmse_valid}")
            scores.append(rmse_valid)

    score = sum(scores) / len(scores)
    print('===CV scores===')
    print(scores)
    print(score)
    logging.debug('===CV scores===')
    logging.debug(scores)
    logging.debug(score)

    # Final submission from the meta model.
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = sum(y_preds) / len(y_preds)
    # Undo the log transform with exp(y) - 1 (np.expm1() also works).
    y_sub = np.exp(y_sub) - 1
    sub[target_name] = y_sub
    sub.to_csv('./data/output/sub_{0}_{1:%Y%m%d%H%M%S}_{2}.csv'.format(
        config['model'], now, score),
               index=False)