Example #1
    def objective(params):
        # Hyperopt-style objective: write the sampled params into conf.model
        # and return the negated CV score (hyperopt minimizes the loss)
        conf.model = {
            **conf.model, "clf_params": {
                "learning_rate": float(params["learning_rate"]),
                "max_bin": int(params["max_bin"]),
                "num_leaves": int(params["num_leaves"]),
                "min_child_samples": int(params["min_child_samples"]),
                "colsample_bytree": float(params["colsample_bytree"]),
                "subsample": float(params["subsample"]),
                "min_gain_to_split": float(params["min_gain_to_split"]),
                "reg_alpha": float(params["reg_alpha"]),
                "reg_lambda": float(params["reg_lambda"]),
                "boosting_type": "dart",
                "n_estimators": 10000,
                "max_depth": -1,
                "nthread": -1,
                "scale_pos_weight": 1,
                "is_unbalance": False,
                "silent": -1,
                "verbose": -1,
                "random_state": 0
            }
        }
        pprint(conf.model.clf_params)

        model = LightGBM()
        score = model.train_and_predict_kfold(train_df, test_df, feats,
                                              'TARGET', conf)
        return {'loss': -1.0 * score, 'status': STATUS_OK}
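
A minimal sketch of how an objective like this is typically driven with hyperopt; the search-space bounds and max_evals below are assumptions, not part of the original snippet:

from hyperopt import Trials, fmin, hp, tpe

# Hypothetical search space matching the keys read inside objective()
space = {
    "learning_rate": hp.loguniform("learning_rate", -5, -1),
    "max_bin": hp.quniform("max_bin", 64, 512, 1),
    "num_leaves": hp.quniform("num_leaves", 16, 256, 1),
    "min_child_samples": hp.quniform("min_child_samples", 5, 100, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "min_gain_to_split": hp.uniform("min_gain_to_split", 0.0, 1.0),
    "reg_alpha": hp.loguniform("reg_alpha", -8, 2),
    "reg_lambda": hp.loguniform("reg_lambda", -8, 2),
}
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=50, trials=trials)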
Example #2
def train_and_predict_lightgbm(X_train_all, y_train_all, X_test):
    qcut_target = pd.qcut(y_train_all, SK_NUM, labels=False)  # bin the continuous target so StratifiedKFold can stratify a regression problem

    # Transform y_train with log(y + 1) before training
    y_train_all = np.log(y_train_all + 1)  # np.log1p() also works

    y_preds = []
    models = []
    for seed in SEED:
        kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        for train_index, valid_index in kf.split(X_train_all, qcut_target):
            X_train, X_valid = (X_train_all.iloc[train_index, :],
                                X_train_all.iloc[valid_index, :])
            y_train, y_valid = (y_train_all.iloc[train_index],
                                y_train_all.iloc[valid_index])

            # Run LightGBM
            lgbm = LightGBM()
            y_pred, y_valid_pred, model = lgbm.train_and_predict(
                X_train, X_valid, y_train, y_valid, X_test, params)

            # Save the results
            y_preds.append(y_pred)
            models.append(model)

            # Score
            log_best(model, config['loss'])

    # CV scores
    scores = [m.best_score['valid_0'][config['loss']] for m in models]
    score = sum(scores) / len(scores)
    print('===CV scores===')
    print(scores)
    print(score)
    logging.debug('===CV scores===')
    logging.debug(scores)
    logging.debug(score)

    # Build the submission file
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])

    y_sub = sum(y_preds) / len(y_preds)

    # Finally, invert the prediction with exp(y) - 1
    y_sub = np.exp(y_sub) - 1  # np.expm1() also works

    sub[target_name] = y_sub

    sub.to_csv('./data/output/sub_{0}_{1:%Y%m%d%H%M%S}_{2}.csv'.format(
        config['model'], now, score), index=False)
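
The pd.qcut line above makes StratifiedKFold usable on a continuous regression target. A self-contained illustration of that trick (toy data; every name here is illustrative):

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

y = pd.Series(np.random.default_rng(0).lognormal(size=1000))
bins = pd.qcut(y, 10, labels=False)  # decile bins of the target
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold, (tr_idx, va_idx) in enumerate(skf.split(np.zeros((len(y), 1)), bins)):
    # each validation fold now covers the full range of y
    print(fold, round(y.iloc[va_idx].mean(), 3))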
Example #3
def classifier_lgbm(features, config, train_mode, **kwargs):
    if train_mode:
        features_train, features_valid = features
        if config.random_search.light_gbm.n_runs:
            transformer = RandomSearchOptimizer(
                LightGBM,
                config.light_gbm,
                train_input_keys=[],
                valid_input_keys=['X_valid', 'y_valid'],
                score_func=roc_auc_score,
                maximize=True,
                n_runs=config.random_search.light_gbm.n_runs,
                callbacks=[
                    NeptuneMonitor(
                        **config.random_search.light_gbm.callbacks.neptune_monitor),
                    SaveResults(
                        **config.random_search.light_gbm.callbacks.save_results),
                ])
        else:
            transformer = LightGBM(**config.light_gbm)

        light_gbm = Step(name='light_gbm',
                         transformer=transformer,
                         input_data=['input'],
                         input_steps=[features_train, features_valid],
                         adapter=Adapter({
                             'X': E(features_train.name, 'features'),
                             'y': E('input', 'y'),
                             'feature_names': E(features_train.name, 'feature_names'),
                             'categorical_features': E(features_train.name, 'categorical_features'),
                             'X_valid': E(features_valid.name, 'features'),
                             'y_valid': E('input', 'y_valid'),
                         }),
                         cache_dirpath=config.env.cache_dirpath,
                         **kwargs)
    else:
        light_gbm = Step(name='light_gbm',
                         transformer=LightGBM(**config.light_gbm),
                         input_steps=[features],
                         adapter=Adapter({'X': E(features.name, 'features')}),
                         cache_dirpath=config.env.cache_dirpath,
                         **kwargs)
    return light_gbm
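
A hedged usage sketch of the two call paths (Step, Adapter, and E come from the steppy-style pipeline this snippet is written for; the step variables below are assumptions):

# Training: pass the train/valid feature steps as a pair
light_gbm_train = classifier_lgbm((features_train, features_valid),
                                  config, train_mode=True)

# Inference: a single feature step is enough
light_gbm_infer = classifier_lgbm(features, config, train_mode=False)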
Example #4
def model_train(df, train_target, params):
    lgbm = LightGBM(X=df, y=train_target, test_size=0.25, params=params,
                    mlflow_tracking_server_uri=MLFLOW_TRACKING_SERVER_URI,
                    experiment_name=MLFLOW_EXPERIMENT_NAME)
    logger.info("Training......")
    lgbm.train()
    logger.info("Evaluating....")
    lgbm.evaluate()
    logger.info("Saving model.....")
    lgbm.save_model(MODEL_FILE_PATH)
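
A minimal call sketch; the parameter keys and the two input objects are illustrative assumptions (LightGBM here is the project's wrapper class, which logs runs to the configured MLflow tracking server):

params = {"objective": "binary", "learning_rate": 0.05, "num_leaves": 31}
model_train(features_df, target_series, params)  # features_df / target_series are hypothetical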
Example #5
def train_and_predict(X_train_all, y_train_all, X_test, seed_num):
    model_params["seed"] = seed + seed_num
    oof_df = pd.DataFrame(
        index=[i for i in range(X_train_all.shape[0])],
        columns=[i for i in range(model_params["num_class"])])
    y_preds = []

    models = []
    auc_scores = []
    acc_scores = []
    logloss_scores = []

    kf = StratifiedKFold(n_splits=config["fold"],
                         shuffle=True,
                         random_state=model_params["seed"])
    for fold_num, (train_index,
                   valid_index) in enumerate(kf.split(X_train_all,
                                                      y_train_all)):
        logger.debug(f"FOLD: {fold_num}")
        X_train, X_valid = (X_train_all.iloc[train_index, :],
                            X_train_all.iloc[valid_index, :])
        y_train, y_valid = (y_train_all.iloc[train_index],
                            y_train_all.iloc[valid_index])

        # train & inference
        if model_name == "lightgbm":
            classifier = LightGBM()
        elif model_name == "nn":
            classifier = NeuralNet(seed_num, fold_num)
        elif model_name == "cnn1d":
            classifier = CNN1d(seed_num, fold_num)
        elif model_name == "logistic_regression":
            classifier = LogisticRegressionClassifier()
        else:
            logger.debug("No such model name")
            raise Exception

        if "sampling" in config:
            if config["sampling"] == "SMOTE":
                X_train, y_train = SMOTE().fit_resample(X_train, y_train)
            elif config["sampling"] == "ADASYN":
                X_train, y_train = ADASYN().fit_resample(X_train, y_train)
            elif config["sampling"] == "RandomOverSampler":
                X_train, y_train = RandomOverSampler().fit_resample(
                    X_train, y_train)
            else:
                raise ValueError(f"Unknown sampling method: {config['sampling']}")

        y_pred, y_valid_pred, model = classifier.train_and_predict(
            X_train, X_valid, y_train, y_valid, X_test, model_params)

        # Save the results
        y_preds.append(y_pred)
        oof_df.iloc[valid_index, :] = y_valid_pred
        models.append(model)

        # Scores
        auc_valid = evaluate_score(y_valid, y_valid_pred[:, 1], "auc")
        acc_valid = evaluate_score(y_valid, y_valid_pred.argmax(axis=1), "acc")
        logloss_valid = evaluate_score(y_valid, y_valid_pred[:, 1], "logloss")
        logger.debug(
            f"\t auc:{auc_valid}, acc: {acc_valid}, logloss: {logloss_valid}")
        auc_scores.append(auc_valid)
        acc_scores.append(acc_valid)
        logloss_scores.append(logloss_valid)

    # For LightGBM, output the feature importances
    if model_name == "lightgbm":
        feature_imp_np = np.zeros(X_train_all.shape[1])
        for model in models:
            feature_imp_np += model.feature_importance() / len(models)
        feature_imp = pd.DataFrame(sorted(
            zip(feature_imp_np, X_train_all.columns)),
                                   columns=['Value', 'Feature'])
        #print(feature_imp)
        logger.debug(feature_imp)
        plt.figure(figsize=(20, 10))
        sns.barplot(x="Value",
                    y="Feature",
                    data=feature_imp.sort_values(by="Value", ascending=False))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        plt.savefig(f'./logs/plots/features_{config_filename}.png')

    # CV scores
    auc_score = sum(auc_scores) / len(auc_scores)
    acc_score = sum(acc_scores) / len(acc_scores)
    logloss_score = sum(logloss_scores) / len(logloss_scores)
    logger.debug('=== CV scores ===')
    logger.debug(
        f"\t auc:{auc_score}, acc: {acc_score}, logloss: {logloss_score}")

    # Build the submission file
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = sum(y_preds) / len(y_preds)
    sub[target_name] = y_sub[:, 1]
    ''' Output class indices instead of probabilities
    if y_sub.shape[1] > 1:
        y_sub = np.argmax(y_sub, axis=1)
    '''

    return oof_df, sub
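
The sampling branch above relies on imbalanced-learn. A self-contained sketch of the same resampling call (toy data; fit_resample is imblearn's real API):

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)
print(Counter(y), Counter(y_res))  # the minority class is oversampled to parity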
Example #6
File: MAIN.py Project: fangduan/GStore
sub_prediction = np.zeros(test_X.shape[0])
oof_prediction = np.zeros(train_X.shape[0])  # out-of-fold predictions, filled per fold below
oof_scores = []

# LightGBM
lgb_params = {"objective" : "regression",
              "boosting_type" : "dart",
              "metric" : "rmse",
              "num_leaves" : 15,
              "learning_rate" : 0.1,
              "max_depth" : 7,
              "bagging_fraction" : 0.9,
              "feature_fraction" : 0.9,
              "number_boosting_rounds" : 100,
              "early_stopping_rounds" : 10}

light_gbm = LightGBM(**lgb_params)

for fold_, (trn_, val_) in enumerate(folds):
    trn_X, trn_y = train_X.iloc[trn_], TARGET[trn_]
    val_X, val_y = train_X.iloc[val_], TARGET[val_]

    light_gbm.fit(trn_X, trn_y, val_X, val_y)
    oof_prediction[val_] = light_gbm.transform(val_X)['prediction']
    oof_prediction[oof_prediction < 0] = 0
    _preds = light_gbm.transform(test_X)['prediction']
    _preds[_preds < 0] = 0
    sub_prediction += np.expm1(_preds) / len(folds)
    oof_scores.append(mean_squared_error(TARGET[val_], oof_prediction[val_]) ** 0.5)
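
folds is not defined in this snippet; one plausible construction (an assumption, since the original project may split differently) is a materialized KFold:

from sklearn.model_selection import KFold

folds = list(KFold(n_splits=5, shuffle=True, random_state=42).split(train_X))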
Example #7
# external feature
gap_ext = GapFeatureEngineerExternal()
train_full = gap_ext.transform(train_all, train_all)
test_full = gap_ext.transform(test_all, train_all)
ordered_cols = (features['target_feature'] + features['id_feature'] +
                sorted(features['categorical_feature'] + features['numerical_feature']))
train_full = train_full.reindex(columns=ordered_cols)
test_full = test_full.reindex(columns=ordered_cols)
to_pickle(train_full, 'train_full.csv')
to_pickle(test_full, 'test_full.csv')

# model
lgb_clf = LightGBM()
lgb_clf.fit(train_full, clf=True)

cat_clf = CatBoost()
cat_clf.fit(train_full, clf=True)

train_nnd = pd.merge(train_full.nunique().reset_index(),
                     train_full.isna().mean(axis=0).reset_index(),
                     on=['index']).\
                     merge(train_full.dtypes.reset_index()).\
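
The chained summary above is cut off mid-expression; a complete sketch of the same per-column diagnostic (the column names are assumptions):

train_nnd = (train_full.nunique().rename('nunique').reset_index()
             .merge(train_full.isna().mean(axis=0).rename('nan_rate').reset_index(), on='index')
             .merge(train_full.dtypes.rename('dtype').reset_index(), on='index'))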
Example #8
def stacking(X_train_all, y_train_all, X_test):
    qcut_target = pd.qcut(y_train_all, SK_NUM, labels=False)

    print(qcut_target)
    # Transform y_train with log(y + 1) before training
    y_train_all = np.log(y_train_all + 1)  # np.log1p() also works

    # Train the base models
    base_models = config['base_models']
    # Empty dataframes with matching row counts
    oof_df = pd.DataFrame(index=range(X_train_all.shape[0]))  # becomes X_train for the meta model
    y_preds_df = pd.DataFrame(index=range(X_test.shape[0]))  # becomes X_test for the meta model

    # K-fold training for each base model
    for name, json_name in base_models.items():
        one_config = json.load(open(f"./configs/{json_name}"))

        oof = np.zeros((X_train_all.shape[0], 1))
        #y_preds = np.zeros((X_test.shape[0], 1))
        y_preds = []
        scores = []
        for seed in SEED:
            kf = StratifiedKFold(n_splits=BASE_FOLDS,
                                 shuffle=True,
                                 random_state=seed)
            for train_index, valid_index in kf.split(X_train_all, qcut_target):
                X_train, X_valid = (X_train_all.iloc[train_index, :],
                                    X_train_all.iloc[valid_index, :])
                y_train, y_valid = (y_train_all.iloc[train_index],
                                    y_train_all.iloc[valid_index])
                if name == "LightGBM":
                    model = LightGBM()
                elif name == "LinearRegression":
                    model = LinearRegressionWrapper()
                elif name == "Lasso":
                    model = LassoWrapper()
                elif name == "Ridge":
                    model = RidgeWrapper()
                elif name == "ElasticNet":
                    model = ElasticNetWrapper()
                elif name == "KernelRidge":
                    model = KernelRidgeWrapper()
                elif name == "SVR":
                    model = SVRWrapper()
                elif name == "XGBoost":
                    model = XGBoost()
                elif name == "RandomForest":
                    model = RandomForestWrapper()
                elif name == "GradientBoosting":
                    model = GradientBoostingRegressorWrapper()
                elif name == "CatBoost":
                    model = CatBoost()

                y_pred, y_valid_pred, m = model.train_and_predict(
                    X_train, X_valid, y_train, y_valid, X_test,
                    one_config["params"])

                oof[valid_index, :] += y_valid_pred.reshape(
                    len(y_valid_pred), 1) / len(SEED)
                #y_preds += (y_pred / FOLDS)
                y_preds.append(y_pred)
                # Score
                rmse_valid = evaluate_score(y_valid, y_valid_pred,
                                            config['loss'])
                logging.debug(f"\tmodel:{name}, score: {rmse_valid}")
                scores.append(rmse_valid)

        score = sum(scores) / len(scores)
        print('===CV scores===')
        print(f"\tmodel: {name}, scores: {scores}")
        print(f"\tmodel: {name}, score: {score}")
        logging.debug('===CV scores===')
        logging.debug(f"\tmodel: {name}, scores: {scores}")
        logging.debug(f"\tmodel: {name}, score: {score}")

        oof_df[name] = oof
        y_preds_df[name] = sum(y_preds) / len(y_preds)

    # Build the submission file
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])

    y_sub = y_preds_df.mean(axis=1)

    # Finally, invert the prediction with exp(y) - 1
    y_sub = np.exp(y_sub) - 1  # np.expm1() also works

    sub[target_name] = y_sub

    sub.to_csv('./data/output/sub_blend.csv', index=False)

    # Train the meta model
    # use_features_in_secondary = True
    oof_df = pd.concat([X_train_all, oof_df], axis=1)
    y_preds_df = pd.concat([X_test, y_preds_df], axis=1)

    y_preds = []
    scores = []
    for seed in SEED:
        kf = StratifiedKFold(n_splits=META_FOLDS,
                             shuffle=True,
                             random_state=seed)
        for train_index, valid_index in kf.split(X_train_all, qcut_target):
            X_train, X_valid = (oof_df.iloc[train_index, :],
                                oof_df.iloc[valid_index, :])
            y_train, y_valid = (y_train_all.iloc[train_index],
                                y_train_all.iloc[valid_index])
            name = config['meta_model']
            if name == "LightGBM":
                model = LightGBM()
            elif name == "LinearRegression":
                model = LinearRegressionWrapper()
            elif name == "Lasso":
                model = LassoWrapper()
            elif name == "Ridge":
                model = RidgeWrapper()
            elif name == "ElasticNet":
                model = ElasticNetWrapper()
            elif name == "KernelRidge":
                model = KernelRidgeWrapper()
            elif name == "SVR":
                model = SVRWrapper()
            elif name == "XGBoost":
                model = XGBoost()
            elif name == "RandomForest":
                model = RandomForestWrapper()
            elif name == "GradientBoosting":
                model = GradientBoostingRegressorWrapper()
            elif name == "CatBoost":
                model = CatBoost()

            # Train and infer; y_preds_df serves as X_test
            y_pred, y_valid_pred, m = model.train_and_predict(
                X_train, X_valid, y_train, y_valid, y_preds_df, params)

            # Save the results
            y_preds.append(y_pred)

            # Score
            rmse_valid = evaluate_score(y_valid, y_valid_pred, config['loss'])
            logging.debug(f"\tscore: {rmse_valid}")
            scores.append(rmse_valid)
    score = sum(scores) / len(scores)
    print('===CV scores===')
    print(scores)
    print(score)
    logging.debug('===CV scores===')
    logging.debug(scores)
    logging.debug(score)

    # Build the submission file
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])

    y_sub = sum(y_preds) / len(y_preds)

    # Finally, invert the prediction with exp(y) - 1
    y_sub = np.exp(y_sub) - 1  # np.expm1() also works

    sub[target_name] = y_sub

    sub.to_csv('./data/output/sub_{0}_{1:%Y%m%d%H%M%S}_{2}.csv'.format(
        config['model'], now, score), index=False)