示例#1
0
def main():
    """Train models across multiple seeds, average OOF/test predictions,
    log CV scores (auc/acc/logloss) and write oof/sub CSV files.

    Relies on module-level globals: logger, options, config, feats,
    model_params, model_name, target_name, ID_name, config_filename,
    and the helpers load_datasets / load_target / stack_load_df /
    train_and_predict / evaluate_score.
    """
    logger.debug('config: {}'.format(options.config))
    logger.debug(feats)
    logger.debug(model_params)

    # Load the features/target specified in the config.
    X_train_all, X_test = load_datasets(feats)
    y_train_all = load_target(target_name)
    cols = X_train_all.columns  # original feature columns (pre-stacking)

    # Optionally append out-of-fold predictions from base models (stacking).
    if config.get("stacking"):
        oof_df, test_df = stack_load_df(config["stacking_name"])
        X_train_all = pd.concat([X_train_all, oof_df], axis=1)
        X_test = pd.concat([X_test, test_df], axis=1)

    # Rank-gauss scaling for non-tree models (and whenever oversampling is
    # configured). Only the original feature columns are scaled.
    if (model_name != "lightgbm") or ("sampling" in config):
        logger.debug("rank gauss")
        scaler = QuantileTransformer(n_quantiles=100,
                                     random_state=model_params["seed"],
                                     output_distribution="normal")
        all_df = pd.concat([X_train_all, X_test])
        all_df = all_df.fillna(all_df.median())  # impute missing values
        all_df[cols] = scaler.fit_transform(all_df[cols])
        X_train_all = all_df[:X_train_all.shape[0]].reset_index(drop=True)
        X_test = all_df[X_train_all.shape[0]:].reset_index(drop=True)

    logger.debug("X_train_all shape: {}".format(X_train_all.shape))
    print(X_train_all.info())

    # Accumulate seed-averaged OOF / test predictions for stability.
    class_cols = list(range(model_params["num_class"]))
    oof_df = pd.DataFrame(index=list(range(X_train_all.shape[0])),
                          columns=class_cols)
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    oof_df[class_cols] = 0
    sub[target_name] = 0
    for seed_num in range(config["seed_num"]):
        logger.debug(f"SEED: {seed_num}")
        one_oof_df, one_sub = train_and_predict(X_train_all,
                                                y_train_all,
                                                X_test,
                                                seed_num=seed_num)
        oof_df[class_cols] += one_oof_df[class_cols] / config["seed_num"]
        sub[target_name] += one_sub[target_name] / config["seed_num"]

    # OOF CV scores; binary metrics use the positive-class column (index 1).
    auc_score = evaluate_score(y_train_all.values, oof_df.values[:, 1], "auc")
    acc_score = evaluate_score(y_train_all.values,
                               oof_df.values.argmax(axis=1), "acc")
    logloss_score = evaluate_score(y_train_all.values, oof_df.values[:, 1],
                                   "logloss")
    logger.debug('=== OOF CV scores ===')
    logger.debug(
        f"\t auc:{auc_score}, acc: {acc_score}, logloss: {logloss_score}")

    sub = sub.rename(columns={ID_name: 'Id', target_name: "label"})
    oof_df.to_csv(f'./data/output/oof_{config_filename}.csv', index=False)
    sub.to_csv(f'./data/output/sub_{config_filename}.csv', index=False)
示例#2
0
def train_and_predict_linear(X_train_all, y_train_all, X_test):
    """Seeded K-fold regression training; writes a submission CSV.

    The target is trained on log(1 + y) and predictions are inverted
    with exp(y) - 1 before writing. Uses module-level globals: config,
    params, SEED, FOLDS, SK_NUM, now, target_name, plus the model
    wrapper classes and evaluate_score.
    """
    # Quantile-bin the continuous target so StratifiedKFold can stratify.
    qcut_target = pd.qcut(y_train_all, SK_NUM, labels=False)

    # Train on log(1 + y); equivalent to np.log1p().
    y_train_all = np.log(y_train_all + 1)

    # Config name -> wrapper class. A dict dispatch replaces the long
    # if/elif chain, which had no else branch: an unknown model name
    # previously raised NameError on an unbound `lr` (or silently reused
    # the previous fold's model).
    model_factories = {
        "LinearRegression": LinearRegressionWrapper,
        "Lasso": LassoWrapper,
        "Ridge": RidgeWrapper,
        "ElasticNet": ElasticNetWrapper,
        "KernelRidge": KernelRidgeWrapper,
        "SVR": SVRWrapper,
        "XGBoost": XGBoost,
        "RandomForest": RandomForestWrapper,
        "GradientBoosting": GradientBoostingRegressorWrapper,
        "CatBoost": CatBoost,
    }

    y_preds = []
    scores = []  # per-fold CV scores
    for seed in SEED:
        kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        for train_index, valid_index in kf.split(X_train_all, qcut_target):
            X_train, X_valid = (X_train_all.iloc[train_index, :],
                                X_train_all.iloc[valid_index, :])
            y_train, y_valid = (y_train_all.iloc[train_index],
                                y_train_all.iloc[valid_index])

            try:
                lr = model_factories[config['model']]()
            except KeyError:
                raise ValueError(
                    f"Unknown model name: {config['model']}") from None

            y_pred, y_valid_pred, m = lr.train_and_predict(
                X_train, X_valid, y_train, y_valid, X_test, params)

            # Keep the test prediction for averaging across folds/seeds.
            y_preds.append(y_pred)

            # Fold score on the (log-transformed) validation target.
            rmse_valid = evaluate_score(y_valid, y_valid_pred, config['loss'])
            logging.debug(f"\tscore: {rmse_valid}")
            scores.append(rmse_valid)

    score = sum(scores) / len(scores)
    print('===CV scores===')
    print(scores)
    print(score)
    logging.debug('===CV scores===')
    logging.debug(scores)
    logging.debug(score)

    # Build the submission from the fold-averaged test predictions.
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather(f'data/interim/test.feather')[ID_name])

    y_sub = sum(y_preds) / len(y_preds)

    # Invert the log(1 + y) transform; equivalent to np.expm1().
    y_sub = np.exp(y_sub) - 1

    sub[target_name] = y_sub

    sub.to_csv('./data/output/sub_{0}_{1:%Y%m%d%H%M%S}_{2}.csv'.format(
        config['model'], now, score),
               index=False)
示例#3
0
def train_and_predict(X_train_all, y_train_all, X_test, seed_num):
    """Stratified K-fold classification training for one seed offset.

    Returns (oof_df, sub): out-of-fold class probabilities aligned with
    X_train_all rows, and a submission frame carrying the fold-averaged
    positive-class probability for X_test.

    Uses module-level globals: seed, model_params, config, model_name,
    logger, ID_name, target_name, config_filename, plus classifier
    wrappers, samplers and evaluate_score.
    """
    model_params["seed"] = seed + seed_num  # derive a per-run seed
    oof_df = pd.DataFrame(
        index=list(range(X_train_all.shape[0])),
        columns=list(range(model_params["num_class"])))
    y_preds = []

    models = []
    auc_scores = []
    acc_scores = []
    logloss_scores = []

    kf = StratifiedKFold(n_splits=config["fold"],
                         shuffle=True,
                         random_state=model_params["seed"])
    for fold_num, (train_index,
                   valid_index) in enumerate(kf.split(X_train_all,
                                                      y_train_all)):
        logger.debug(f"FOLD: {fold_num}")
        X_train, X_valid = (X_train_all.iloc[train_index, :],
                            X_train_all.iloc[valid_index, :])
        y_train, y_valid = (y_train_all.iloc[train_index],
                            y_train_all.iloc[valid_index])

        # Select the classifier configured for this run.
        if model_name == "lightgbm":
            classifier = LightGBM()
        elif model_name == "nn":
            classifier = NeuralNet(seed_num, fold_num)
        elif model_name == "cnn1d":
            classifier = CNN1d(seed_num, fold_num)
        elif model_name == "logistic_regression":
            classifier = LogisticRegressionClassifier()
        else:
            # Was `raise Exception` with no message; keep the debug log
            # but raise a specific, informative error instead.
            logger.debug("No such model name")
            raise ValueError(f"No such model name: {model_name}")

        # Optional oversampling, applied to the training fold only so the
        # validation fold stays untouched.
        if "sampling" in config:
            if config["sampling"] == "SMOTE":
                X_train, y_train = SMOTE().fit_resample(X_train, y_train)
            elif config["sampling"] == "ADASYN":
                X_train, y_train = ADASYN().fit_resample(X_train, y_train)
            elif config["sampling"] == "RandomOverSampler":
                X_train, y_train = RandomOverSampler().fit_resample(
                    X_train, y_train)
            else:
                # Was a bare `raise`: with no active exception that itself
                # fails with "RuntimeError: No active exception to
                # re-raise". Raise explicitly.
                raise ValueError(
                    f"Unknown sampling method: {config['sampling']}")

        # train & inference
        y_pred, y_valid_pred, model = classifier.train_and_predict(
            X_train, X_valid, y_train, y_valid, X_test, model_params)

        # Save the fold results.
        y_preds.append(y_pred)
        oof_df.iloc[valid_index, :] = y_valid_pred
        models.append(model)

        # Fold scores; binary metrics use the positive-class column 1.
        auc_valid = evaluate_score(y_valid, y_valid_pred[:, 1], "auc")
        acc_valid = evaluate_score(y_valid, y_valid_pred.argmax(axis=1), "acc")
        logloss_valid = evaluate_score(y_valid, y_valid_pred[:, 1], "logloss")
        logger.debug(
            f"\t auc:{auc_valid}, acc: {acc_valid}, logloss: {logloss_valid}")
        auc_scores.append(auc_valid)
        acc_scores.append(acc_valid)
        logloss_scores.append(logloss_valid)

    # Feature importances (LightGBM only), averaged over folds and plotted.
    if model_name == "lightgbm":
        feature_imp_np = np.zeros(X_train_all.shape[1])
        for model in models:
            feature_imp_np += model.feature_importance() / len(models)
        feature_imp = pd.DataFrame(sorted(
            zip(feature_imp_np, X_train_all.columns)),
                                   columns=['Value', 'Feature'])
        logger.debug(feature_imp)
        plt.figure(figsize=(20, 10))
        sns.barplot(x="Value",
                    y="Feature",
                    data=feature_imp.sort_values(by="Value", ascending=False))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        plt.savefig(f'./logs/plots/features_{config_filename}.png')

    # Mean CV scores across folds.
    auc_score = sum(auc_scores) / len(auc_scores)
    acc_score = sum(acc_scores) / len(acc_scores)
    logloss_score = sum(logloss_scores) / len(logloss_scores)
    logger.debug('=== CV scores ===')
    logger.debug(
        f"\t auc:{auc_score}, acc: {acc_score}, logloss: {logloss_score}")

    # Build the submission: fold-averaged test predictions, keeping only
    # the positive-class probability.
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = sum(y_preds) / len(y_preds)
    sub[target_name] = y_sub[:, 1]

    return oof_df, sub
示例#4
0
def _make_regressor(name):
    """Return a fresh regressor wrapper instance for a config name.

    Raises ValueError for an unknown name. (The previous inline if/elif
    chains had no else branch: an unknown name raised NameError on the
    first iteration or silently reused the prior iteration's model.)
    """
    factories = {
        "LightGBM": LightGBM,
        "LinearRegression": LinearRegressionWrapper,
        "Lasso": LassoWrapper,
        "Ridge": RidgeWrapper,
        "ElasticNet": ElasticNetWrapper,
        "KernelRidge": KernelRidgeWrapper,
        "SVR": SVRWrapper,
        "XGBoost": XGBoost,
        "RandomForest": RandomForestWrapper,
        "GradientBoosting": GradientBoostingRegressorWrapper,
        "CatBoost": CatBoost,
    }
    try:
        return factories[name]()
    except KeyError:
        raise ValueError(f"Unknown model name: {name}") from None


def stacking(X_train_all, y_train_all, X_test):
    """Two-level stacking: base models produce OOF predictions, then a
    meta model is trained on the OOF columns appended to the raw
    features (use_features_in_secondary).

    Writes a blended submission (sub_blend.csv) from the base-model
    average, and a timestamped submission from the meta model. Uses
    module-level globals: config, params, SEED, SK_NUM, BASE_FOLDS,
    META_FOLDS, now, target_name, and evaluate_score.
    """
    # Quantile-bin the continuous target so StratifiedKFold can stratify.
    qcut_target = pd.qcut(y_train_all, SK_NUM, labels=False)
    print(qcut_target)

    # Train on log(1 + y); predictions are inverted with exp(y) - 1.
    y_train_all = np.log(y_train_all + 1)

    # --- Level 1: base models -----------------------------------------
    base_models = config['base_models']
    # Row-aligned empty frames; one column per base model.
    oof_df = pd.DataFrame(
        index=list(range(X_train_all.shape[0])))  # becomes meta X_train
    y_preds_df = pd.DataFrame(
        index=list(range(X_test.shape[0])))  # becomes meta X_test

    # K-fold each base model (over every seed) to build OOF columns.
    for name, json_name in base_models.items():
        # `with` closes the config file (was a leaked open handle).
        with open(f"./configs/{json_name}") as config_file:
            one_config = json.load(config_file)

        oof = np.zeros((X_train_all.shape[0], 1))
        y_preds = []
        scores = []
        for seed in SEED:
            kf = StratifiedKFold(n_splits=BASE_FOLDS,
                                 shuffle=True,
                                 random_state=seed)
            for train_index, valid_index in kf.split(X_train_all, qcut_target):
                X_train, X_valid = (X_train_all.iloc[train_index, :],
                                    X_train_all.iloc[valid_index, :])
                y_train, y_valid = (y_train_all.iloc[train_index],
                                    y_train_all.iloc[valid_index])
                model = _make_regressor(name)

                y_pred, y_valid_pred, m = model.train_and_predict(
                    X_train, X_valid, y_train, y_valid, X_test,
                    one_config["params"])

                # Each validation row is hit once per seed; average them.
                oof[valid_index, :] += y_valid_pred.reshape(
                    len(y_valid_pred), 1) / len(SEED)
                y_preds.append(y_pred)

                # Fold score on the (log-transformed) validation target.
                rmse_valid = evaluate_score(y_valid, y_valid_pred,
                                            config['loss'])
                logging.debug(f"\tmodel:{name}, score: {rmse_valid}")
                scores.append(rmse_valid)

        score = sum(scores) / len(scores)
        print('===CV scores===')
        print(f"\tmodel: {name}, scores: {scores}")
        print(f"\tmodel: {name}, score: {score}")
        logging.debug('===CV scores===')
        logging.debug(f"\tmodel: {name}, scores: {scores}")
        logging.debug(f"\tmodel: {name}, score: {score}")

        oof_df[name] = oof
        y_preds_df[name] = sum(y_preds) / len(y_preds)

    # Blended submission: simple average of all base-model predictions.
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather(f'data/interim/test.feather')[ID_name])
    y_sub = y_preds_df.mean(axis=1)
    y_sub = np.exp(y_sub) - 1  # invert the log(1 + y) transform
    sub[target_name] = y_sub
    sub.to_csv('./data/output/sub_blend.csv', index=False)

    # --- Level 2: meta model ------------------------------------------
    # use_features_in_secondary = True: append OOF columns to raw features.
    oof_df = pd.concat([X_train_all, oof_df], axis=1)
    y_preds_df = pd.concat([X_test, y_preds_df], axis=1)

    y_preds = []
    scores = []
    for seed in SEED:
        kf = StratifiedKFold(n_splits=META_FOLDS,
                             shuffle=True,
                             random_state=seed)
        # Split on X_train_all rows; oof_df is row-aligned with it.
        for train_index, valid_index in kf.split(X_train_all, qcut_target):
            X_train, X_valid = (oof_df.iloc[train_index, :],
                                oof_df.iloc[valid_index, :])
            y_train, y_valid = (y_train_all.iloc[train_index],
                                y_train_all.iloc[valid_index])
            model = _make_regressor(config['meta_model'])

            # Train and predict; y_preds_df plays the role of X_test here.
            y_pred, y_valid_pred, m = model.train_and_predict(
                X_train, X_valid, y_train, y_valid, y_preds_df, params)

            y_preds.append(y_pred)

            rmse_valid = evaluate_score(y_valid, y_valid_pred, config['loss'])
            logging.debug(f"\tscore: {rmse_valid}")
            scores.append(rmse_valid)

    score = sum(scores) / len(scores)
    print('===CV scores===')
    print(scores)
    print(score)
    logging.debug('===CV scores===')
    logging.debug(scores)
    logging.debug(score)

    # Meta-model submission, timestamped and tagged with the CV score.
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather(f'data/interim/test.feather')[ID_name])
    y_sub = sum(y_preds) / len(y_preds)
    y_sub = np.exp(y_sub) - 1  # invert the log(1 + y) transform
    sub[target_name] = y_sub
    sub.to_csv('./data/output/sub_{0}_{1:%Y%m%d%H%M%S}_{2}.csv'.format(
        config['model'], now, score),
               index=False)