Example #1
def hold_out_lgb_validation(X, y, params, eval_metric='mae', columns=None,
                            plot_feature_importance=False,
                            verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    columns = X.columns if columns is None else columns

    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                            'catboost_metric_name': 'MAE',
                            'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                                  'catboost_metric_name': 'MAE',
                                  'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                            'catboost_metric_name': 'MSE',
                            'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    result_dict = {}

    X_train, X_valid, y_train, y_valid = train_test_split(X[columns], y, test_size=0.1, random_state=42)
    # record per-iteration eval metrics so the training curve can be returned in result_dict
    eval_result = {}
    callbacks = [lgb.record_evaluation(eval_result)]
    model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
              verbose=verbose, early_stopping_rounds=early_stopping_rounds,
              callbacks=callbacks)

    y_pred_valid = model.predict(X_valid)

    if eval_metric != 'group_mae':
        score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
    else:
        score = metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type'])


    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
    else:
        feature_importance = None

    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO} HOLD_OUT score: {score:.4f}.'
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["model"] = model
    result_dict['y_pred_valid'] = pd.DataFrame(y_pred_valid, index=X_valid.index, columns=["scalar_coupling_constant"])
    result_dict['score'] = score
    result_dict["importance"] = feature_importance
    result_dict["eval_result"] = eval_result
    return result_dict
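
A minimal usage sketch for the helper above, assuming it is defined in the same module together with the imports and helpers it references (lgb, pandas, sklearn's metrics and train_test_split, and the group_mean_log_mae helper sketched after Example #3) and a LightGBM version that still accepts verbose/early_stopping_rounds in fit(). The synthetic data, the parameter values, and the DATA_VERSION/TRIAL_NO/send_message stand-ins are illustrative assumptions, not part of the original project.

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import train_test_split

DATA_VERSION, TRIAL_NO = "v1", "000"   # placeholder identifiers

def send_message(msg):                 # stand-in for the project's notification hook
    print(msg)

# Synthetic regression data, just to exercise the helper.
rng = np.random.RandomState(42)
X_demo = pd.DataFrame(rng.randn(1000, 5), columns=[f"f{i}" for i in range(5)])
y_demo = pd.Series(X_demo.sum(axis=1) + 0.1 * rng.randn(1000))

params = {"objective": "regression", "learning_rate": 0.1, "num_leaves": 31}
result = hold_out_lgb_validation(X_demo, y_demo, params,
                                 eval_metric="mae",
                                 verbose=100,
                                 early_stopping_rounds=50,
                                 n_estimators=500)
print(result["score"])
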
Example #2
def hold_out_lgb_validation(
    X_train,
    X_valid,
    y_train,
    y_valid,
    params,
    columns=None,
    plot_feature_importance=False,
    verbose=10000,
    early_stopping_rounds=200,
    n_estimators=50000,
):
    columns = X_train.columns if columns is None else columns

    result_dict = {}
    model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train,
              y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="auc",
              verbose=verbose,
              early_stopping_rounds=early_stopping_rounds)

    y_pred_valid = model.predict(X_valid)

    score = roc_auc_score(y_valid, y_pred_valid)

    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
    else:
        feature_importance = None

    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO} HOLD_OUT score: {score:.4f}.'
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["model"] = model
    result_dict['y_pred_valid'] = pd.DataFrame(
        y_pred_valid,
        index=X_valid.index,
        columns=["scalar_coupling_constant"])
    result_dict['score'] = score
    result_dict["importance"] = feature_importance

    return result_dict
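
Note that the function above scores ROC AUC on the hard 0/1 labels returned by model.predict. AUC is normally computed from predicted probabilities; a hedged variant of the two scoring lines inside the function, assuming a binary target, would be:

from sklearn.metrics import roc_auc_score

# Variant sketch: score on class-1 probabilities rather than hard labels.
y_pred_valid = model.predict_proba(X_valid)[:, 1]
score = roc_auc_score(y_valid, y_pred_valid)
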
Example #3
def train_model_regression(X,
                           X_test,
                           y,
                           params,
                           folds,
                           model_type='lgb',
                           eval_metric='mae',
                           columns=None,
                           plot_feature_importance=False,
                           model=None,
                           verbose=10000,
                           early_stopping_rounds=200,
                           n_estimators=50000,
                           mol_type=-1,
                           fold_group=None):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type

    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index,
                 valid_index) in enumerate(folds.split(X, groups=fold_group)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if isinstance(X, np.ndarray):
            # ndarray input has no named columns, so index rows directly
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
                valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params,
                                      n_estimators=n_estimators,
                                      n_jobs=-1,
                                      importance_type='gain')
            print(model)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            params["objective"] = "reg:linear"
            params["eval_metric"] = metrics_dict[eval_metric][
                'lgb_metric_name']
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            # use the estimator passed in via the `model` argument
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]
                ['catboost_metric_name'])
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](
                y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
        model_list += [model]

    if model_type == 'lgb' and plot_feature_importance:
        result_dict['importance'] = feature_importance

    prediction /= folds.n_splits
    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO}' + ' CV mean score: {0:.4f}, std: {1:.4f}.'.format(
            np.mean(scores), np.std(scores))
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)
        pass

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    # if model_type == 'lgb':
    #     if plot_feature_importance:
    #         feature_importance["importance"] /= folds.n_splits
    #         cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
    #             by="importance", ascending=False)[:50].index
    #
    #         best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
    #
    #         plt.figure(figsize=(16, 12));
    #         sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
    #         plt.title('LGB Features (avg over folds)');
    #         feature_importance.to_csv(log_path/f"importance_{mol_type}.csv")
    #

    return result_dict
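
The scoring dictionary above refers to a group_mean_log_mae helper that is not included in these excerpts. A sketch of the usual implementation of this group-wise metric (mean over groups of the log of each group's MAE, floored to avoid log(0)); the exact helper in the original project may differ:

import numpy as np
import pandas as pd

def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    """Mean over groups of log(per-group MAE), floored to avoid log(0)."""
    errors = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    maes = pd.Series(errors).groupby(np.asarray(groups)).mean()
    return np.log(maes.clip(lower=floor)).mean()
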
Example #4
        importance_path = log_path / f'importance_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.csv'
        result_dict["importance"].to_csv(importance_path, index=True)

    for t, s in zip(X['type'].unique(), score_list):
        print(f"type {t}, score: {s:0.5f}")

    if TRAIN_ALL_DATA or CV_FOLD:
        #########################################################################################################
        # create oof & submission file.
        sub = pd.read_csv(f'../input/sample_submission.csv')
        sub['scalar_coupling_constant'] = X_short_test['prediction']
        sub.to_csv(submit_path /
                   f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                   index=False)
        print(sub.head())
        send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

    if CV_FOLD:
        oof_log_mae = group_mean_log_mae(X_short['target'],
                                         X_short['oof'],
                                         X_short['type'],
                                         floor=1e-9)
        print(f"oof_log_mae: {oof_log_mae}")

        df_oof = pd.DataFrame(index=train.id)
        df_oof["scalar_coupling_constant"] = X_short['oof']
        df_oof.to_csv(submit_path /
                      f'oof_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                      index=True)
        send_message(
            f"finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}"
        )
Example #5
                                              verbose=500,
                                              early_stopping_rounds=200,
                                              n_estimators=15000,
                                              mol_type=t)
    X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb3['oof']
    X_short_test.loc[X_short_test['type'] == t,
                     'prediction'] = result_dict_lgb3['prediction']

    X_short.to_csv(submit_path / f"tmp_oof_{t}.csv")
    X_short_test.to_csv(submit_path / f"tmp_sub_{t}.csv")
    to_pickle(model_path / f"second_model_list_{DATA_VERSION}_{TRIAL_NO}.pkl",
              result_dict_lgb3["models"])

sub['scalar_coupling_constant'] = X_short_test['prediction']
sub.to_csv(submit_path / f'submission_t_{DATA_VERSION}_{TRIAL_NO}.csv',
           index=False)
print(sub.head())

oof_log_mae = group_mean_log_mae(X_short['target'],
                                 X_short['oof'],
                                 X_short['type'],
                                 floor=1e-9)
print(f"oof_log_mae: {oof_log_mae}")
print(f"finished. : {current_time()}")

df_oof = pd.DataFrame(index=train.id)
df_oof["scalar_coupling_constant"] = X_short['oof']
df_oof.to_csv(submit_path / f'oof_{DATA_VERSION}_{TRIAL_NO}.csv', index=True)

send_message(
    f"finish train_{DATA_VERSION}_{TRIAL_NO}, oof_log_mae: {oof_log_mae}")
Example #6
def train_main(seed, type_):
    print(f"==================== seed: {seed} ====================")
    params = { #'num_leaves': 128,
              'min_child_samples': 79,
              'objective': 'regression',
              'max_depth': -1, #9,
              'learning_rate': 0.2,
              "boosting_type": "gbdt",
              "subsample_freq": 1,
              "subsample": 0.9,
              "metric": 'mae',
              "verbosity": -1,
              'reg_alpha': 0.1,
              'reg_lambda': 0.3,
              'colsample_bytree': 1.0,
              'num_threads' : -1,
             }

    params["seed"] = seed
    params["bagging_seed"] = seed + 1
    params["feature_fraction_seed"] = seed + 2

    n_estimators = 5  #10000
    params["num_leaves"] = 256
    if DEBUG:
        n_estimators = 5

    X_short = pd.DataFrame({
        'ind': list(X.index),
        'type': X['type'].values,
        'oof': [0] * len(X),
        'target': y.values,
        'fc': y_fc.values
    })

    X_short_test = pd.DataFrame({
        'ind': list(X_test.index),
        'type': X_test['type'].values,
        'prediction': [0] * len(X_test)
    })

    print(f'{current_time()} Training of type {type_} / {X["type"].unique()}')
    X_t = X.loc[X['type'] == type_]
    X_test_t = X_test.loc[X_test['type'] == type_]
    y_fc_t = X_short.loc[X_short['type'] == type_, 'fc']
    y_t = X_short.loc[X_short['type'] == type_, 'target']
    mol_name_t = mol_name.loc[X['type'] == type_][
        X_t.index] if GROUP_K_FOLD else None
    print(
        f"X_t.shape: {X_t.shape}, X_test_t.shape: {X_test_t.shape}, y_t.shape: {y_t.shape}"
    )

    ########################################################################################################
    # fc
    print("=" * 30 + " fc " + "=" * 30)
    result_dict_lgb1 = train_model_regression(X=X_t,
                                              X_test=X_test_t,
                                              y=y_fc_t,
                                              params=params,
                                              folds=folds,
                                              model_type='lgb',
                                              eval_metric='group_mae',
                                              plot_feature_importance=False,
                                              verbose=1000,
                                              early_stopping_rounds=200,
                                              n_estimators=n_estimators,
                                              fold_group=mol_name.values)

    X['oof_fc'] = result_dict_lgb1['oof']
    X_test['oof_fc'] = result_dict_lgb1['prediction']

    to_pickle(
        submit_path /
        f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X['oof_fc'])
    to_pickle(
        submit_path /
        f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X_test['oof_fc'])
    to_pickle(
        model_path /
        f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        result_dict_lgb1["models"])

    #########################################################################################################
    # 2nd layer model
    params["seed"] = seed + 3
    params["bagging_seed"] = seed + 4
    params["feature_fraction_seed"] = seed + 5
    params["num_leaves"] = 256  # num_leaves_dict[t]
    start_time = current_time()
    bairitsu = 256 / params["num_leaves"]  # "bairitsu" (Japanese for ratio): scaling factor relative to 256 leaves
    n_estimators = 5  #int(15000 * bairitsu)

    if DEBUG:
        n_estimators = 5

    if TRAIN_ALL_DATA:
        print("============= 2nd layer TRIAN ALL DATA ================")
        result_dict = train_lgb_regression_alldata(
            X=X_t,
            X_test=X_test_t,
            y=y_t,
            params=params,
            eval_metric='group_mae',
            plot_feature_importance=True,
            verbose=5000,
            n_estimators=int(n_estimators * 1.6),
            mol_type=type_)

        X_short_test.loc[X_short_test['type'] == type_,
                         'prediction'] = result_dict['prediction']
        X_short_test.to_csv(
            submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")

    elif CV_FOLD:
        print("============= 2nd layer CV ================")
        result_dict = train_model_regression(X_t,
                                             X_test_t,
                                             y_t,
                                             params,
                                             folds,
                                             model_type='lgb',
                                             eval_metric='mae',
                                             columns=None,
                                             plot_feature_importance=True,
                                             model=None,
                                             verbose=1000,
                                             early_stopping_rounds=200,
                                             n_estimators=n_estimators,
                                             mol_type=-1,
                                             fold_group=mol_name_t)

        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]

        X_short.loc[X_short['type'] == type_, 'oof'] = result_dict['oof']
        X_short.to_csv(submit_path /
                       f"oof_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")

        X_short_test.loc[X_short_test['type'] == type_,
                         'prediction'] = result_dict['prediction']
        X_short_test.to_csv(
            submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")

    else:
        print("============= 2nd layer hold out ================")
        result_dict = hold_out_lgb_validation(X=X_t,
                                              y=y_t,
                                              params=params,
                                              eval_metric='mae',
                                              plot_feature_importance=True,
                                              verbose=5000,
                                              early_stopping_rounds=200,
                                              n_estimators=n_estimators)

        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]

        eval_result: list = result_dict["eval_result"]["valid_1"]["l1"]
        training_log_df: pd.DataFrame = pd.DataFrame(
            eval_result, index=np.arange(len(eval_result)) + 1)
        training_log_df.columns = ["l1"]
        training_log_df.index.name = "iter"
        training_log_df.to_csv(
            log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{type_}.csv")

        to_pickle(
            model_path /
            f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
            result_dict["model"])
    #
    #
    #     to_pickle(log_path / f"result_dict_{type_}_{seed}.pkl", result_dict)
    #     importance_path = log_path / f'importance_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv'
    #     result_dict["importance"].to_csv(importance_path, index=True)
    #
    # for type_, s in zip(X['type'].unique(), score_list):
    #     print(f"type {type_}, score: {s:0.5f}")

    if TRAIN_ALL_DATA or CV_FOLD:
        #########################################################################################################
        # create oof & submission file.
        sub = pd.read_csv(f'../input/sample_submission.csv')
        sub['scalar_coupling_constant'] = X_short_test['prediction']
        sub.to_csv(submit_path /
                   f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                   index=False)
        print(sub.head())
        send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

    if CV_FOLD:
        oof_log_mae = group_mean_log_mae(X_short['target'],
                                         X_short['oof'],
                                         X_short['type'],
                                         floor=1e-9)
        print(f"oof_log_mae: {oof_log_mae}")

        df_oof = pd.DataFrame(index=train.id)
        df_oof["scalar_coupling_constant"] = X_short['oof']
        df_oof.to_csv(submit_path /
                      f'oof_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                      index=True)
        send_message(
            f"finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}"
        )
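
The folds object passed to train_model_regression is built elsewhere in the project; given the fold_group=mol_name.values argument and the GROUP_K_FOLD flag used above, it is presumably a group-aware splitter. A sketch of how it could be set up, with the fold count and the flag value as assumptions:

from sklearn.model_selection import GroupKFold, KFold

GROUP_K_FOLD = True   # project flag seen in the examples; assumed True here
N_FOLDS = 5           # assumed fold count; not specified in the excerpts

# A group-aware split keeps all rows of a molecule in the same fold,
# consistent with fold_group=mol_name.values being passed to folds.split().
folds = (GroupKFold(n_splits=N_FOLDS)
         if GROUP_K_FOLD
         else KFold(n_splits=N_FOLDS, shuffle=True, random_state=42))
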
Example #7
    for t, s in zip(X_rgs['type'].unique(), score_list):
        print(f"type {t}, score: {s:0.5f}")

    sub = pd.read_csv(f'../input/sample_submission.csv')
    sub['scalar_coupling_constant'] = X_short_test['prediction']
    sub.to_csv(
        submit_path /
        f'submission_nn_{DATA_VERSION}_{TRIAL_NO}_{MODEL_TYPE}_{seed}.csv',
        index=False)
    print(sub.head())

    oof_log_mae = group_mean_log_mae(X_short['target'],
                                     X_short['oof'],
                                     X_short['type'],
                                     floor=1e-9)
    print(f"oof_log_mae: {oof_log_mae}")

    train_ids = pd.read_csv(f'../input/train.csv')["id"].values
    df_oof = pd.DataFrame(index=train_ids)
    df_oof["scalar_coupling_constant"] = X_short['oof']
    df_oof.to_csv(submit_path /
                  f'oof_nn_{DATA_VERSION}_{TRIAL_NO}_{MODEL_TYPE}_{seed}.csv',
                  index=True)

    if not DEBUG:
        send_message(
            f"{MODEL_TYPE}: finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}"
        )

print(f"finished. : {current_time()}")
Example #8
                eval_result, index=np.arange(len(eval_result)) + 1)
            training_log_df.columns = ["l1"]
            training_log_df.index.name = "iter"
            training_log_df.to_csv(
                log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{t}.csv")

            to_pickle(
                model_path /
                f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.pkl",
                result_dict["model"])
            result_dict['y_pred_valid'].to_csv(
                submit_path /
                f'holdout_pred_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.csv',
                index=True)
            to_pickle(
                log_path /
                f"result_dict_{t}_{params['num_leaves']}_{seed}.pkl",
                result_dict)

    if TRAIN_ALL_DATA:
        #########################################################################################################
        # create oof & submission file.
        sub = pd.read_csv(f'../input/sample_submission.csv')
        sub['scalar_coupling_constant'] = X_short_test['prediction']
        sub.to_csv(submit_path /
                   f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                   index=False)
        print(sub.head())
        send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

print(f"finished. : {current_time()}")