Exemplo n.º 1
0
def find_hyperopt(df_train: pd.DataFrame, folds: pd.DataFrame) -> Dict:
    """Search boosting parameters with hyperopt's TPE, once per target column.

    The target columns come from ``get_cols(df_train)``. For each of them a
    fresh ``Trials`` object is created and ``fmin`` optimises whatever
    ``CV_score(...).fit(df=df_train, folds=folds)`` returns.

    Relies on the module-level names ``random_state`` and ``max_evals``.

    Args:
        df_train: Training frame passed to ``get_cols`` and ``CV_score.fit``.
        folds: Fold definition forwarded unchanged to ``CV_score.fit``.

    Returns:
        Dict mapping each target column to the parameter dict obtained by
        resolving the best trial against the search space (``space_eval``).
    """
    log = logging.getLogger(__name__)
    _, targets = get_cols(df_train)

    # Dimensions the optimiser is allowed to vary.
    tuned = {
        'num_leaves': scope.int(hp.quniform('num_leaves', 3, 100, 1)),
        'max_depth': scope.int(hp.quniform('max_depth', 10, 70, 1)),
        'min_data_in_leaf': scope.int(
            hp.quniform('min_data_in_leaf', 5, 150, 1)),
        'feature_fraction': hp.uniform('feature_fraction', 0.85, 1.0),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.85, 1.0),
        'min_sum_hessian_in_leaf': hp.loguniform(
            'min_sum_hessian_in_leaf', 0, 2.3),
        'lambda_l1': hp.uniform('lambda_l1', 1e-4, 2),
        'lambda_l2': hp.uniform('lambda_l2', 1e-4, 2),
    }
    # Settings that stay constant across every trial.
    pinned = {
        'seed': random_state,
        'feature_fraction_seed': random_state,
        'bagging_seed': random_state,
        'drop_seed': random_state,
        'data_random_seed': random_state,
        'verbose': -1,
        'bagging_freq': 5,
        'max_bin': 255,
        'learning_rate': 0.001,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
    }
    search_space = {**tuned, **pinned}

    def build_objective(target, features):
        """Return the one-argument objective that ``fmin`` calls."""

        def objective(params):
            scorer = CV_score(
                params=params,
                cols_all=features,
                col_target=target,
                num_boost_round=99999999,
                early_stopping_rounds=50,
                valid=True,
            )
            return scorer.fit(df=df_train, folds=folds)

        return objective

    best_by_target = {}
    for target in targets:
        # Feature columns are re-derived for each individual target.
        features, _ = get_cols(df_train, target)

        trials = Trials()
        winner = fmin(
            fn=build_objective(target, features),
            space=search_space,
            algo=tpe.suggest,
            trials=trials,
            max_evals=max_evals,
        )
        # fmin reports raw sampled values; space_eval turns them back into
        # a full parameter dict (including the pinned settings).
        best_by_target[target] = space_eval(search_space, winner)
    return best_by_target
Exemplo n.º 2
0
def cross_validation(df_train: pd.DataFrame, folds: pd.DataFrame, best=None):
    """Fit one ``CV_score`` object per target column and collect the results.

    Iterates over the module-level ``col_target``. Parameters for a target
    are taken from ``best[target]`` when ``best`` is truthy, otherwise the
    module-level ``params_lgb`` is used.

    Args:
        df_train: Training frame passed to ``get_cols`` and ``CV_score.fit``.
        folds: Fold definition forwarded unchanged to ``CV_score.fit``.
        best: Optional mapping of target column -> parameter dict.

    Returns:
        A two-element list ``[results, models]``. ``results`` maps each
        target to the value returned by ``fit`` and holds the mean of the
        per-target ``"score_max"`` values under the key ``'AVG'``;
        ``models`` maps each target to its ``CV_score`` instance.
    """
    logger = logging.getLogger(__name__)

    fit_results = {}
    fitted_models = {}
    mean_score = 0
    n_targets = len(col_target)

    for target in col_target:
        features, _ = get_cols(df_train, target)
        chosen_params = best[target] if best else params_lgb

        logger.info(f"CROSS VALIDATION COLUMN {target}")
        scorer = CV_score(
            params=chosen_params,
            cols_all=features,
            col_target=target,
            num_boost_round=999999,
            early_stopping_rounds=50,
            valid=True,
        )

        outcome = scorer.fit(df=df_train, folds=folds)
        fit_results[target] = outcome
        # Each target contributes an equal share to the average.
        mean_score += outcome["score_max"] / n_targets
        fitted_models[target] = scorer

    fit_results['AVG'] = mean_score
    return [fit_results, fitted_models]
Exemplo n.º 3
0
def cross_validation_test(df_test: pd.DataFrame, crossval_models) -> list:
    """Apply each per-target model to the test frame.

    Iterates over the module-level ``col_target`` and stores the value
    returned by ``crossval_models[col].transform_test(df_test)`` as a column
    named after the target.

    Args:
        df_test: Test frame; its index becomes the index of the output.
        crossval_models: Mapping of target column -> object exposing
            ``transform_test``.

    Returns:
        A one-element list containing the resulting DataFrame (the list
        wrapper is part of the existing contract and is preserved).
    """
    results = pd.DataFrame(index=df_test.index)
    for col in col_target:
        results[col] = crossval_models[col].transform_test(df_test)

    return [results]
Exemplo n.º 4
0
def cross_validation_train(df_train: pd.DataFrame, folds: pd.DataFrame,
                           crossval_models):
    """Apply each per-target model to the training frame.

    Iterates over the module-level ``col_target`` and stores the value
    returned by ``transform_train(df=df_train, folds=folds)`` as a column
    named after the target.

    Args:
        df_train: Training frame; its index becomes the index of the output.
        folds: Fold definition forwarded unchanged to ``transform_train``.
        crossval_models: Mapping of target column -> object exposing
            ``transform_train``.

    Returns:
        DataFrame indexed like ``df_train`` with one column per target.
    """
    logger = logging.getLogger(__name__)

    per_target = pd.DataFrame(index=df_train.index)
    for target in col_target:
        # NOTE(review): the returned columns are not used in this function.
        _features, _ = get_cols(df_train, target)
        model = crossval_models[target]
        per_target[target] = model.transform_train(df=df_train, folds=folds)
    return per_target
Exemplo n.º 5
0
def cross_validation_shap(df_train: pd.DataFrame, folds: pd.DataFrame,
                          crossval_models):
    """Collect the output of each per-target model's ``shap`` method.

    Iterates over the module-level ``col_target``, logs a start message for
    each target, and splits the two values returned by
    ``shap(df=df_train, folds=folds)`` into two separate mappings.

    Args:
        df_train: Training frame passed to ``get_cols`` and ``shap``.
        folds: Fold definition forwarded unchanged to ``shap``.
        crossval_models: Mapping of target column -> object exposing
            ``shap``.

    Returns:
        A two-element list: a dict of the first ``shap`` return value per
        target, and a dict of the second (feature statistics) per target.
    """
    logger = logging.getLogger(__name__)

    shap_by_target = {}
    stats_by_target = {}
    for target in col_target:
        # NOTE(review): the returned columns are not used in this function.
        _features, _ = get_cols(df_train, target)
        logger.info(f'SHAP {target} START')

        model = crossval_models[target]
        shap_part, stats_part = model.shap(df=df_train, folds=folds)
        shap_by_target[target] = shap_part
        stats_by_target[target] = stats_part
        # NOTE: a per-target `shap_summary_plot(df=df_train, folds=folds)`
        # call is intentionally not made here (it was disabled upstream).

    return [shap_by_target, stats_by_target]
Exemplo n.º 6
0
def target_enc(df_train: pd.DataFrame, df_test: pd.DataFrame):
    """Append selected target-encoded feature columns to both frames.

    For each target column reported by ``get_cols(df_train)``:

    1. A ``WOE`` encoder is fitted on the training features against that
       target, and the encoded columns get the suffix
       ``'_encoding_<target>'``.
    2. Every encoded column is scored as ``1 - 2 * roc_auc_score(target,
       column)``; columns scoring above 0.04 are kept.
    3. The kept columns are concatenated to ``df_train``; the test features
       are transformed with the same fitted encoder and the same columns are
       concatenated to ``df_test``.

    Args:
        df_train: Training frame containing feature and target columns.
        df_test: Test frame containing at least the feature columns.

    Returns:
        A two-element list ``[df_train, df_test]`` with the extra columns.
    """
    log = logging.getLogger(__name__)
    cols_all, cols_target = get_cols(df_train)

    enc = WOE()
    for col in cols_target:
        suffix = '_encoding_' + col
        target = df_train[col]

        train_enc = enc.fit_transform(df_train[cols_all], target)
        train_enc.columns = train_enc.columns.map(lambda x: x + suffix)

        # Score column by column; no need to transpose the whole frame.
        # NOTE(review): score > 0.04 is equivalent to AUC < 0.48, so columns
        # with AUC above 0.52 are never selected - confirm this one-sided
        # filter is intended.
        scores = train_enc.apply(
            lambda x: 1 - 2 * roc_auc_score(target, x), axis=0)
        merge_cols = scores[scores > 0.04].index
        # Report through the module logger, as the sibling functions do,
        # instead of writing to stdout.
        log.info("Encoded columns kept for %s: %s", col, list(merge_cols))

        df_train = pd.concat([df_train, train_enc[merge_cols]], axis=1)

        # Reuse the encoder fitted on the training data for this target.
        test_enc = enc.transform(df_test[cols_all])
        test_enc.columns = test_enc.columns.map(lambda x: x + suffix)
        df_test = pd.concat([df_test, test_enc[merge_cols]], axis=1)

    return [df_train, df_test]