def _dispatch_gbdt_class(algorithm_type: str, type_of_target: str): is_regression = type_of_target == 'continuous' if algorithm_type == 'lgbm': requires_lightgbm() from lightgbm import LGBMClassifier, LGBMRegressor return LGBMRegressor if is_regression else LGBMClassifier elif algorithm_type == 'cat': requires_catboost() from catboost import CatBoostClassifier, CatBoostRegressor return CatBoostRegressor if is_regression else CatBoostClassifier else: requires_xgboost() assert algorithm_type == 'xgb' from xgboost import XGBClassifier, XGBRegressor return XGBRegressor if is_regression else XGBClassifier
def adversarial_validate(X_train: pd.DataFrame, X_test: pd.DataFrame, importance_type: str = 'gain', estimator: Optional[BaseEstimator] = None, cat_cols = None, cv = None) -> ADVResult: """ Perform adversarial validation between X_train and X_test. Args: X_train: Training data X_test: Test data importance_type: The type of feature importance calculated. estimator: The custom estimator. If None, LGBMClassifier is automatically used. cv: Cross validation split. If ``None``, the first fold out of 5 fold is used as validation. Returns: Namedtuple with following members * auc: float, ROC AUC score of adversarial validation. * importance: pandas DataFrame, feature importance of adversarial model (order by importance) Example: >>> from sklearn.model_selection import train_test_split >>> from nyaggle.testing import make_regression_df >>> from nyaggle.validation import adversarial_validate >>> X, y = make_regression_df(n_samples=8) >>> X_train, X_test, y_train, y_test = train_test_split(X, y) >>> auc, importance = cross_validate(X_train, X_test) >>> >>> print(auc) 0.51078231 >>> importance.head() feature importance col_1 231.5827204 col_5 207.1837266 col_7 188.6920685 col_4 174.5668498 col_9 170.6438643 """ concat = pd.concat([X_train, X_test]).copy().reset_index(drop=True) y = np.array([1]*len(X_train) + [0]*len(X_test)) if estimator is None: requires_lightgbm() from lightgbm import LGBMClassifier estimator = LGBMClassifier(n_estimators=10000, objective='binary', importance_type=importance_type, random_state=0) else: assert is_instance(estimator, ('lightgbm.sklearn.LGBMModel', 'catboost.core.CatBoost')), \ 'Only CatBoostClassifier or LGBMClassifier is allowed' if cv is None: cv = Take(1, KFold(5, shuffle=True, random_state=0)) fit_params = {'verbose': -1} if cat_cols: fit_params['categorical_feature'] = cat_cols result = cross_validate(estimator, concat, y, None, cv=cv, eval_func=roc_auc_score, fit_params=fit_params, importance_type=importance_type) importance = pd.concat(result.importance) importance = importance.groupby('feature')['importance'].mean().reset_index() importance.sort_values(by='importance', ascending=False, inplace=True) importance.reset_index(drop=True, inplace=True) return ADVResult(result.scores[-1], importance)