# NOTE(review): the statements up to store_csv(...) are the tail of a
# build_audit(...) definition whose `def` line lies outside this chunk; the
# original indentation was lost in this paste — TODO restore it under the def.

# Spot-check the fitted pipeline on a fixed 5% sample before persisting it.
pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
store_pkl(pipeline, name)
adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
# Idiom fix: test truthiness directly instead of `with_proba == True`.
if with_proba:
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
	adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
store_csv(adjusted, name)

# Exercise build_audit for every classifier under test.
if "Audit" in datasets:
	build_audit(DecisionTreeClassifier(min_samples_leaf = 2, random_state = 13), "DecisionTreeAudit", compact = False)
	build_audit(BaggingClassifier(DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAudit")
	build_audit(DummyClassifier(strategy = "most_frequent"), "DummyAudit")
	build_audit(ExtraTreesClassifier(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAudit")
	build_audit(GBDTLRClassifier(RandomForestClassifier(n_estimators = 17, random_state = 13), LogisticRegression(multi_class = "ovr", solver = "liblinear")), "GBDTLRAudit")
	build_audit(GBDTLRClassifier(XGBClassifier(n_estimators = 17, random_state = 13), LogisticRegression(multi_class = "ovr", solver = "liblinear")), "XGBLRAudit")
	build_audit(GBDTLRClassifier(XGBRFClassifier(n_estimators = 7, max_depth = 6, random_state = 13), SGDClassifier(loss = "log", penalty = "elasticnet", random_state = 13)), "XGBRFLRAudit")
	build_audit(GradientBoostingClassifier(loss = "exponential", init = None, random_state = 13), "GradientBoostingAudit")
	build_audit(HistGradientBoostingClassifier(max_iter = 71, random_state = 13), "HistGradientBoostingAudit")
	build_audit(LGBMClassifier(objective = "binary", n_estimators = 37), "LGBMAudit", predict_params = {"num_iteration" : 17}, predict_proba_params = {"num_iteration" : 17}, num_iteration = 17)
	build_audit(LinearDiscriminantAnalysis(solver = "lsqr"), "LinearDiscriminantAnalysisAudit")
	build_audit(LinearSVC(penalty = "l1", dual = False, random_state = 13), "LinearSVCAudit", with_proba = False)
	build_audit(LogisticRegression(multi_class = "multinomial", solver = "newton-cg", max_iter = 500), "MultinomialLogisticRegressionAudit")
	build_audit(LogisticRegressionCV(cv = 3, multi_class = "ovr"), "OvRLogisticRegressionAudit")
	build_audit(BaggingClassifier(LogisticRegression(multi_class = "ovr", solver = "liblinear"), n_estimators = 3, max_features = 0.5, random_state = 13), "LogisticRegressionEnsembleAudit")
	build_audit(GaussianNB(), "NaiveBayesAudit")
	build_audit(OneVsRestClassifier(LogisticRegression(multi_class = "ovr", solver = "liblinear")), "OneVsRestAudit")
	build_audit(RandomForestClassifier(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAudit", flat = True)
	build_audit(RidgeClassifierCV(), "RidgeAudit", with_proba = False)
	build_audit(BaggingClassifier(RidgeClassifier(random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "RidgeEnsembleAudit")
	build_audit(StackingClassifier([("lda", LinearDiscriminantAnalysis(solver = "lsqr")), ("lr", LogisticRegression(multi_class = "ovr", solver = "liblinear"))], final_estimator = GradientBoostingClassifier(n_estimators = 11, random_state = 13)), "StackingEnsembleAudit")
	build_audit(SVC(gamma = "auto"), "SVCAudit", with_proba = False)
X_clouds_L = clouds_L.reshape((clouds_L.shape[0], -1)) X_clouds_scl_L = X_clouds_L / scale_rnd_L model_L_params = { 'colsample_bytree': 0.07, 'gamma': 0.005, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 500, 'objective': 'binary:logistic', 'random_state': 10, 'reg_alpha': 9, 'reg_lambda': 0, 'subsample': 0.6, 'verbosity': 0 } model_L = XGBRFClassifier(**model_L_params) model_L.fit(X_train_scl_L, y_train_L, eval_set=eval_set, eval_metric='auc', early_stopping_rounds=20, verbose=False) pred_L = model_L.predict(X_clouds_scl_L) clouds_R, _ = get_clouds(roiR) X_clouds_R = clouds_R.reshape((clouds_R.shape[0], -1)) X_clouds_scl_R = X_clouds_R / scale_rnd_R model_R_params = { 'colsample_bytree': 0.07, 'gamma': 0.005, 'max_depth': 3,
gamma = [i / 1000 for i in range(1, 11)] reg_alpha = [i for i in range(1, 11)] params_grid = [{'gamma': gamma, 'reg_alpha': reg_alpha, 'reg_lambda': [0]}] param_dists = params_grid n_iter = 25 early_stopping_rounds = 20 eval_set = [(X_eval_scl_L, y_eval_L)] fixed_params = { 'objective': 'binary:logistic', 'n_estimators': 500, 'random_state': 10, 'verbosity': 0 } estimator = XGBRFClassifier(**fixed_params, **var_params) crossval = RepeatedStratifiedKFold(n_splits=6, n_repeats=3, random_state=3) my_prec_scorer = make_scorer(precision_score, pos_label=class_names[0]) my_recall_scorer = make_scorer(recall_score, pos_label=class_names[0]) metrics = { 'accuracy': make_scorer(accuracy_score), 'precision': my_prec_scorer, 'recall': my_recall_scorer } print(f'# Tuning hyper-parameters') print() search_params = {
# Split features/target, ordinal-encode + impute, fit an XGBoost
# random-forest classifier with early stopping, then report validation AUC
# and set up permutation importance against the fitted model.
X = df.drop(columns=target)
train, val, test, y_train, y_val, y_test = train_val_test_split(X, y)

pipeline = make_pipeline(OrdinalEncoder(), SimpleImputer())
X_train = pipeline.fit_transform(train)
X_val = pipeline.transform(val)
X_test = pipeline.transform(test)

# Track AUC on both splits; early stopping watches the last entry (validation).
eval_set = [(X_train, y_train), (X_val, y_val)]
model = XGBRFClassifier(
    n_jobs=-1,
    n_estimators=5000,
    early_stopping_rounds=100,
    random_state=42,
    scale_pos_weight=15,  # compensates class imbalance
    learning_rate=.005,
    reg_lambda=.01,
    verbosity=1,
)
print('fitting...')
model.fit(X_train, y_train, eval_set=eval_set, eval_metric='auc', verbose=True)

# Positive-class probabilities for the ROC AUC report.
y_pred_proba = model.predict_proba(X_val)[:, 1]
print(f'Validation ROC AUC score: {roc_auc_score(y_val, y_pred_proba)}')

print('permuting...')
# cv='prefit': reuse the already-fitted model rather than refitting per fold.
permuter = PermutationImportance(model, cv='prefit', n_iter=5, scoring='roc_auc', random_state=42)
# NOTE(review): a stale commented-out GridSearchCV sweep over
# max_depth (3..9 step 2) and min_child_weight (1..5 step 2) with
# scoring='f1_micro', cv=5 was removed here; recover it from VCS if needed.

# Build the classifier. The two branches differed only in min_child_weight
# (5 for the non-one-hot encoding, 3 for one-hot), so the duplicated
# constructor and the useless `= None` pre-initialization are collapsed
# into a single call with a conditional argument.
xgbrf_classifier = XGBRFClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=7,
    min_child_weight=3 if arg.oneHot else 5,
)

print('[LOG] Fitting model...')
# First target column only; the remaining columns of Y are unused here.
xgbrf_classifier.fit(trainDataset.X, trainDataset.Y[:, 0])
print('[LOG] Fitting done!')

print('-- Model Report --')
# accuracy_score is symmetric, but pass (y_true, y_pred) in the
# conventional order for consistency with the rest of sklearn usage.
print('XGBoost train Accuracy: ' + str(accuracy_score(trainDataset.Y[:, 0], xgbrf_classifier.predict(trainDataset.X))))