# With early stopping. Use CV to find the best number of trees.
bst_cv = xgb.cv(
    params,
    d_train,
    num_boost_round=1000,
    nfold=5,
    verbose_eval=100,
    early_stopping_rounds=10,
    as_pandas=True,
)

bst_cv[['train-auc-mean', 'test-auc-mean']].plot()

bst = xgb.train(params, d_train, num_boost_round=60)
print(bst.eval(d_valid))

"""## Partial Dependency"""

def partial_dependency(bst, X, y, feature):
    """
    Calculate the dependency (or partial dependency) of a response variable
    on a predictor (or multiple predictors).

    1. Sample a grid of values of a predictor.
    2. For each value, replace every row of that predictor with this value
       and calculate the average prediction.
    """
    X_temp = X.copy()
    grid = np.linspace(start=np.percentile(X_temp[feature], 0.1),
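# An illustrative sketch, not the original function body (which continues beyond
# this excerpt): one way to implement the grid-and-average idea described in the
# docstring above. It assumes `numpy as np` and `xgboost as xgb` are imported, a
# trained Booster `bst`, and a pandas DataFrame `X`; the helper name
# `partial_dependency_sketch` and the 99.9th-percentile upper bound are assumptions.
def partial_dependency_sketch(bst, X, feature, n_points=50):
    X_temp = X.copy()
    # Grid of feature values between the 0.1th and 99.9th percentiles
    grid = np.linspace(np.percentile(X_temp[feature], 0.1),
                       np.percentile(X_temp[feature], 99.9),
                       num=n_points)
    y_pred = np.zeros(len(grid))
    for i, value in enumerate(grid):
        # Overwrite the whole column with one grid value, then average predictions
        X_temp[feature] = value
        y_pred[i] = np.mean(bst.predict(xgb.DMatrix(X_temp)))
    return grid, y_pred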
params['eta'] = best_params

# Let's have a look at the final list of tuned parameters.
print(params)

# Finally, we can use these tuned parameters in our XGBoost model. Early stopping
# of 10 means that if the model's performance doesn't improve within 10 rounds,
# training is stopped.

"""Way 1 - Train the final model (learning API)"""

# Training/learning the final model
final_xgb_model = xgb.train(
    params,
    dtrain,
    feval=custom_eval,
    num_boost_round=1000,
    maximize=True,
    evals=[(dvalid, "Validation")],
    early_stopping_rounds=10
)

"""Way 2 - Train the final model (scikit-learn API)"""

X_train, X_valid, y_train, y_valid = train_test_split(
    train_w2v, train["label"], test_size=0.3, random_state=0
)

final = xgb.XGBClassifier(**params, n_estimators=1000)
final.fit(X_train, y_train)

# Getting the F1 score on the validation set
y_pred = final.predict(X_valid)
f1Score = f1_score(y_valid, y_pred)
print(f1Score * 100)

"""After evaluating different models we chose XGBoost as our go-to model, and after hyperparameter tuning
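# A hedged sketch, not part of the original notebook: when early stopping
# triggers, the Booster returned by xgb.train records the best round in
# `best_iteration`. Scoring the validation DMatrix at that round (via
# `iteration_range`, available in recent XGBoost versions; older versions use
# `ntree_limit`) evaluates the best model rather than the final boosting round.
# Assumes `dvalid` is the validation DMatrix used in Way 1, a binary objective
# that outputs probabilities, and that `f1_score` is already imported.
best_round = final_xgb_model.best_iteration
valid_probs = final_xgb_model.predict(dvalid, iteration_range=(0, best_round + 1))
valid_preds = (valid_probs >= 0.5).astype(int)
print(f1_score(dvalid.get_label().astype(int), valid_preds) * 100)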
y_train = train_label.iloc[:train_rows].values
y_oof = np.zeros((x_train.shape[0]))
acc_scores = []

# random_state only applies with shuffle=True; without shuffle it was ignored,
# so it is dropped here to avoid a ValueError on recent scikit-learn versions.
kfold = StratifiedKFold(n_splits=N_SPLITS)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
}

for i, (train_index, valid_index) in enumerate(kfold.split(x_train, y_train)):
    X_A, X_B = x_train[train_index, :], x_train[valid_index, :]
    y_A, y_B = y_train[train_index], y_train[valid_index]

    dtrain = xgb.DMatrix(X_A, label=y_A)
    dvalid = xgb.DMatrix(X_B, label=y_B)
    evallist = [(dvalid, 'eval')]

    trainedModel = xgb.train(params,
                             dtrain=dtrain,
                             evals=evallist,
                             num_boost_round=100,
                             verbose_eval=False)

    # Out-of-fold predicted probabilities for this validation fold
    y_oof[valid_index] = trainedModel.predict(xgb.DMatrix(X_B))
    acc_scores.append(accuracy_score(y_B, np.where(y_oof[valid_index] > 0.5, 1, 0)))

print(f'Accuracy (manual): {np.mean(acc_scores)*100:.4f}% ({np.std(acc_scores)*100:.3f})')
# Accuracy (manual): 77.5640% (0.018), the same as 1-A ~ 1-C.

# !SECTION LEARNING API USING BINARY:LOGISTIC
"""MODEL 1-A ~ 1-D: ALL FOUR GENERATE IDENTICAL RESULTS!!"""

# SECTION MODEL 1-E MULTI:SOFTPROB
# Construct binary classes using multi:softprob (similar to softmax)
import xgboost as xgb

x_train = all_df.drop('Survived', axis=1).iloc[:train_rows].values
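# An illustrative sketch, not the original MODEL 1-E cell (which continues beyond
# this excerpt): with objective 'multi:softprob' and num_class=2, predict()
# returns one probability column per class, and the second column plays the same
# role as the binary:logistic probability. For brevity it reuses `dtrain`, `X_B`
# and `y_B` from the last fold of the loop above; `params_softprob` and
# `model_sp` are names introduced here.
params_softprob = {
    'objective': 'multi:softprob',
    'num_class': 2,
    'eval_metric': 'mlogloss'
}
model_sp = xgb.train(params_softprob, dtrain, num_boost_round=100)
proba = model_sp.predict(xgb.DMatrix(X_B))   # shape: (n_rows, 2)
pred_labels = np.argmax(proba, axis=1)       # same as (proba[:, 1] > 0.5)
print(f'Accuracy (multi:softprob sketch): {accuracy_score(y_B, pred_labels)*100:.4f}%')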