Example #1
# With early stopping. Use CV to find the best number of trees

bst_cv = xgb.cv(
    params,
    d_train,
    num_boost_round=1000,
    nfold=5,
    verbose_eval=100,
    early_stopping_rounds=10,
    as_pandas=True,
)

bst_cv[['train-auc-mean', 'test-auc-mean']].plot()
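
# A minimal sketch for choosing the number of rounds: with early_stopping_rounds, xgb.cv returns a
# DataFrame truncated at the best iteration, so its length is a natural choice for num_boost_round
# (the hardcoded 60 in the training call below presumably comes from such an inspection).
best_num_rounds = len(bst_cv)
print('Best number of boosting rounds:', best_num_rounds)
print(bst_cv.tail(1))  # train/test AUC mean and std at the best iteration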

bst = xgb.train(params, d_train, num_boost_round=60)

print(bst.eval(d_valid))
"""## Partial Dependency"""


def partial_dependency(bst, X, y, feature):
    """
    Calculate the dependency (or partial dependency) of a response variable on a predictor (or multiple predictors).
    1. Sample a grid of values of the predictor.
    2. For each value, replace every row of that predictor with this value and calculate the average prediction.
    """

    X_temp = X.copy()

    # Grid between the 0.1th and 99.9th percentiles of the feature (the upper bound and num=50 are assumed values).
    grid = np.linspace(start=np.percentile(X_temp[feature], 0.1),
                       stop=np.percentile(X_temp[feature], 99.9), num=50)
    y_pred = np.zeros(len(grid))

    # Overwrite the whole column with each grid value and average the model's predictions.
    for i, value in enumerate(grid):
        X_temp[feature] = value
        y_pred[i] = np.average(bst.predict(xgb.DMatrix(X_temp)))

    return grid, y_pred
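
# Hypothetical usage of the helper above; the DataFrame, label series, and feature name are
# illustrative placeholders rather than objects defined in this script.
import matplotlib.pyplot as plt

grid, avg_pred = partial_dependency(bst, X_train_df, y_train_series, feature='age')
plt.plot(grid, avg_pred)
plt.xlabel('age')
plt.ylabel('average predicted probability')
plt.show()
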
params['eta'] = best_params

# Let's have a look at the final list of tuned parameters.
print(params)


# Finally, we can now use these tuned parameters in our XGBoost model. Early stopping of 10 means
# that if the model's performance doesn't improve within 10 rounds, training will be stopped.

"""Way 1 - Train the final model (Learning API)"""
# Training/Learning final Model
final_xgb_model = xgb.train(
    params,
    dtrain,
    feval=custom_eval,
    num_boost_round=1000,
    maximize=True,
    evals=[(dvalid, "Validation")],
    early_stopping_rounds=10,
)
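
# A follow-up sketch: with early stopping, the booster records its best round, so validation
# predictions can be restricted to it (iteration_range requires xgboost >= 1.4).
valid_pred = final_xgb_model.predict(
    dvalid, iteration_range=(0, final_xgb_model.best_iteration + 1)
)
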
""" Way2- To train final model"""
X_train, X_valid, y_train, y_valid = train_test_split(train_w2v, train["label"], test_size=0.3, random_state=0)
final = xgb.XGBClassifier(**params, n_estimators=1000)
final.fit(X_train, y_train)
# Getting the F1 score
y_pred = final.predict(X_valid)

f1Score = f1_score(y_valid, y_pred)
print(f1Score*100)
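
# A minimal sketch of adding early stopping to Way 2 as well, assuming an older xgboost release
# where fit() still accepts early_stopping_rounds (newer releases take it in the XGBClassifier
# constructor instead).
final_es = xgb.XGBClassifier(**params, n_estimators=1000)
final_es.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=10,
    verbose=False,
)
print(f1_score(y_valid, final_es.predict(X_valid)) * 100)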

"""After evaluating different models we chose XGBoost as our go-to model, and after hyperparameter tuning
y_train = train_label.iloc[:train_rows].values
y_oof = np.zeros((x_train.shape[0]))
acc_scores = []
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)  # shuffle=True is required for random_state to take effect
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
}
for i, (train_index, valid_index) in enumerate(kfold.split(x_train, y_train)):
    X_A, X_B = x_train[train_index, :], x_train[valid_index, :]
    y_A, y_B = y_train[train_index], y_train[valid_index]
    dtrain = xgb.DMatrix(X_A, label=y_A)
    dvalid = xgb.DMatrix(X_B, label=y_B)
    evallist = [(dvalid, 'eval')]
    trainedModel = xgb.train(params, dtrain=dtrain, evals=evallist, num_boost_round=100, verbose_eval=False)
    y_oof[valid_index] = trainedModel.predict(xgb.DMatrix(X_B))
    acc_scores.append(accuracy_score(y_B, np.where(y_oof[valid_index]>0.5, 1, 0)))

print(f'Accuracy (manual): {np.mean(acc_scores)*100:.4f}% ({np.std(acc_scores)*100:.3f})')
# Accuracy (manual): 77.5640% (0.018) the same as 1-a ~ 1-c.
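
# A follow-up sketch: the out-of-fold predictions can also be scored directly, which should closely
# agree with the per-fold average above and matches the AUC eval_metric used during training.
from sklearn.metrics import roc_auc_score

print(f'OOF accuracy: {accuracy_score(y_train, np.where(y_oof > 0.5, 1, 0)) * 100:.4f}%')
print(f'OOF AUC: {roc_auc_score(y_train, y_oof):.4f}')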
# !SECTION LEARNING API USING BINARY:LOGISTIC 

"""
MODEL 1-A ~ 1-D: ALL FOUR GENERATE IDENTICAL RESULTS!!
"""

# SECTION MODEL 1-E MULTI:SOFTPROB
# constructing binary classes using multi:softprob (similar to softmax)
import xgboost as xgb
x_train = all_df.drop('Survived', axis=1).iloc[:train_rows].values
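
# A minimal sketch of the multi:softprob idea named above, assuming the rest of this section's data
# prep mirrors the binary:logistic section: with num_class=2 the booster returns one probability per
# class, and the positive-class column reproduces the binary probabilities.
softprob_params = {
    'objective': 'multi:softprob',
    'num_class': 2,
    'eval_metric': 'mlogloss',
}
dtrain_all = xgb.DMatrix(x_train, label=y_train)
softprob_model = xgb.train(softprob_params, dtrain_all, num_boost_round=100)
proba = softprob_model.predict(xgb.DMatrix(x_train))  # shape: (n_rows, 2)
positive_class_proba = proba[:, 1]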