def find_best_xgb_estimator(X, y, cv, param_comb):
    """Run a randomized hyper-parameter search for an XGBoost estimator.

    A random search is used instead of an exhaustive one because the
    exhaustive search takes many more cycles without much benefit.
    Ref: https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost

    Args:
        X: Training features, passed unchanged to ``RandomizedSearchCV.fit``.
        y: Training targets, passed unchanged to ``RandomizedSearchCV.fit``.
        cv: Cross-validation specification forwarded to
            ``RandomizedSearchCV``.
        param_comb: Number of parameter combinations to sample (``n_iter``).

    Returns:
        ``random_search.best_estimator_`` of the fitted search.
    """
    print('\n Finding best XGBoost estimator...')
    param_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
    }
    init_est = xgb(learning_rate=0.02, n_estimators=600,
                   objective='multi:softprob', verbose=1, nthread=1)
    # ``iid`` is deliberately not passed: the argument was removed from
    # scikit-learn's search estimators and raises TypeError on current
    # releases.
    random_search = RandomizedSearchCV(estimator=init_est,
                                       param_distributions=param_grid,
                                       n_iter=param_comb,
                                       n_jobs=4,
                                       cv=cv,
                                       verbose=1,
                                       random_state=RANDOM_SEED)
    random_search.fit(X, y)

    print('\n Best estimator:')
    print(random_search.best_estimator_)
    # Report the values this search really used (its own split count and the
    # ``param_comb`` argument) rather than unrelated module-level constants.
    print('\n Best normalized gini score for %d-fold search with %d parameter combinations:'
          % (random_search.n_splits_, param_comb))
    # NOTE(review): ``score * 2 - 1`` is a Gini coefficient only when the
    # search is scored with ROC AUC; no ``scoring`` is passed here, so the
    # estimator's default scorer is used - confirm this is intended.
    print(random_search.best_score_ * 2 - 1)
    print('\n Best hyperparameters:')
    print(random_search.best_params_)
    return random_search.best_estimator_
def xg_boost(f_train, l_train, f_test):
    """Fit a 100-tree XGBClassifier and return its probabilities for f_test.

    Args:
        f_train: Training features.
        l_train: Training labels.
        f_test: Features to score.

    Returns:
        The result of ``predict_proba(f_test)`` on the fitted classifier.
    """
    from xgboost import XGBClassifier

    model = XGBClassifier(n_estimators=100)
    model.fit(f_train, l_train)
    return model.predict_proba(f_test)
def XGBoost(self, args):
    """Train a gradient-boosting model and store its scores on the instance.

    The model is fitted on ``self.X_train`` / ``self.y_train`` and used to
    predict ``self.X_data``. The predictions are stored in ``self.y_pred``
    and in ``self.data['boosting_score']``; the fitted model is stored in
    ``self.model``.

    Args:
        args: Object with a ``predictor`` string attribute, either
            ``'classifier'`` or ``'regressor'`` (case-insensitive).

    Returns:
        ``self``, so calls can be chained.

    Raises:
        ValueError: If ``args.predictor`` is neither ``'classifier'`` nor
            ``'regressor'``.
    """
    logger.info("Running Gradient Boosting ... ")
    predictor = args.predictor.lower()
    if predictor == 'classifier':
        from xgboost import XGBClassifier as xgb
    elif predictor == 'regressor':
        from xgboost import XGBRegressor as xgb
    else:
        # Without this guard the local name ``xgb`` would be unbound below.
        raise ValueError(
            "args.predictor must be 'classifier' or 'regressor', got %r"
            % (args.predictor,))

    # ``n_estimators`` (plural) is the real parameter name; the misspelled
    # ``n_estimator`` never set the number of trees.
    xg_regression_model = xgb(objective='binary:logistic',
                              n_estimators=20000,
                              colsample_bytree=0.6,
                              max_depth=6)
    # Fit the model to the training set
    xg_regression_model.fit(self.X_train, self.y_train)
    # Predict the labels / scores for the full data set
    self.y_pred = xg_regression_model.predict(self.X_data)
    if predictor == 'regressor':
        # NOTE(review): the objective is already 'binary:logistic'; confirm
        # that squashing the regressor output through the logistic CDF again
        # is intended.
        self.y_pred = logistic.cdf(self.y_pred)
    self.data['boosting_score'] = self.y_pred
    self.model = xg_regression_model
    return self
def best_model(xt, xv, yt, yv):
    """Fit three regressors and return the one with the lowest MAE.

    A decision tree, a random forest and an XGBoost regressor are fitted on
    ``(xt, yt)`` and scored with ``get_mae`` on ``(xv, yv)``. The XGBoost
    model additionally uses ``(xv, yv)`` for early stopping.

    Args:
        xt: Training features.
        xv: Validation features.
        yt: Training targets.
        yv: Validation targets.

    Returns:
        The fitted model whose MAE is lowest (the first one on ties).
    """
    leaderboard = []

    def register(label, estimator):
        # Score the fitted estimator on the validation data and record it.
        leaderboard.append({
            'name': label,
            'model': estimator,
            'mae': get_mae(estimator, xv, yv),
        })

    tree = dtr(random_state=1)
    tree.fit(xt, yt)
    register("DecisionTreeRegressor", tree)

    forest = rfr(random_state=1)
    forest.fit(xt, yt)
    register("RandomForestRegressor", forest)

    booster = xgb(random_state=1, n_estimators=10000, learning_rate=0.01)
    booster.fit(xt, yt,
                early_stopping_rounds=10,
                eval_set=[(xv, yv)],
                verbose=False)
    register("XGBRegressor", booster)

    print("\n")
    for entry in leaderboard:
        print("Model {} has MAE {}".format(entry.get('name'), entry.get('mae')))

    lowest_mae = min(entry['mae'] for entry in leaderboard)
    winners = [entry for entry in leaderboard if entry.get('mae') == lowest_mae]
    champion = winners[0]
    print("\nBest model pick: ", champion.get('name'))
    print("\n")
    return champion.get('model')
def xg_boost():
    """Fit a default XGBClassifier on the module-level training data.

    Reads the globals ``features_train``, ``labels_train`` and
    ``features_test``.

    Returns:
        The result of ``predict(features_test)`` on the fitted classifier.
    """
    from xgboost import XGBClassifier

    model = XGBClassifier()
    model.fit(features_train, labels_train)
    return model.predict(features_test)
def trainXgboost(self, x_train, y_train, user_test_data):
    """Train an XGBoost model on 80% of the data and score user_test_data.

    The inputs are split 80/20 with a fixed seed; only the 80% part is used
    for fitting (the held-out part is not evaluated here).

    Args:
        x_train: Training features.
        y_train: Training targets; must expose ``.values`` after the split.
        user_test_data: Features to predict.

    Returns:
        List with each prediction passed through ``round``.
    """
    fit_features, _held_out_x, fit_targets, _held_out_y = train_test_split(
        x_train, y_train, test_size=0.2, random_state=4242)

    booster = xgb(max_depth=4, n_estimators=500, learning_rate=0.05)
    booster.fit(fit_features, fit_targets.values.ravel())

    raw_predictions = booster.predict(user_test_data)
    rounded = list(map(round, raw_predictions))
    print('score for test data : ', rounded)
    return rounded
def test_meta_classifier():
    """Evaluate the meta classifier with leave-one-out cross-validation.

    Reads 'meta_added_class.csv', uses every column from the third to the
    second-to-last as features and the last column as the class. For each
    row, a fresh classifier is fitted on all other rows and used to predict
    that row. The predictions are written to 'y_predicted.csv' and the
    resulting measurements to 'meta_results.csv'.
    """
    print("Start step of classes prediction")
    df_meta = pd.read_csv('meta_added_class.csv')
    table = df_meta.to_numpy()
    X = table[:, 2:-1]
    y = table[:, -1].astype(float)
    n_samples = X.shape[0]
    y_predicted = np.zeros(n_samples, dtype=float)

    for i in range(n_samples):
        classifier = xgb()
        # Boolean mask selecting every row except the held-out row i.
        keep = np.ones(n_samples, dtype=bool)
        keep[i] = False
        classifier.fit(X[keep, :], y[keep])

        # Copy row i into a (1, n_features) float array for prediction.
        held_out = np.zeros((1, X.shape[1]))
        held_out[0] = X[i]
        # predict() returns a 1-element array; take the element explicitly
        # instead of relying on implicit array-to-scalar conversion.
        y_predicted[i] = classifier.predict(held_out)[0]

    np.savetxt('y_predicted.csv', y_predicted)

    # The AUC_pr value returned here is not used; it is recomputed below
    # from the precision/recall curve.
    ACC, TPR, FPR, PPV, AUC_roc, _ = common.test_measurements(y, y_predicted)
    precision, recall, _ = common.precision_recall_curve(y, y_predicted)
    AUC_pr = common.auc(recall, precision)

    meta_results = pd.DataFrame(
        columns=['ACC', 'TPR', 'FPR', 'PPV', 'AUC_roc', 'AUC_pr'])
    meta_results.loc[len(meta_results)] = [ACC, TPR, FPR, PPV, AUC_roc, AUC_pr]
    meta_results.to_csv('meta_results.csv')
    print("Finished step of classes prediction")
def voting_pitchers(to_predict_pitchers, pitcher_predictions, x_pitchers,
                    xgb_pitchers_params, rforest_pitchers_params,
                    logreg_pitchers_params, svm_pitchers_params):
    """Print cross-validated accuracy of a soft-voting ensemble for pitchers.

    For every column in ``to_predict_pitchers`` an ensemble of XGBoost,
    random forest, logistic regression and SVM is scored with 5-fold
    cross-validation. The whole evaluation is repeated 10 times and the mean
    accuracies are printed, overall and per tracked column.

    Args:
        to_predict_pitchers: Iterable of target column names.
        pitcher_predictions: Mapping from column name to an object with
            ``.tolist()`` giving the labels.
        x_pitchers: Feature matrix passed to ``cross_val_score``.
        xgb_pitchers_params, rforest_pitchers_params, logreg_pitchers_params,
        svm_pitchers_params: Sequences in the same order as
            ``to_predict_pitchers``; element ``j`` is a dict mapping the j-th
            column name to that model's keyword arguments.

    Returns:
        None. Results are printed.

    Note:
        The SVM is built with ``probability=True`` because soft voting needs
        class probabilities. This is done on a copy of the parameters; the
        caller's dictionaries are not modified.
    """
    tracked_cols = ['correct_ERA', 'correct_K', 'correct_W', 'correct_WHIP']
    accuracy_overall = []
    accuracy_by_col = dict((col, []) for col in tracked_cols)

    for _ in range(10):
        for j, col in enumerate(to_predict_pitchers):
            y = pitcher_predictions[col].tolist()
            svm_params = dict(svm_pitchers_params[j][col], probability=True)
            clf1 = xgb(**xgb_pitchers_params[j][col])
            clf2 = RandomForestClassifier(**rforest_pitchers_params[j][col])
            clf3 = linear_model.LogisticRegression(
                **logreg_pitchers_params[j][col])
            clf4 = svm.SVC(**svm_params)
            # The estimator labels are kept exactly as they were; they are
            # only names and do not describe the model behind them.
            eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                                ('gnb', clf3), ('svm', clf4)],
                                    voting='soft')
            scores = cross_val_score(eclf, x_pitchers, y, cv=5,
                                     scoring='accuracy', n_jobs=-1)
            acc = scores.mean()
            accuracy_overall.append(acc)
            if col in accuracy_by_col:
                accuracy_by_col[col].append(acc)

    # A single pre-formatted string prints identically on Python 2 and 3.
    print("%-15s %s" % ('overall average', np.mean(accuracy_overall)))
    for col in tracked_cols:
        print("%-15s %s" % (col, np.mean(accuracy_by_col[col])))
def tree_model(train_data, train_labels, test_data, test_labels):
    """Fit a default XGBoost model, print its accuracy and confusion output.

    Args:
        train_data: Training features.
        train_labels: Training labels.
        test_data: Test features.
        test_labels: Test labels, compared element-wise with the predictions.
    """
    model = xgb()
    model.fit(train_data, train_labels)
    predicted = model.predict(test_data)

    n_correct = (predicted == test_labels).sum()
    accuracy = n_correct / len(test_labels)
    print('XGB Accuracy {}'.format(accuracy))
    confusion(predicted, test_labels)
def voting_hitters(to_predict_hitters, hitter_predictions, x_hitters,
                   xgb_hitters_params, rforest_hitters_params,
                   logreg_hitters_params, qda_hitters_params):
    """Print cross-validated accuracy of a soft-voting ensemble for hitters.

    For every column in ``to_predict_hitters`` an ensemble of XGBoost,
    random forest, logistic regression and QDA is scored with 5-fold
    cross-validation. The whole evaluation is repeated 10 times and the mean
    accuracies are printed, overall and per tracked column.

    Args:
        to_predict_hitters: Iterable of target column names.
        hitter_predictions: Mapping from column name to an object with
            ``.tolist()`` giving the labels.
        x_hitters: Feature matrix passed to ``cross_val_score``.
        xgb_hitters_params, rforest_hitters_params, logreg_hitters_params,
        qda_hitters_params: Sequences in the same order as
            ``to_predict_hitters``; element ``j`` is a dict mapping the j-th
            column name to that model's keyword arguments.

    Returns:
        None. Results are printed.
    """
    tracked_cols = ['correct_AVG', 'correct_HR', 'correct_R', 'correct_RBI',
                    'correct_SB']
    accuracy_overall = []
    accuracy_by_col = dict((col, []) for col in tracked_cols)

    for _ in range(10):
        for j, col in enumerate(to_predict_hitters):
            y = hitter_predictions[col].tolist()
            clf1 = xgb(**xgb_hitters_params[j][col])
            clf2 = RandomForestClassifier(**rforest_hitters_params[j][col])
            clf3 = linear_model.LogisticRegression(
                **logreg_hitters_params[j][col])
            clf4 = QuadraticDiscriminantAnalysis(**qda_hitters_params[j][col])
            # The estimator labels are kept exactly as they were; they are
            # only names and do not describe the model behind them.
            eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                                ('gnb', clf3), ('qda', clf4)],
                                    voting='soft')
            scores = cross_val_score(eclf, x_hitters, y, cv=5,
                                     scoring='accuracy', n_jobs=-1)
            acc = scores.mean()
            accuracy_overall.append(acc)
            if col in accuracy_by_col:
                accuracy_by_col[col].append(acc)

    # A single pre-formatted string prints identically on Python 2 and 3.
    print("%-15s %s" % ('overall average', np.mean(accuracy_overall)))
    for col in tracked_cols:
        print("%-15s %s" % (col, np.mean(accuracy_by_col[col])))
def train_model(train_x, train_y, model_type):
    """Build and fit the model selected by ``model_type``.

    Args:
        train_x: Training features.
        train_y: Training labels.
        model_type: Name of the model to train; only ``'XGB'`` is supported.

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``model_type`` is not a supported model name.
    """
    # Compare by value: ``is`` tests object identity, which is not reliable
    # for strings.
    if model_type != 'XGB':
        raise ValueError(
            "Unsupported model_type %r; expected 'XGB'" % (model_type,))

    # NOTE(review): both ``learning_rate`` and ``eta`` are passed; confirm
    # which of the two values is meant to apply.
    model = xgb(max_depth=50,
                n_estimators=80,
                learning_rate=0.1,
                colsample_bytree=.7,
                gamma=0,
                reg_alpha=4,
                objective='binary:logistic',
                eta=0.3,
                silent=1,
                subsample=0.8)
    model.fit(train_x, train_y)
    return model
def xg_hitters_params(to_predict_hitters, x_hitters, hitter_predictions):
    """Grid-search XGBoost hyper-parameters for every hitter target column.

    Args:
        to_predict_hitters: Iterable of target column names.
        x_hitters: Feature matrix.
        hitter_predictions: Mapping from column name to an object with
            ``.tolist()`` giving the labels.

    Returns:
        List with one ``{column: best_params}`` dict per target column, in
        the order of ``to_predict_hitters``.
    """
    search_space = {
        'max_depth': [3, 5, 9],
        'learning_rate': [.1, .4],
        "n_estimators": [250, 350],
        'reg_lambda': [1, 4],
    }
    tuned = []
    for target in to_predict_hitters:
        labels = hitter_predictions[target].tolist()
        # Only the training part of the split is used for the search.
        fit_x, _held_x, fit_y, _held_y = train_test_split(x_hitters, labels)
        search = GridSearchCV(xgb(), search_space)
        search.fit(fit_x, fit_y)
        tuned.append({target: search.best_params_})
    return tuned
def xg_hitters_params(to_predict_hitters, x_hitters, hitter_predictions):
    """Grid-search XGBoost hyper-parameters for every hitter target column.

    Args:
        to_predict_hitters: Iterable of target column names.
        x_hitters: Feature matrix.
        hitter_predictions: Mapping from column name to an object with
            ``.tolist()`` giving the labels.

    Returns:
        List with one ``{column: best_params}`` dict per target column, in
        the order of ``to_predict_hitters``.
    """
    search_space = {
        'max_depth': [3, 5, 9],
        'learning_rate': [.1, .4],
        "n_estimators": [250, 350],
        'reg_lambda': [1, 4],
    }
    tuned = []
    for target in to_predict_hitters:
        labels = hitter_predictions[target].tolist()
        # Only the training part of the split is used for the search.
        fit_x, _held_x, fit_y, _held_y = train_test_split(x_hitters, labels)
        search = GridSearchCV(xgb(), search_space)
        search.fit(fit_x, fit_y)
        tuned.append({target: search.best_params_})
    return tuned
def trainforAllModel(self, x_train, y_train):
    """Train three classifiers and print log loss, accuracy and F1 for each.

    The data is split 60/40 with a fixed seed. A random forest, an XGBoost
    model and a Gaussian naive Bayes model are fitted on the 60% part and
    evaluated on the 40% part, in that order.

    Args:
        x_train: Features.
        y_train: Labels; must expose ``.values`` after the split.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        x_train, y_train, test_size=0.4, random_state=4242)

    # (banner, factory) pairs; each model is built right before it is fitted.
    contenders = (
        ("RandonForest Classifier ",
         lambda: RandomForestClassifier(50, n_jobs=8)),
        ("XGBoost Classifier ", lambda: xgb(n_estimators=500)),
        ("Naiye_Bayes Classifier ", lambda: GaussianNB()),
    )
    for banner, build in contenders:
        estimator = build()
        estimator.fit(X_fit, y_fit.values.ravel())
        probabilities = estimator.predict_proba(X_eval)
        labels = estimator.predict(X_eval)

        loss_value = log_loss(y_eval, probabilities)
        accuracy = accuracy_score(y_eval, labels)
        f1_value = f1_score(y_eval, labels)

        print(banner)
        print('Log loss: %.5f' % loss_value)
        print('Acc: %.5f' % accuracy)
        print('F1: %.5f' % f1_value)
def voting_pitchers(to_predict_pitchers, pitcher_predictions, x_pitchers,
                    xgb_pitchers_params, rforest_pitchers_params,
                    logreg_pitchers_params, svm_pitchers_params):
    """Print cross-validated accuracy of a soft-voting ensemble for pitchers.

    For every column in ``to_predict_pitchers`` an ensemble of XGBoost,
    random forest, logistic regression and SVM is scored with 5-fold
    cross-validation. The whole evaluation is repeated 10 times and the mean
    accuracies are printed, overall and per tracked column.

    Args:
        to_predict_pitchers: Iterable of target column names.
        pitcher_predictions: Mapping from column name to an object with
            ``.tolist()`` giving the labels.
        x_pitchers: Feature matrix passed to ``cross_val_score``.
        xgb_pitchers_params, rforest_pitchers_params, logreg_pitchers_params,
        svm_pitchers_params: Sequences in the same order as
            ``to_predict_pitchers``; element ``j`` is a dict mapping the j-th
            column name to that model's keyword arguments.

    Returns:
        None. Results are printed.

    Note:
        The SVM is built with ``probability=True`` because soft voting needs
        class probabilities. This is done on a copy of the parameters; the
        caller's dictionaries are not modified.
    """
    tracked_cols = ['correct_ERA', 'correct_K', 'correct_W', 'correct_WHIP']
    accuracy_overall = []
    accuracy_by_col = dict((col, []) for col in tracked_cols)

    for _ in range(10):
        for j, col in enumerate(to_predict_pitchers):
            y = pitcher_predictions[col].tolist()
            svm_params = dict(svm_pitchers_params[j][col], probability=True)
            clf1 = xgb(**xgb_pitchers_params[j][col])
            clf2 = RandomForestClassifier(**rforest_pitchers_params[j][col])
            clf3 = linear_model.LogisticRegression(
                **logreg_pitchers_params[j][col])
            clf4 = svm.SVC(**svm_params)
            # The estimator labels are kept exactly as they were; they are
            # only names and do not describe the model behind them.
            eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                                ('gnb', clf3), ('svm', clf4)],
                                    voting='soft')
            scores = cross_val_score(eclf, x_pitchers, y, cv=5,
                                     scoring='accuracy', n_jobs=-1)
            acc = scores.mean()
            accuracy_overall.append(acc)
            if col in accuracy_by_col:
                accuracy_by_col[col].append(acc)

    # A single pre-formatted string prints identically on Python 2 and 3.
    print("%-15s %s" % ('overall average', np.mean(accuracy_overall)))
    for col in tracked_cols:
        print("%-15s %s" % (col, np.mean(accuracy_by_col[col])))
def xg_pitchers_params(to_predict_pitchers, x_pitchers, pitcher_predictions):
    """Grid-search XGBoost hyper-parameters for every pitcher target column.

    Args:
        to_predict_pitchers: Iterable of target column names.
        x_pitchers: Feature matrix.
        pitcher_predictions: Mapping from column name to an object with
            ``.tolist()`` giving the labels.

    Returns:
        List with one ``{column: best_params}`` dict per target column, in
        the order of ``to_predict_pitchers``.
    """
    search_space = {
        'max_depth': [3, 5, 9],
        'learning_rate': [.05, .1],
        "n_estimators": [250, 350],
        'reg_lambda': [1, 3, 6],
    }
    tuned = []
    for target in to_predict_pitchers:
        labels = pitcher_predictions[target].tolist()
        # Only the training part of the split is used for the search.
        fit_x, _held_x, fit_y, _held_y = train_test_split(x_pitchers, labels)
        search = GridSearchCV(xgb(), search_space)
        search.fit(fit_x, fit_y)
        tuned.append({target: search.best_params_})
    return tuned
def xg_pitchers_params(to_predict_pitchers, x_pitchers, pitcher_predictions):
    """Grid-search XGBoost hyper-parameters for every pitcher target column.

    Args:
        to_predict_pitchers: Iterable of target column names.
        x_pitchers: Feature matrix.
        pitcher_predictions: Mapping from column name to an object with
            ``.tolist()`` giving the labels.

    Returns:
        List with one ``{column: best_params}`` dict per target column, in
        the order of ``to_predict_pitchers``.
    """
    search_space = {
        'max_depth': [3, 5, 9],
        'learning_rate': [.05, .1],
        "n_estimators": [250, 350],
        'reg_lambda': [1, 3, 6],
    }
    tuned = []
    for target in to_predict_pitchers:
        labels = pitcher_predictions[target].tolist()
        # Only the training part of the split is used for the search.
        fit_x, _held_x, fit_y, _held_y = train_test_split(x_pitchers, labels)
        search = GridSearchCV(xgb(), search_space)
        search.fit(fit_x, fit_y)
        tuned.append({target: search.best_params_})
    return tuned
def gradient_boosting(df):
    '''Train an XGBoost price model on a fixed feature subset.

    Applies standard scaling, trains the model, pickles the fitted scaler to
    the file 'scaler' and the fitted model to the file 'model' (both in the
    current working directory), prints the predictions for the held-out 20%
    split and returns them.

    Each output file is opened only when its object is ready to be written
    and is closed even if pickling fails, so a failure earlier in the
    function no longer leaves empty or truncated files behind.

    Args:
        df: DataFrame containing the columns 'price', 'accommodates',
            'bedrooms', 'bathrooms', 'cleaning_fee', 'distance' and 'size'.

    Returns:
        The model's predictions for the test split.
    '''
    FILENAME = 'model'
    SCALE = 'scaler'

    # Create df for model training
    y = df[['price']]
    x = df[['accommodates', 'bedrooms', 'bathrooms', 'cleaning_fee',
            'distance', 'size']]

    # Typically at this stage we would conduct some form of feature
    # exploration, selection and feature engineering. For the sake of time
    # minor feature analysis and selection has already been performed, and
    # hyper-parameter tuning with grid-search cross-validation has already
    # been run; its results are hard coded below. The project could be
    # extended with further feature engineering (e.g. NLP techniques), but
    # here only basic integer features are used to predict the price.

    # Create training/test set for training
    sc = StandardScaler()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    x_train = sc.fit_transform(x_train)
    with open(SCALE, 'wb') as scaler_file:
        pickle.dump(sc, scaler_file)
    x_test = sc.transform(x_test)

    # Xgboost parameters hard coded after grid-search cross validation
    booster = xgb(n_estimators=200, random_state=4, gamma=0.2, max_depth=6,
                  learning_rate=0.1, colsample_bytree=0.7)

    # Fit the model and serialize it to disk
    booster.fit(x_train, y_train)
    with open(FILENAME, 'wb') as model_file:
        pickle.dump(booster, model_file)

    # Validate model is predicting
    y_preds = booster.predict(x_test)
    for i in y_preds:
        print("$", round(i, 2), "/ night")
    return y_preds
def xgb_cv(max_depth, gamma, colsample_bytree, data, targets):
    """Return the mean 5-fold negative log loss for one XGBoost configuration.

    Args:
        max_depth: Forwarded to the estimator as ``max_depth``.
        gamma: Forwarded to the estimator as ``gamma``.
        colsample_bytree: Forwarded to the estimator as ``colsample_bytree``.
        data: Features passed to ``cross_val_score``.
        targets: Labels passed to ``cross_val_score``.

    Returns:
        Mean of the per-fold 'neg_log_loss' scores.
    """
    tuned = dict(max_depth=max_depth,
                 gamma=gamma,
                 colsample_bytree=colsample_bytree)
    model = xgb(n_estimators=250, learning_rate=0.08, n_jobs=4, **tuned)
    fold_scores = cross_val_score(model, data, targets,
                                  scoring='neg_log_loss', cv=5)
    return fold_scores.mean()
def meta_classifier():
    """Compute feature importances and contribution values of the meta model.

    Reads 'meta_added_class.csv', fits one classifier per importance type
    ('gain', 'weight', 'cover') and stores each ``feature_importances_``
    vector as a column of 'importance results.csv'. In every iteration the
    booster's ``pred_contribs`` output is written to 'shap.csv'
    (overwriting the previous one).
    """
    print("Start step of features importance")
    df_meta = pd.read_csv('meta_added_class.csv')
    table = df_meta.to_numpy()
    features = table[:, 2:-1]   # all feature columns
    classes = table[:, -1]      # class (= algorithm) column

    feature_importance_measures = ['gain', 'weight', 'cover']
    importance_results = pd.DataFrame(columns=feature_importance_measures)

    for measure in feature_importance_measures:
        model = xgb(importance_type=measure)
        model.fit(features, classes)
        importance_results[measure] = model.feature_importances_

        contributions = model.get_booster().predict(DMatrix(features),
                                                    pred_contribs=True)
        np.savetxt('shap.csv', contributions)

    importance_results.to_csv('importance results.csv')
    print("Finished step of features importance")
def voting_hitters(to_predict_hitters, hitter_predictions, x_hitters,
                   xgb_hitters_params, rforest_hitters_params,
                   logreg_hitters_params, qda_hitters_params):
    """Print cross-validated accuracy of a soft-voting ensemble for hitters.

    For every column in ``to_predict_hitters`` an ensemble of XGBoost,
    random forest, logistic regression and QDA is scored with 5-fold
    cross-validation. The whole evaluation is repeated 10 times and the mean
    accuracies are printed, overall and per tracked column.

    Args:
        to_predict_hitters: Iterable of target column names.
        hitter_predictions: Mapping from column name to an object with
            ``.tolist()`` giving the labels.
        x_hitters: Feature matrix passed to ``cross_val_score``.
        xgb_hitters_params, rforest_hitters_params, logreg_hitters_params,
        qda_hitters_params: Sequences in the same order as
            ``to_predict_hitters``; element ``j`` is a dict mapping the j-th
            column name to that model's keyword arguments.

    Returns:
        None. Results are printed.
    """
    tracked_cols = ['correct_AVG', 'correct_HR', 'correct_R', 'correct_RBI',
                    'correct_SB']
    accuracy_overall = []
    accuracy_by_col = dict((col, []) for col in tracked_cols)

    for _ in range(10):
        for j, col in enumerate(to_predict_hitters):
            y = hitter_predictions[col].tolist()
            clf1 = xgb(**xgb_hitters_params[j][col])
            clf2 = RandomForestClassifier(**rforest_hitters_params[j][col])
            clf3 = linear_model.LogisticRegression(
                **logreg_hitters_params[j][col])
            clf4 = QuadraticDiscriminantAnalysis(**qda_hitters_params[j][col])
            # The estimator labels are kept exactly as they were; they are
            # only names and do not describe the model behind them.
            eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                                ('gnb', clf3), ('qda', clf4)],
                                    voting='soft')
            scores = cross_val_score(eclf, x_hitters, y, cv=5,
                                     scoring='accuracy', n_jobs=-1)
            acc = scores.mean()
            accuracy_overall.append(acc)
            if col in accuracy_by_col:
                accuracy_by_col[col].append(acc)

    # A single pre-formatted string prints identically on Python 2 and 3.
    print("%-15s %s" % ('overall average', np.mean(accuracy_overall)))
    for col in tracked_cols:
        print("%-15s %s" % (col, np.mean(accuracy_by_col[col])))
def Get_xgboostScore(X, y):
    """Print the average ``get_scores`` result over 5 contiguous folds.

    The rows are cut into 5 consecutive blocks of ``int(n_rows / 5)`` rows.
    Each block is used once as the test set while all remaining rows
    (including any rows after the last block) form the training set.

    Args:
        X: Features; sliced by row and recombined with ``pd.concat``.
        y: Targets; sliced and recombined the same way.

    Returns:
        None. The average is printed.
    """
    n_trees = 300
    n_folds = 5
    fold_size = int(X.shape[0] / n_folds)

    fold_losses = []
    for fold in range(n_folds):
        start = fold_size * fold
        stop = fold_size * (fold + 1)

        X_test, y_test = X[start:stop], y[start:stop]
        X_train = pd.concat([X[:start], X[stop:]], axis=0)
        y_train = pd.concat([y[:start], y[stop:]], axis=0)

        booster = xgb(max_depth=5, n_estimators=n_trees)
        booster.fit(X_train, y_train)
        fold_losses.append(
            get_scores(booster, X_train, y_train, X_test, y_test))

    print('Average Error:{:.6f}'.format(np.mean(fold_losses)))
    return
def _feature_selection(self, X, y, Xv, yv):
    '''_FEATURE_SELECTION

        Select features with an XGBoost classifier.

        A classifier built from ``self.xgb_paras`` is trained on (X, y)
        while the "error" metric is evaluated on both the training and the
        validation set. Features whose importance is greater than
        ``self.threshold`` are kept.

        Inputs:
        -------

        - X: numpy ndarray, features of training set.
        - y: numpy ndarray, labels of training set.
        - Xv: numpy ndarray, features of validation set.
        - yv: numpy ndarray, labels of validation set.

        Outputs:
        --------

        - clf: instance of XGBClassifier, trained model.
        - fs_idx: array, indices of selected features.
        - importance: array, importance of selected features.

    '''
    clf = xgb(**self.xgb_paras)
    watch_list = [(X, y), (Xv, yv)]
    clf.fit(X, y, eval_set=watch_list, eval_metric="error", verbose=False)

    # Keep only the features whose importance exceeds the threshold
    all_scores = clf.feature_importances_
    selected = all_scores > self.threshold
    fs_idx = np.nonzero(selected)[0]
    importance = all_scores[fs_idx]

    print("Number of important features: ", len(fs_idx))
    return clf, fs_idx, importance
def stat_pct(stat):
    """Return the most confident 2017 prediction per player for one statistic.

    The model configured for ``stat`` in the module-level ``models`` mapping
    is trained on the historical hitter or pitcher data and used to compute,
    for every 2017 row, the probability in column 1 of ``predict_proba``.
    For each player the row with the highest probability is kept.

    Inputs:
        stat (str): The statistic of interest - AVG, HR, R, RBI, SB for
            hitters, ERA, K, W, WHIP for pitchers

    Returns:
        'FAILED' if ``stat`` is not in ``all_stats``; otherwise a tuple
        ``(vals, unique_names)`` where

        - vals ((# of players, 3) ndarray): column 0 is the value of
          ``stat`` in the selected row, column 1 is the probability of that
          row, column 2 is not used and is left at zero.
        - unique_names (ndarray): sorted unique player names; row ``j`` of
          ``vals`` belongs to ``unique_names[j]``.

    Raises:
        ValueError: If ``models[stat]`` is neither 'XGBoost' nor
            'Random Forest'.
    """
    # check that values given are valid
    if stat not in all_stats:
        print("Not an acceptable stat")
        return 'FAILED'

    # Hitters and pitchers follow the same procedure on different data sets.
    if stat in hit_stats:
        predictions_2017 = hitter_predictions_2017
        x_train, x_2017 = x_hitters, x_hitters2017
    else:
        predictions_2017 = pitcher_predictions_2017
        x_train, x_2017 = x_pitchers, x_pitchers2017

    # names of all players in the order in which they appear in x_2017
    name_list = predictions_2017['Name'].tolist()

    # run the configured model with the best parameters for the stat
    if models[stat] == 'XGBoost':
        estimator = xgb(**best_params_all['correct_' + stat])
    elif models[stat] == 'Random Forest':
        estimator = RandomForestClassifier(**best_params_all['correct_' + stat])
    else:
        raise ValueError(
            "No supported model configured for stat %r: %r"
            % (stat, models[stat]))
    model = estimator.fit(x_train, y_vals[stat])
    preds = model.predict_proba(x_2017)[:, 1]

    # Group the row positions of every player in a single pass.
    rows_by_name = collections.defaultdict(list)
    for row, name in enumerate(name_list):
        rows_by_name[name].append(row)

    unique_names = np.unique(name_list)
    vals = np.zeros((len(unique_names), 3))
    for j, name in enumerate(unique_names):
        # row with the highest probability for this player (first on ties)
        best_row = max(rows_by_name[name], key=lambda row: preds[row])
        vals[j, 0] = predictions_2017[stat][best_row]
        vals[j, 1] = preds[best_row]
    return vals, unique_names
# ---------------------------------------------------------------------------
# 4) XGBoost
# ---------------------------------------------------------------------------

# --- Bag of Words features ---

# The first 31962 rows go to the train part, the remaining rows to the test
# part.
train_bow, test_bow = bow[:31962, :], bow[31962:, :]

# Split the train part into training and validation data.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_bow,
    train["label"],
    test_size=0.3,
    random_state=0,
)

# Instantiate and fit the XGBoost classifier.
from xgboost import XGBClassifier as xgb

classifier = xgb(n_estimators=2000, max_depth=6)
classifier.fit(X_train, y_train)

# F1 score on the validation data, printed as a percentage.
y_pred = classifier.predict(X_valid)
f1Score = f1_score(y_valid, y_pred)
print(f1Score * 100)

# --- TF-IDF features ---

# Same row split as for the Bag of Words features.
train_idf, test_idf = tfidf[:31962, :], tfidf[31962:, :]

# Split the train part into training and validation data.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_idf,
    train["label"],
    test_size=0.3,
    random_state=0,
)
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split as tts

try:
    # Current scikit-learn: the imputer lives in sklearn.impute.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Older scikit-learn releases only provide preprocessing.Imputer.
    from sklearn.preprocessing import Imputer

# Data
path_tr = 'C:/Users/satyam/Desktop/kaggle/House_Prices/train.csv'
train = pd.read_csv(path_tr)
path_te = 'C:/Users/satyam/Desktop/kaggle/House_Prices/test.csv'
test = pd.read_csv(path_te)

my_imputer = Imputer()
target = train.SalePrice

# Impute train and test together so both get the same fill values.
data = pd.concat([train.drop(['SalePrice'], axis=1), test])
numeric = data.select_dtypes(exclude=['object'])
filled_data = my_imputer.fit_transform(numeric)  # fill up missing values

# Split back by the real number of training rows instead of a hard-coded
# count.
n_train = len(train)
train_f = filled_data[:n_train]
test_f = filled_data[n_train:]

# XGBoost model with explicit number of estimators and learning rate
model = xgb(n_estimators=1000, learning_rate=0.05)
# NOTE(review): early stopping is monitored on the training data itself
# (eval_set is the training set); confirm a separate validation split is not
# intended.
model.fit(train_f,
          target,
          early_stopping_rounds=5,
          eval_set=[(train_f, target)],
          verbose=False)

predictions = model.predict(test_f)  # Making predictions
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})
my_submission.to_csv('XGBoost+Imputer+est+LR+ESR.csv', index=False)
# Every split below uses the same test fraction and the same seed.
_split_options = {"test_size": size, "random_state": seed}

# Hold out a test part of the system-level data.
Sys_X_model, Sys_X_test, Sys_Y_model, Sys_Y_test = train_test_split(
    Sys_X, Sys_Y, **_split_options)

# Split each station's (and the system's) modelling data into train/valid.
S1_X_train, S1_X_valid, S1_Y_train, S1_Y_valid = train_test_split(
    S1_X_model, S1_Y_model, **_split_options)
S2_X_train, S2_X_valid, S2_Y_train, S2_Y_valid = train_test_split(
    S2_X_model, S2_Y_model, **_split_options)
S3_X_train, S3_X_valid, S3_Y_train, S3_Y_valid = train_test_split(
    S3_X_model, S3_Y_model, **_split_options)
S4_X_train, S4_X_valid, S4_Y_train, S4_Y_valid = train_test_split(
    S4_X_model, S4_Y_model, **_split_options)
Sys_X_train, Sys_X_valid, Sys_Y_train, Sys_Y_valid = train_test_split(
    Sys_X_model, Sys_Y_model, **_split_options)

# Use XGBoost to show feature importance per station
model1 = xgb()
model1.fit(S1_X_train, S1_Y_train)
model2 = xgb()
model2.fit(S2_X_train, S2_Y_train)
model3 = xgb()
model3.fit(S3_X_train, S3_Y_train)
model4 = xgb()
model4.fit(S4_X_train, S4_Y_train)

# Show the XGBoost-derived feature importances in graph form.
for _station_model in (model1, model2, model3, model4):
    plot_importance(_station_model)
pyplot.show()

# Sorted importances serve as candidate thresholds for SelectFromModel
thresholdS1 = sort(model1.feature_importances_)
thresholdS2 = sort(model2.feature_importances_)
thresholdS3 = sort(model3.feature_importances_)
list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values)) data_train[col] = le.transform(list(data_train[col].astype(str).values)) data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values)) print('Label Encoding 完成') features = [ f for f in data_train.columns if f not in ['id', 'issueDate', 'isDefault'] and '_outliers' not in f ] x_train = data_train[features] x_valid = data_test_a[features] y_train = data_train['isDefault'] trn_x, val_x, trn_y, val_y = train_test_split(x_train, y_train) clf = xgb() clf.fit(trn_x, trn_y) pre = clf.predict(val_x) print(roc_auc_score(val_y, pre)) lgb = LGBMClassifier() lgb.fit(trn_x, trn_y) pre = lgb.predict(val_x) print(roc_auc_score(val_y, pre)) cat = CatBoostRegressor() cat.fit(trn_x, trn_y) pre = cat.predict(val_x) print(roc_auc_score(val_y, pre))
}, { 'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] }] grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1) grid_search = grid_search.fit(iv_train, dv_train) best_accuracy = grid_search.best_score_ best_parameters = grid_search.best_params_ # In[ ]: #finally trying with advanced XGBOOST algorithm # Fitting XGBoost to the Training set from xgboost import xgb classifier = xgb() classifier.fit(iv_train, dv_train) # Predicting the Test set results y_pred = classifier.predict(iv_test) # In[ ]: # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(dv_test, dv_predict)
"geoNetwork_country", "flight_day", "TRIPTYPEDESC", "SALESCHANNEL", ] ) y = df["INS_FLAG"] ## train test split size, random seed X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=22 ) ## Model 1 (baseline) - XGBoost Classifier xgr = xgb(n_estimators=3000, max_depth=7) # fit the model to train data set xgr.fit(X_train, y_train, eval_metric="auc", verbose=200) predictions = xgr.predict(X_test) # print(predictions) ## overall model accucary from sklearn import metrics predictions = xgr.predict(X_test) print( "Accuracy:", metrics.accuracy_score(y_test, predictions) ) # 83% accuracy on imbalanced dataset print("Precision:", metrics.precision_score(y_test, predictions)) print("Recall:", metrics.recall_score(y_test, predictions))
def process(object):
    """Oversample the training split, fit an XGBoost model and report metrics.

    For every entry in ``sample_methods`` the training data is resampled with
    ``oversample``, a gradient-boosted model is fitted on the resampled data
    and scored on the test split with ``evaluate``.

    Args:
        object: Mapping with the keys ``'train_X'``, ``'train_y'``,
            ``'test_X'`` and ``'test_y'``. (The parameter name shadows the
            builtin; it is kept so keyword callers keep working.)

    Returns:
        dict: Sampling-method name -> elapsed resampling time in seconds,
        formatted as a string with three decimals.
    """
    train_X, train_y = object['train_X'], object['train_y']
    test_X, test_y = object['test_X'], object['test_y']

    # Other method names that appeared in earlier revisions of this list:
    # 'random', 'SMOTE', 'SMOTEBorderline-1', 'SMOTEBorderline-2',
    # 'SVMSMOTE', 'ADASYN', 'mwmote', 'No Sample'.
    sample_methods = ['Sparse SMOTE']

    metrics_dict = {}
    time_info = {}
    for sample_method in sample_methods:
        # Time the oversampling step (including the statistics report).
        before_time = datetime.now()
        X_resampled, y_resampled = oversample(train_X, train_y,
                                              method=sample_method)
        statistics_sample_num(train_X, train_y, X_resampled, y_resampled,
                              sample_method)
        over_time = datetime.now()
        # total_seconds() covers the whole interval; the ``microseconds``
        # attribute alone drops every full second of a longer run.
        process_time = (over_time - before_time).total_seconds()
        time_info[sample_method] = "%.3f" % process_time

        # Create and train the model on the resampled data.
        gbm = xgb(max_depth=3, n_estimators=300, learning_rate=0.01)
        gbm.fit(X_resampled, y_resampled, eval_metric='auc')

        # Evaluate on the untouched test set.
        precision, recall, f1, gmean, auc_roc, auc_pr, fpr, tpr = evaluate(
            test_X, test_y, gbm)
        roc_auc = auc(fpr, tpr)
        if SHOW_AUC_ROC_PLOT:
            plt.plot(fpr, tpr, lw=1, alpha=0.3,
                     label='%s (AUC = %0.2f)' % (sample_method, roc_auc))
        metrics_dict[sample_method] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "gmean": gmean,
            "auc_roc": auc_roc,
            "auc_pr": auc_pr
        }

    # One row per sampling method, one column per metric.
    df = pd.DataFrame(metrics_dict).T

    if SHOW_METRICS:
        # Emit one LaTeX table row per method; the best value of every
        # column is wrapped in \textbf{}.
        columns = ("auc_roc", "auc_pr", "precision", "recall", "f1", "gmean")
        best = dict((column, df[column].max()) for column in columns)
        for index, row in df.iterrows():
            cells = ["&" + index + "&"]
            for column in columns:
                text = "%.3f" % row[column]
                if row[column] >= best[column]:
                    text = r"\textbf{" + text + "}"
                # The last column closes the table row instead of adding "&".
                cells.append(text + (r"\\" if column == "gmean" else "&"))
            # A single print keeps the space-separated layout of the old
            # trailing-comma print statements and runs on Python 2 and 3.
            print(" ".join(cells))

    return time_info
min_samples_leaf=2, n_estimators=100, subsample=0.8) gbdt.fit(iris.data, iris.target) gbdt.predict(iris.data) gbdt.predict_proba(iris.data) # ## xgboost test def scorebyself(self, X, y): from sklearn.metrics import roc_auc_score probas = self.predict_proba(X) auc = roc_auc_score(y, probas) return auc from xgboost import XGBClassifier as xgb params = { 'n_estimators': [1000, 500, 100], 'subsample': [0.5, 0.8], 'learning_rate': [0.01, 0.05] } gsmodel = xgb() xgbmodel0 = GridSearchCV(gsmodel, params, cv=5, n_jobs=5) xgbmodel0.fit(iris.data, iris.target) xgbest = xgb(learning_rate=0.01, n_estimators=1000, subsample=0.5, max_depth=3) xgbest.fit(iris.data, iris.target) xgbest.predict(iris.data) xgbest.predict_proba(iris.data)
def _predict_positive_proba(stat, x_train, x_predict):
    """Fit the model configured for ``stat`` and score ``x_predict``.

    The model family comes from ``models[stat]`` and its keyword arguments
    from ``best_params_all['correct_' + stat]``. The estimator is fitted on
    ``(x_train, y_vals[stat])``.

    Returns:
        The second column of ``predict_proba(x_predict)``.

    Raises:
        ValueError: if ``models[stat]`` is neither ``'XGBoost'`` nor
            ``'Random Forest'``.
    """
    model_name = models[stat]
    if model_name == 'XGBoost':
        estimator_cls = xgb
    elif model_name == 'Random Forest':
        estimator_cls = RandomForestClassifier
    else:
        raise ValueError(
            "No model configured for stat %r (got %r)" % (stat, model_name))
    estimator = estimator_cls(**best_params_all['correct_' + stat])
    model = estimator.fit(x_train, y_vals[stat])
    return model.predict_proba(x_predict)[:, 1]


def _best_row_per_player(name_list, preds, stat_values):
    """Pick, for every distinct name, the row with the highest ``preds``.

    Args:
        name_list: Player name of every prediction row, in row order.
        preds: Score of every prediction row, same order as ``name_list``.
        stat_values: Stat value of every row, looked up by row number.
            NOTE(review): this is a label lookup when ``stat_values`` is a
            pandas Series, so it assumes a default 0..n-1 index -- confirm.

    Returns:
        (vals, unique_names) as described in ``stat_pct``.
    """
    unique_names = np.unique(name_list)

    # Collect the row numbers of every player in one pass.
    rows_by_name = collections.defaultdict(list)
    for row, name in enumerate(name_list):
        rows_by_name[name].append(row)

    vals = np.empty((len(unique_names), 3))
    # The third column was never written; make it a defined value instead
    # of uninitialised memory.
    vals[:, 2] = np.nan
    for j, name in enumerate(unique_names):
        # On ties max() keeps the first, i.e. lowest, row number.
        best_row = max(rows_by_name[name], key=preds.__getitem__)
        vals[j, 0] = stat_values[best_row]
        vals[j, 1] = preds[best_row]
    return vals, unique_names


def stat_pct(stat):
    """
    Return each player's most confident 2017 prediction for ``stat``.

    Inputs:
        stat (str): The statistic of interest - AVG, HR, R, RBI, SB for
            hitters, ERA, K, W, WHIP for pitchers

    Returns:
        The string 'FAILED' when ``stat`` is not in ``all_stats``; otherwise
        a tuple ``(vals, unique_names)`` where

        vals ((# of players, 3) ndarray): column 0 is the predicted value of
            the statistic taken from the player's highest-scoring row,
            column 1 is that row's score, column 2 is unused (NaN).
        unique_names (ndarray): sorted distinct player names; row ``j`` of
            ``vals`` belongs to ``unique_names[j]``.
    """
    # check that values given are valid
    if stat not in all_stats:
        print("Not an acceptable stat")
        return 'FAILED'

    # Hitters and pitchers share the same procedure and differ only in the
    # training data and the 2017 prediction table they use.
    if stat in hit_stats:
        predictions = hitter_predictions_2017
        preds = _predict_positive_proba(stat, x_hitters, x_hitters2017)
    else:
        predictions = pitcher_predictions_2017
        preds = _predict_positive_proba(stat, x_pitchers, x_pitchers2017)

    # names of all players in the order they appear in the 2017 features
    name_list = predictions['Name'].tolist()
    return _best_row_per_player(name_list, preds, predictions[stat])
def __init__(self, clfile='xgb_classifier_1.pickle', *args, **kwargs):
    """
    Set up the classifier, loading a saved model when one is available.

    Parameters:
        clfile (str): File name of a saved classifier, joined onto
            ``self.data_dir``. With ``None`` no saved classifier is looked up.
        *args: Passed through to the parent class initializer.
        **kwargs: Passed through to the parent class initializer.

    When no saved classifier file exists, a new untrained XGBoost model is
    created with fixed hyper-parameters and ``self.trained`` is False.

    .. codeauthor:: Refilwe Kgoadi <*****@*****.**>
    """
    # Parent class first:
    super().__init__(*args, **kwargs)

    # Defaults for the attributes owned by this classifier:
    self.classifier = self.classifier_file = self.featdir = None

    # Location of the saved classifier, if a file name was given:
    if clfile is not None:
        self.classifier_file = os.path.join(self.data_dir, clfile)

    # Directory for cached features, created on demand:
    if self.features_cache is not None:
        feature_dir = os.path.join(self.features_cache, 'xgb_features')
        self.featdir = feature_dir
        os.makedirs(feature_dir, exist_ok=True)

    saved_path = self.classifier_file
    if saved_path is None or not os.path.exists(saved_path):
        # Nothing to load: build a new, untrained classifier.
        settings = {
            'booster': 'gbtree',
            'colsample_bytree': 0.7,
            'eval_metric': 'mlogloss',
            'gamma': 7.5,
            'learning_rate': 0.1,
            'max_depth': 6,
            'min_child_weight': 1,
            'n_estimators': 500,
            'objective': 'multi:softmax',
            # XGBoost uses misleading names
            'random_state': self.random_seed,
            'reg_alpha': 1e-5,
            'subsample': 0.8,
            'use_label_encoder': False,
        }
        self.classifier = xgb(**settings)
        self.trained = False
    else:
        # A classifier loaded from disk is assumed to be trained already.
        self.load(saved_path)
        self.trained = True

    # Names of the features the classifier works with:
    self.features_names = [
        'skewness',
        'kurtosis',
        'shapiro_wilk',
        'eta',
        'PeriodLS',
        'Freq_amp_0',
        'Freq_ampratio_21',
        'Freq_ampratio_31',
        'Freq_phasediff_21',
        'Freq_phasediff_31',
        'Rcs',
        'psi_Rcs',
    ]
def XGBoost(self, args):
    """Fit an XGBoost model and store its scores for ``self.X_data``.

    The estimator class is chosen by ``args.predictor`` ('classifier' or
    'regressor', case-insensitive) and its hyper-parameters by
    ``args.snps`` / ``args.indels``. The model is fitted on
    ``self.X_train`` / ``self.y_train`` and used to predict ``self.X_data``.

    Side effects:
        Sets ``self.y_pred``, ``self.data['boosting_score']`` and
        ``self.model``.

    Returns:
        self

    Raises:
        ValueError: if ``args.predictor`` is not a supported value, or
            neither ``args.snps`` nor ``args.indels`` is set.
    """
    ## Gradient Boosting
    logger.info("Running Gradient Boosting ... ")

    def class_ratio():
        # Number of label-0 rows per label-1 row in self.y_data. The float
        # conversion happens before the division so the ratio is not
        # truncated under Python 2 integer division.
        n_negative = len(self.y_data[self.y_data == 0])
        n_positive = len(self.y_data[self.y_data == 1])
        return float(n_negative) / n_positive

    predictor = args.predictor.lower()
    if predictor == 'classifier':
        from xgboost import XGBClassifier as xgb
        if args.snps:
            xg_model = xgb(objective='binary:logistic',
                           max_depth=6,
                           colsample_bytree=0.6,
                           scale_pos_weight=class_ratio())
        elif args.indels:
            xg_model = xgb(objective='binary:logistic',
                           n_estimators=200,
                           eta=0.001,
                           n_jobs=-1)
        else:
            raise ValueError("Either args.snps or args.indels must be set")
    elif predictor == 'regressor':
        from xgboost import XGBRegressor as xgb
        if args.snps:
            # The keyword used to be misspelled ``n_estimator``, which the
            # scikit-learn wrapper does not recognise as the tree count, so
            # the requested number of trees was never applied.
            xg_model = xgb(n_estimators=40000,
                           max_depth=6,
                           colsample_bytree=0.6,
                           scale_pos_weight=class_ratio(),
                           reg_lambda=0.001)
        elif args.indels:
            if class_ratio() > 2:
                ## stomach (original author's note)
                xg_model = xgb(objective='binary:logistic',
                               colsample_bytree=0.6,
                               max_depth=8,
                               min_child_weight=5,
                               importance_type='gain',
                               reg_lambda=10,
                               subsample=0.05,
                               min_split_loss=100)
            else:
                ## Improved: Works on GIAB, Bone, Breast, K562
                ## (original author's note)
                xg_model = xgb(objective='binary:logistic',
                               colsample_bytree=0.6,
                               min_child_weight=5,
                               importance_type='gain',
                               max_depth=8,
                               reg_lambda=10)
        else:
            raise ValueError("Either args.snps or args.indels must be set")
    else:
        raise ValueError(
            "Unsupported predictor %r; expected 'classifier' or 'regressor'"
            % (args.predictor,))

    ## Fit the model to the training set
    xg_model.fit(self.X_train, self.y_train)

    ## Predict the labels
    self.y_pred = xg_model.predict(self.X_data)
    if predictor == 'regressor':
        # NOTE(review): the indel regressors already use the
        # 'binary:logistic' objective; confirm that a second logistic
        # squashing of their output is intended.
        self.y_pred = logistic.cdf(self.y_pred)

    self.data['boosting_score'] = self.y_pred
    self.model = xg_model
    return self
# TODO: try the golden attribute!

# Order both frames by insect count so the forward fill below takes the
# value from the preceding row in that order.
train.sort_values(by='Estimated_Insects_Count', inplace=True)
test.sort_values(by='Estimated_Insects_Count', inplace=True)

# Assign the filled column back instead of calling fillna(inplace=True) on
# the attribute-accessed column, which may operate on a temporary copy.
train['Number_Weeks_Used'] = train['Number_Weeks_Used'].ffill()
test['Number_Weeks_Used'] = test['Number_Weeks_Used'].ffill()

# Scale every column except the target; the scaler is fitted on the
# training frame only and then reused for the test frame.
mms = MinMaxScaler()
# .loc replaces the removed DataFrame.ix indexer.
X = mms.fit_transform(train.loc[:, train.columns != 'Crop_Damage'].values)
y = train.Crop_Damage.values
X_test = mms.transform(test.values)

X = X.astype('float32')
X_test = X_test.astype('float32')

# Hold out part of the training data for the classification report.
train_x, test_x, train_y, test_y = train_test_split(X, y)

print('Training Classifiers')
clf1 = xgb(nthread=3, learning_rate=0.3, n_estimators=1000)
cccv1 = cccv(clf1, method='isotonic', cv=5)
cccv1.fit(train_x, train_y)
pred1 = cccv1.predict(test_x)
print(classification_report(test_y, pred1))

# Predict the real test set and write the submission file.
pred = cccv1.predict(X_test)
pd.DataFrame({'Crop_Damage': pred}, index=test.index).to_csv('final_sub.csv')