# Imports assumed by the test snippets below (inferred from the calls they make).
import copy

import numpy
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

from hep_ml import losses
from hep_ml.commonutils import generate_sample
from hep_ml.gradientboosting import UGradientBoostingClassifier, UGradientBoostingRegressor
from hep_ml.losses import AdaLossFunction, LogLossFunction


def test_gradient_boosting(n_samples=1000):
    """Test that gradient boosting works with different loss functions."""
    # Generate samples correlated with the first variable.
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to obtain a uniform distribution along this variable.
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None)
        clf.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    # Regression losses: add a fake request column for the ranking loss.
    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [losses.MSELossFunction(),
                 losses.MAELossFunction(),
                 losses.RankBoostLossFunction(request_column='fake_request')]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50, learning_rate=0.01,
                                         subsample=0.5, train_features=list(trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """Test the two main classification losses; also test copying."""
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss in [LogLossFunction(), AdaLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # Checking that predict_proba works: the last staged prediction must match predict_proba.
        for p in clf.staged_predict_proba(testX):
            assert p.shape == (n_samples, 2)
        assert numpy.all(p == clf.predict_proba(testX))
        assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low'
        # Checking clonability.
        _ = clone(clf)
        clf_copy = copy.deepcopy(clf)
        assert numpy.all(clf.predict_proba(trainX) == clf_copy.predict_proba(trainX)), \
            'copied classifier is different'
# Imports assumed by this snippet.
import numpy as np
from sklearn.model_selection import StratifiedKFold

from hep_ml.gradientboosting import UGradientBoostingClassifier
from hep_ml.losses import BinFlatnessLossFunction


def flatnessloss(X, y, test):
    """Out-of-fold blending of a flatness-boosted classifier; returns averaged test predictions."""
    features = list(X.columns)
    features.remove('mass')
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
    clf = UGradientBoostingClassifier(loss=loss, n_estimators=300, subsample=0.7,
                                      max_depth=9, min_samples_leaf=8, learning_rate=0.1,
                                      train_features=features, random_state=11)

    # Shuffle the training data.
    arr = np.random.permutation(X.shape[0])
    X = X.iloc[arr]
    y = y[arr]

    n_folds = 7
    skf = StratifiedKFold(n_splits=n_folds)
    blend_train = np.zeros(X.shape[0])                  # out-of-fold predictions on the training set
    blend_test_j = np.zeros((test.shape[0], n_folds))   # per-fold predictions on the test set

    for i, (train_index, cv_index) in enumerate(skf.split(X, y)):
        print("Fold:", i)
        X_train = X.iloc[train_index]
        y_train = y[train_index]
        X_cv = X.iloc[cv_index]
        clf.fit(X_train, y_train)
        blend_train[cv_index] = clf.predict_proba(X_cv)[:, 1]
        blend_test_j[:, i] = clf.predict_proba(test)[:, 1]

    # Average the per-fold test predictions.
    prediction = blend_test_j.mean(1)
    return prediction
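# A minimal usage sketch for flatnessloss, assuming a training table whose columns are the
# model features plus 'mass' and a 'signal' label, and a test table with an 'id' column and
# the same feature columns. The file names and column handling below are illustrative
# assumptions, not taken from the original script.
import pandas as pd

train = pd.read_csv("training.csv")
test = pd.read_csv("test.csv")

X = train.drop("signal", axis=1)                       # features + 'mass'
y = train["signal"].values
feature_cols = [c for c in X.columns if c != "mass"]   # the test set has no 'mass' column

prediction = flatnessloss(X, y, test[feature_cols])
pd.DataFrame({"id": test["id"], "prediction": prediction}).to_csv("flatness_blend.csv", index=False)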
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """Test how classifiers behave on datasets with highly misbalanced sample weights."""
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    # Give signal events (label 1) a weight of 10001 and background events a weight of 1.
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
                             max_depth=10, max_features=6, min_samples_leaf=2)
rf1.fit(train[features], train["signal"])

print("Train a UGradientBoostingClassifier")
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
rf = UGradientBoostingClassifier(loss=loss, n_estimators=200, max_depth=6, learning_rate=0.15,
                                 train_features=features, subsample=0.7, random_state=369)
rf.fit(train[features + ['mass']], train['signal'])

print("Train a XGBoost model")
params = {"objective": "binary:logistic",
          "learning_rate": 0.2,
          "max_depth": 6,
          "min_child_weight": 3,
          "silent": 1,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "seed": 1}
num_trees = 400
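# The excerpt above stops after defining `params` and `num_trees`. A plausible next step,
# mirroring the xgb.train call in the excerpt below, would look like this (a sketch, not
# the original code):
import xgboost as xgb

gbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)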
print("Load the training/test data using pandas") train = pd.read_csv("../DATA/training.csv") test = pd.read_csv("../DATA/test.csv") train = mypreprocessing(train) test = mypreprocessing(test) print("Eliminate SPDhits, which makes the agreement check fail") features = list(train.columns[1:-5]) print("Train a UGradientBoostingClassifier") loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0) clf = UGradientBoostingClassifier(loss=loss, n_estimators=50, subsample=0.1, max_depth=6, min_samples_leaf=10, learning_rate=0.1, train_features=features, random_state=11) clf.fit(train[features + ['mass']], train['signal']) fb_preds = clf.predict_proba(test[features])[:,1] print("Train a Random Forest model") rf = RandomForestClassifier(n_estimators=400, n_jobs=-1, criterion="entropy", random_state=1) rf.fit(train[features], train["signal"]) print("Train a XGBoost model") params = {"objective": "binary:logistic", "eta": 0.2, "max_depth": 6, "min_child_weight": 1, "silent": 1, "colsample_bytree": 0.8, "seed": 1} num_trees=250 gbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)
loss = BinFlatnessLossFunction(["mass"], n_bins=15, uniform_label=0) clf = UGradientBoostingClassifier( loss=loss, n_estimators=150, subsample=0.1, # n_estimators = 75 max_depth=7, min_samples_leaf=10, learning_rate=0.1, train_features=features, random_state=11, ) # clf = CalibratedClassifierCV(clf, method='isotonic', cv = skf) clf.fit(train[features + ["mass"]], train["signal"]) fb_preds = clf.predict_proba(test[features])[:, 1] print "saving fb" temp = pd.DataFrame({"id": test["id"], "prediction": fb_preds}) temp.to_csv("parts/fb.csv", index=False) print ("Train a Random Forest model") rf = RandomForestClassifier(n_estimators=250, n_jobs=-1, criterion="entropy", random_state=1) rf = CalibratedClassifierCV(rf, method="isotonic", cv=skf) rf.fit(train[features], train["signal"]) # used to be n_estimators=300, 375 is better, 250 could be fine print "saving rf prediction"
print("Train a Random Fores and gradient boos model model") """ gd = GradientBoostingClassifier(n_estimators=100, random_state=5,learning_rate=0.25123,subsample=0.7,max_features=34) rf = RandomForestClassifier(n_estimators=100,random_state=5) ada= AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=100,random_state=5), n_estimators=600, random_state=5,learning_rate=0.2) ada.fit(train[features],train["signal"]) rf.fit(train[features],train["signal"]) """ print("train a UBoost classifier") loss_funct=BinFlatnessLossFunction(uniform_features=["mass"],uniform_label=0,n_bins=10) ub=UGradientBoostingClassifier(loss=loss_funct,n_estimators=100, random_state=3,learning_rate=0.2,subsample=0.7) ub.fit(train[features],train["signal"]) print("train a Gradientboost classifier") gb=GradientBoostingClassifier(n_estimators=120, random_state=3,learning_rate=0.2,subsample=0.7,max_features=34) gb.fit(train[features[0:-1]],train["signal"]) print("loading aggrement data") check_agreement = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_agreement.csv', index_col='id') print("calculating agreement probs") agreement_probs = 0.5*ub.predict_proba(check_agreement[features[0:-1]])[:, 1]+0.5*gb.predict_proba(check_agreement[features[0:-1]])[:, 1] ks = evaluation.compute_ks( agreement_probs[check_agreement['signal'].values == 0], agreement_probs[check_agreement['signal'].values == 1], check_agreement[check_agreement['signal'] == 0]['weight'].values,
def Model1():
    # Model 1 is an ensemble of XGBoost, Random Forest and Uniform Gradient Boosting classifiers
    # trained on the stacked data.
    model = 1      # model number used for feature engineering
    n_folds = 3    # number of folds used for generating meta-features
    n_stack = 15   # number of models used for stacking

    # Load data and obtain the list of features used for estimation.
    train, test, features = utils.LoadData(model)

    # Initialize models for stacking.
    clf1 = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf2 = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf3 = KNeighborsClassifier(n_neighbors=20, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf4 = KNeighborsClassifier(n_neighbors=40, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf5 = KNeighborsClassifier(n_neighbors=80, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf6 = KNeighborsClassifier(n_neighbors=160, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf7 = KNeighborsClassifier(n_neighbors=320, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf8 = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=5.0, fit_intercept=True,
                              intercept_scaling=1, class_weight=None, random_state=101,
                              solver='lbfgs', max_iter=200, multi_class='ovr', verbose=0)
    clf9 = GaussianNB()
    clf10 = SVC(C=5.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.008, shrinking=True,
                probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False,
                max_iter=-1, random_state=101)
    clf11 = RandomForestClassifier(n_estimators=250, criterion='gini', max_depth=6, min_samples_split=2,
                                   min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=0.7,
                                   max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=2,
                                   random_state=101, verbose=0, warm_start=False, class_weight=None)
    clf12 = ExtraTreesClassifier(n_estimators=250, criterion='gini', max_depth=6, min_samples_split=2,
                                 min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=0.7,
                                 max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=2,
                                 random_state=101, verbose=0, warm_start=False, class_weight=None)
    clf13 = GradientBoostingClassifier(loss='deviance', learning_rate=0.2, n_estimators=450, subsample=0.7,
                                       min_samples_split=2, min_samples_leaf=5, min_weight_fraction_leaf=0.0,
                                       max_depth=6, init=None, random_state=101, max_features=None,
                                       verbose=0, max_leaf_nodes=None, warm_start=False)
    clf14 = SGDClassifier(loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True,
                          n_iter=10, shuffle=True, verbose=0, epsilon=0.1, n_jobs=2, random_state=101,
                          learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None,
                          warm_start=False, average=False)
    clf15 = models.XGBoostClassifier(nthread=2, eta=.2, gamma=0, max_depth=6, min_child_weight=3,
                                     max_delta_step=0, subsample=0.7, colsample_bytree=0.7, silent=1,
                                     seed=101, l2_reg=1, l1_reg=0, n_estimators=450)
    clfs = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8,
            clf9, clf10, clf11, clf12, clf13, clf14, clf15]

    # Construct stacked datasets.
    train_blend, test_blend, train_probs, test_probs = utils.StackModels(
        train[features], test[features], train.signal.values, clfs, n_folds)

    # Construct data for uniform boosting.
    columns = ['p%s ' % (i) for i in range(0, n_stack)]
    meta_train = pd.DataFrame({columns[i]: train_probs[:, i] for i in range(0, n_stack)})
    meta_test = pd.DataFrame({columns[i]: test_probs[:, i] for i in range(0, n_stack)})
    train_ugb = pd.concat([train, meta_train], axis=1)
    test_ugb = pd.concat([test, meta_test], axis=1)
    # Features used for UGB training: original features + meta-features.
    features_ugb = features + columns

    # Initialize models for the ensemble.
    loss = BinFlatnessLossFunction(['mass'], n_bins=20, power=1, fl_coefficient=3, uniform_label=0)
    clf_ugb = UGradientBoostingClassifier(loss=loss, n_estimators=275, max_depth=11, min_samples_leaf=3,
                                          learning_rate=0.03, train_features=features_ugb,
                                          subsample=0.85, random_state=101)
    clf_xgb = models.XGBoostClassifier(nthread=6, eta=.0225, gamma=1.225, max_depth=11, min_child_weight=10,
                                       max_delta_step=0, subsample=0.8, colsample_bytree=0.3, silent=1,
                                       seed=101, l2_reg=1, l1_reg=0, n_estimators=1100)
    clf_rf = RandomForestClassifier(n_estimators=375, criterion='gini', max_depth=10, min_samples_split=6,
                                    min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=0.6,
                                    max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=4,
                                    random_state=101, verbose=0, warm_start=False, class_weight=None)

    # Train models.
    print("Training a Uniform Gradient Boosting model")
    clf_ugb.fit(train_ugb[features_ugb + ['mass']], train_ugb['signal'])
    preds_ugb = clf_ugb.predict_proba(test_ugb[features_ugb])[:, 1]

    print("Training a XGBoost model")
    clf_xgb.fit(train_blend, train['signal'])
    preds_xgb = clf_xgb.predict_proba(test_blend)

    print("Training a Random Forest model")
    clf_rf.fit(train_blend, train['signal'])
    preds_rf = clf_rf.predict_proba(test_blend)[:, 1]

    # Compute ensemble predictions.
    preds = 0.3 * (preds_xgb ** 0.65) * (preds_rf ** 0.35) + 0.7 * preds_ugb
    return preds
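# The final line of Model1 mixes a weighted geometric-style combination of the XGBoost and
# Random Forest probabilities with the UGB probabilities. A small standalone sketch of that
# blend with the same weights as above (purely illustrative; the helper name is not part of
# the original code):
import numpy as np

def blend_predictions(preds_xgb, preds_rf, preds_ugb):
    """Blend: 0.3 * (XGB^0.65 * RF^0.35) + 0.7 * UGB, elementwise over prediction arrays."""
    preds_xgb = np.asarray(preds_xgb)
    preds_rf = np.asarray(preds_rf)
    preds_ugb = np.asarray(preds_ugb)
    return 0.3 * (preds_xgb ** 0.65) * (preds_rf ** 0.35) + 0.7 * preds_ugb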