def test_gradient_boosting(n_samples=1000):
    """
    Check that gradient boosting trains acceptably with each supported loss.

    Classification losses are exercised through ``UGradientBoostingClassifier``
    (``score`` on an independent sample must reach 0.7); regression and
    ranking losses are exercised through ``UGradientBoostingRegressor`` (ROC
    AUC of the predictions must reach 0.7).
    """
    separation = 0.6
    n_columns = 10
    # Two independently generated samples; the test one is drawn first.
    testX, testY = generate_sample(n_samples, n_columns, separation)
    trainX, trainY = generate_sample(n_samples, n_columns, separation)
    # Variable along which the uniformity-aware losses should flatten
    # the predictions.
    flat_features = ['column0']

    classification_losses = [
        LogLossFunction(),
        AdaLossFunction(),
        losses.CompositeLossFunction(),
        losses.KnnAdaLossFunction(uniform_features=flat_features, uniform_label=1),
        losses.KnnAdaLossFunction(uniform_features=flat_features, uniform_label=[0, 1]),
        losses.BinFlatnessLossFunction(flat_features, fl_coefficient=2., uniform_label=0),
        losses.BinFlatnessLossFunction(flat_features, fl_coefficient=2., uniform_label=[0, 1]),
        losses.KnnFlatnessLossFunction(flat_features, fl_coefficient=2., uniform_label=1),
        losses.KnnFlatnessLossFunction(flat_features, fl_coefficient=2., uniform_label=[0, 1]),
    ]
    # Hyper-parameters shared by every classifier in the first loop.
    classifier_settings = dict(min_samples_split=20, max_depth=5, learning_rate=0.2,
                               subsample=0.7, n_estimators=25, train_features=None)

    for loss in classification_losses:
        booster = UGradientBoostingClassifier(loss=loss, **classifier_settings)
        fitted = booster.fit(trainX[:n_samples], trainY[:n_samples])
        quality = fitted.score(testX, testY)
        assert quality >= 0.7, "The quality is too poor: {} with loss: {}".format(quality, loss)

    # Random request ids; the column is referenced by the RankBoost loss
    # through ``request_column``.
    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    regression_losses = [
        losses.MSELossFunction(),
        losses.MAELossFunction(),
        losses.RankBoostLossFunction(request_column='fake_request'),
    ]
    for loss in regression_losses:
        print(loss)
        regressor = UGradientBoostingRegressor(
            loss=loss, max_depth=3, n_estimators=50, learning_rate=0.01, subsample=0.5,
            train_features=list(trainX.columns[1:]))
        regressor.fit(trainX, trainY)
        predictions = regressor.predict(testX)
        auc = roc_auc_score(testY, predictions)
        assert auc >= 0.7, "The quality is too poor: {} with loss: {}".format(auc, loss)
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """
    Exercise the classifier with the log-loss and the exponential (Ada) loss.

    Besides the quality check, this verifies the reported feature count, the
    staged probability predictions, and that cloned / deep-copied classifiers
    can be produced, the deep copy giving identical predictions.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)

    settings = dict(min_samples_split=20, max_depth=5, learning_rate=.2,
                    subsample=0.7, n_estimators=10, train_features=None)

    for loss in (LogLossFunction(), AdaLossFunction()):
        booster = UGradientBoostingClassifier(loss=loss, **settings)
        booster.fit(trainX, trainY)

        assert booster.n_features == n_features
        assert len(booster.feature_importances_) == n_features

        # Every stage must yield an (n_samples, 2) probability matrix; after
        # the loop `stage_proba` holds the output of the final stage.
        for stage_proba in booster.staged_predict_proba(testX):
            assert stage_proba.shape == (n_samples, 2)
        assert numpy.all(stage_proba == booster.predict_proba(testX))
        assert roc_auc_score(testY, stage_proba[:, 1]) > 0.8, 'quality is too low'

        # The estimator has to survive both sklearn cloning and deep copying.
        _ = clone(booster)
        duplicate = copy.deepcopy(booster)
        original_proba = booster.predict_proba(trainX)
        duplicate_proba = duplicate.predict_proba(trainX)
        assert numpy.all(original_proba == duplicate_proba), 'copied classifier is different'
示例#3
0
def flatnessloss(X, y, test):
    """
    Train a flatness-loss boosting classifier on 7 stratified folds and
    return the fold-averaged prediction for ``test``.

    The classifier uses every column of ``X`` except 'mass' as a training
    feature; 'mass' is the variable handed to ``BinFlatnessLossFunction``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training data. Must contain a 'mass' column.
    y : array-like
        Labels, aligned row by row with ``X``.
    test : pandas.DataFrame
        Data to predict; passed unchanged to ``clf.predict_proba``.

    Returns
    -------
    numpy.ndarray
        For each row of ``test``, the mean over folds of the second column of
        ``predict_proba``.
    """
    features = list(X.columns)
    features.remove('mass')
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
    clf = UGradientBoostingClassifier(loss=loss, n_estimators=300, subsample=0.7,
                                      max_depth=9, min_samples_leaf=8,
                                      learning_rate=0.1, train_features=features, random_state=11)

    # Shuffle the rows. All selections below are positional (``iloc`` / plain
    # ndarray indexing) because the fold indices produced by StratifiedKFold
    # are positions; label-based lookup on the shuffled frame would pick
    # different rows.
    order = np.random.permutation(X.shape[0])
    X = X.iloc[order]
    y = np.asarray(y)[order]

    skf = cross_validation.StratifiedKFold(y, n_folds=7)
    # One column of test predictions per fold.
    blend_test_j = np.zeros((test.shape[0], len(skf)))

    for i, (train_index, _) in enumerate(skf):
        # Single-argument form prints the same text on Python 2 and 3.
        print("Fold: %s" % i)
        clf.fit(X.iloc[train_index], y[train_index])
        blend_test_j[:, i] = clf.predict_proba(test)[:, 1]

    return blend_test_j.mean(1)
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Check the classifiers on data whose sample weights are heavily imbalanced.

    Events with label 1 receive weight 10001 while events with label 0 receive
    weight 1; the weighted ROC AUC on the test sample must exceed 0.8.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)

    heavy = 10000
    trainW = trainY * heavy + 1
    testW = testY * heavy + 1

    settings = dict(min_samples_split=20, max_depth=5, learning_rate=.2,
                    subsample=0.7, n_estimators=10, train_features=None)

    for loss in (LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()):
        booster = UGradientBoostingClassifier(loss=loss, **settings)
        booster.fit(trainX, trainY, sample_weight=trainW)
        proba = booster.predict_proba(testX)
        weighted_auc = roc_auc_score(testY, proba[:, 1], sample_weight=testW)
        assert weighted_auc > 0.8, 'quality is too low'
示例#5
0
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Verify training on datasets with strongly imbalanced sample weights.

    Label-1 events are weighted 10001 and label-0 events are weighted 1, both
    for training and for the weighted ROC AUC evaluated on the test sample,
    which has to be above 0.8 for every loss tried.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)

    weight_scale = 10000
    trainW = trainY * weight_scale + 1
    testW = testY * weight_scale + 1

    losses_to_check = [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]
    for loss in losses_to_check:
        model = UGradientBoostingClassifier(
            loss=loss,
            min_samples_split=20,
            max_depth=5,
            learning_rate=.2,
            subsample=0.7,
            n_estimators=10,
            train_features=None,
        )
        model.fit(trainX, trainY, sample_weight=trainW)
        signal_proba = model.predict_proba(testX)[:, 1]
        assert roc_auc_score(testY, signal_proba, sample_weight=testW) > 0.8, 'quality is too low'
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """
    Run the classifier with the log-loss and the Ada loss and check its API.

    Checks the reported number of features, the shape and final value of the
    staged probability predictions, the ROC AUC on the test sample, and that
    a deep copy of the fitted classifier predicts exactly the same values.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)

    for loss in (LogLossFunction(), AdaLossFunction()):
        model = UGradientBoostingClassifier(
            loss=loss,
            min_samples_split=20,
            max_depth=5,
            learning_rate=.2,
            subsample=0.7,
            n_estimators=10,
            train_features=None,
        )
        model.fit(trainX, trainY)

        assert model.n_features == n_features
        assert len(model.feature_importances_) == n_features

        # Staged predictions: each stage must have the right shape, and the
        # last stage (left in `staged` after the loop) must equal the final
        # prediction.
        for staged in model.staged_predict_proba(testX):
            assert staged.shape == (n_samples, 2)
        assert numpy.all(staged == model.predict_proba(testX))
        assert roc_auc_score(testY, staged[:, 1]) > 0.8, 'quality is too low'

        # Cloning must not raise; a deep copy must behave identically.
        _ = clone(model)
        twin = copy.deepcopy(model)
        same = model.predict_proba(trainX) == twin.predict_proba(trainX)
        assert same.all(), 'copied classifier is different'
def test_gradient_boosting(n_samples=1000):
    """
    Verify that gradient boosting works with the different loss functions.

    Part one fits ``UGradientBoostingClassifier`` with each classification
    loss and requires ``score`` of at least 0.7 on the test sample. Part two
    fits ``UGradientBoostingRegressor`` with the MSE, MAE and RankBoost losses
    and requires a ROC AUC of at least 0.7 for its predictions.
    """
    distance = 0.6
    # Samples are generated test-first, then train, 10 columns each.
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # The uniformity-aware losses are asked to be flat along this variable.
    uniform_features = ['column0']

    both_labels = [0, 1]
    classification_losses = (
        LogLossFunction(),
        AdaLossFunction(),
        losses.CompositeLossFunction(),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1]),
        losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0),
        losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=list(both_labels)),
        losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1),
        losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=list(both_labels)),
    )

    for loss in classification_losses:
        classifier = UGradientBoostingClassifier(
            loss=loss,
            min_samples_split=20,
            max_depth=5,
            learning_rate=0.2,
            subsample=0.7,
            n_estimators=25,
            train_features=None,
        ).fit(trainX[:n_samples], trainY[:n_samples])
        score = classifier.score(testX, testY)
        assert score >= 0.7, "The quality is too poor: {} with loss: {}".format(score, loss)

    # Extra column of random request ids, named in the RankBoost loss below.
    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    regression_losses = (
        losses.MSELossFunction(),
        losses.MAELossFunction(),
        losses.RankBoostLossFunction(request_column='fake_request'),
    )
    for loss in regression_losses:
        print(loss)
        # Every column except the first one is used for training.
        used_columns = list(trainX.columns[1:])
        regressor = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50,
                                               learning_rate=0.01, subsample=0.5,
                                               train_features=used_columns)
        regressor.fit(trainX, trainY)
        auc = roc_auc_score(testY, regressor.predict(testX))
        assert auc >= 0.7, "The quality is too poor: {} with loss: {}".format(auc, loss)
示例#8
0
def flatnessloss(X, y, test):
    """
    Train a flatness-loss boosting classifier on 7 stratified folds and
    return the fold-averaged prediction for ``test``.

    The classifier uses every column of ``X`` except 'mass' as a training
    feature; 'mass' is the variable handed to ``BinFlatnessLossFunction``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training data. Must contain a 'mass' column.
    y : array-like
        Labels, aligned row by row with ``X``.
    test : pandas.DataFrame
        Data to predict; passed unchanged to ``clf.predict_proba``.

    Returns
    -------
    numpy.ndarray
        For each row of ``test``, the mean over folds of the second column of
        ``predict_proba``.
    """
    features = list(X.columns)
    features.remove('mass')
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
    clf = UGradientBoostingClassifier(loss=loss,
                                      n_estimators=300,
                                      subsample=0.7,
                                      max_depth=9,
                                      min_samples_leaf=8,
                                      learning_rate=0.1,
                                      train_features=features,
                                      random_state=11)

    # Shuffle the rows. Selection is positional throughout (``iloc`` and
    # ndarray indexing): StratifiedKFold yields positions, and label-based
    # lookup on the shuffled frame would address different rows.
    arr = np.random.permutation(X.shape[0])
    X = X.iloc[arr]
    y = np.asarray(y)[arr]

    skf = cross_validation.StratifiedKFold(y, n_folds=7)
    # One column of test predictions per fold.
    blend_test_j = np.zeros((test.shape[0], len(skf)))

    for i, (train_index, _) in enumerate(skf):
        # Single-argument form prints the same text on Python 2 and 3.
        print("Fold: %s" % i)
        X_train = X.iloc[train_index]
        y_train = y[train_index]
        clf.fit(X_train, y_train)

        blend_test_j[:, i] = clf.predict_proba(test)[:, 1]

    return blend_test_j.mean(1)
示例#9
0
                                 max_depth=10,
                                 max_features=6,
                                 min_samples_leaf=2)

    rf1.fit(train[features], train["signal"])
    print("Train a UGradientBoostingClassifier")
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)

    rf = UGradientBoostingClassifier(loss=loss,
                                     n_estimators=200,
                                     max_depth=6,
                                     learning_rate=0.15,
                                     train_features=features,
                                     subsample=0.7,
                                     random_state=369)
    rf.fit(train[features + ['mass']], train['signal'])

    print("Train a XGBoost model")
    params = {
        "objective": "binary:logistic",
        "learning_rate": 0.2,
        "max_depth": 6,
        "min_child_weight": 3,
        "silent": 1,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "seed": 1
    }

    num_trees = 400
# Script: load the data, then train a uniform gradient boosting classifier,
# a random forest and an XGBoost booster on the same feature list.
print("Load the training/test data using pandas")
train = pd.read_csv("../DATA/training.csv")
test  = pd.read_csv("../DATA/test.csv")

# Both tables go through the same preprocessing helper.
train = mypreprocessing(train)
test = mypreprocessing(test)

print("Eliminate SPDhits, which makes the agreement check fail")
# Features are chosen by position: the first column and the last five columns
# of `train` are left out.
# NOTE(review): presumably this slice is what excludes SPDhits (and the id /
# target columns) -- confirm against the column order after preprocessing.
features = list(train.columns[1:-5])
print("Train a UGradientBoostingClassifier")
# 'mass' is given to the flatness loss, while the trees are restricted to
# `features` via train_features; hence 'mass' is appended only for fit().
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(loss=loss, n_estimators=50, subsample=0.1, 
                                  max_depth=6, min_samples_leaf=10,
                                  learning_rate=0.1, train_features=features, random_state=11)
clf.fit(train[features + ['mass']], train['signal'])
# Keep the second column of predict_proba for the test rows.
fb_preds = clf.predict_proba(test[features])[:,1]
print("Train a Random Forest model")
rf = RandomForestClassifier(n_estimators=400, n_jobs=-1, criterion="entropy", random_state=1)
rf.fit(train[features], train["signal"])

print("Train a XGBoost model")
# Parameter dictionary handed to xgb.train below.
params = {"objective": "binary:logistic",
          "eta": 0.2,
          "max_depth": 6,
          "min_child_weight": 1,
          "silent": 1,
          "colsample_bytree": 0.8,
          "seed": 1}
num_trees=250
# num_trees is passed as the third positional argument of xgb.train.
gbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)
# Train the uniform gradient boosting classifier. 'mass' is given to the
# flatness loss while the trees are restricted to `features`, so 'mass' is
# appended to the columns only for fit().
loss = BinFlatnessLossFunction(["mass"], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(
    loss=loss,
    n_estimators=150,
    subsample=0.1,  # n_estimators = 75
    max_depth=7,
    min_samples_leaf=10,
    learning_rate=0.1,
    train_features=features,
    random_state=11,
)


# clf = CalibratedClassifierCV(clf, method='isotonic', cv = skf)

clf.fit(train[features + ["mass"]], train["signal"])

# Keep the second column of predict_proba for the test rows and store it
# next to the test ids.
fb_preds = clf.predict_proba(test[features])[:, 1]
print("saving fb")
temp = pd.DataFrame({"id": test["id"], "prediction": fb_preds})
temp.to_csv("parts/fb.csv", index=False)


print("Train a Random Forest model")
rf = RandomForestClassifier(n_estimators=250, n_jobs=-1, criterion="entropy", random_state=1)

# Wrap the forest in an isotonic calibrator.
# NOTE(review): `skf` is not defined in this part of the script -- it must be
# created earlier in the file; confirm.
rf = CalibratedClassifierCV(rf, method="isotonic", cv=skf)

rf.fit(train[features], train["signal"])  # used to be n_estimators=300, 375 is better, 250 could be fine

print("saving rf prediction")
# Script: train a uniform boosting classifier and a gradient boosting
# classifier, then average their probabilities on the agreement sample.
print("Train a Random Fores and gradient boos model model")
# The bare string literal below holds disabled code; as an expression
# statement it has no runtime effect.
"""
gd = GradientBoostingClassifier(n_estimators=100, random_state=5,learning_rate=0.25123,subsample=0.7,max_features=34)


rf = RandomForestClassifier(n_estimators=100,random_state=5)
ada= AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=100,random_state=5),
                        n_estimators=600, random_state=5,learning_rate=0.2)
ada.fit(train[features],train["signal"])
rf.fit(train[features],train["signal"])

"""
print("train a UBoost classifier")
# Flatness loss over 'mass'; no train_features is given here, and the
# classifier is fitted on train[features].
# NOTE(review): this only works if 'mass' is one of `features` -- confirm.
loss_funct=BinFlatnessLossFunction(uniform_features=["mass"],uniform_label=0,n_bins=10)
ub=UGradientBoostingClassifier(loss=loss_funct,n_estimators=100, random_state=3,learning_rate=0.2,subsample=0.7)
ub.fit(train[features],train["signal"])

print("train a Gradientboost classifier")
# The plain gradient boosting model is fitted without the last entry of
# `features`.
gb=GradientBoostingClassifier(n_estimators=120, random_state=3,learning_rate=0.2,subsample=0.7,max_features=34)
gb.fit(train[features[0:-1]],train["signal"])

print("loading aggrement data")
# Hard-coded absolute path on the author's machine.
check_agreement = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_agreement.csv', index_col='id')

print("calculating agreement probs")
# Equal-weight average of the two models' second predict_proba column.
# NOTE(review): `ub` was fitted on train[features] but is evaluated here on
# features[0:-1] (one column fewer) -- confirm this is intended.
agreement_probs = 0.5*ub.predict_proba(check_agreement[features[0:-1]])[:, 1]+0.5*gb.predict_proba(check_agreement[features[0:-1]])[:, 1] 

ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
def Model1():
    """
    Produce the predictions of Model 1.

    Model 1 is an ensemble of XGBoost, Random Forest and Uniform Gradient
    Boosting classifiers trained on stacked data: fifteen first-level
    classifiers are handed to ``utils.StackModels``; the XGBoost and Random
    Forest models are fitted on the blended training set it returns, and the
    uniform boosting model is fitted on the original features extended with
    the first-level probabilities.

    Returns
    -------
    The blend ``0.3 * xgb**0.65 * rf**0.35 + 0.7 * ugb`` of the three models'
    predictions for the test data.
    """
    model = 1     # model number used for feature engineering
    n_folds = 3   # number of folds for generating meta-features
    n_stack = 15  # number of models used for stacking

    # Load data and obtain the list of features for estimation.
    train, test, features = utils.LoadData(model)

    # ---- First-level models for stacking ----

    # Seven k-NN classifiers that differ only in the neighbourhood size.
    knn_stack = [
        KNeighborsClassifier(n_neighbors=k, weights='uniform', algorithm='auto',
                             leaf_size=30, p=2, metric='minkowski', metric_params=None)
        for k in (5, 10, 20, 40, 80, 160, 320)
    ]

    logistic = LogisticRegression(
        penalty='l2', dual=False, tol=0.0001, C=5.0, fit_intercept=True,
        intercept_scaling=1, class_weight=None, random_state=101,
        solver='lbfgs', max_iter=200, multi_class='ovr', verbose=0)

    naive_bayes = GaussianNB()

    svm = SVC(
        C=5.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.008, shrinking=True,
        probability=True, tol=0.001, cache_size=200, class_weight=None,
        verbose=False, max_iter=-1, random_state=101)

    # The random forest and the extra-trees model share every setting.
    tree_ensemble_settings = dict(
        n_estimators=250, criterion='gini', max_depth=6, min_samples_split=2,
        min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=0.7,
        max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=2,
        random_state=101, verbose=0, warm_start=False, class_weight=None)
    forest = RandomForestClassifier(**tree_ensemble_settings)
    extra_trees = ExtraTreesClassifier(**tree_ensemble_settings)

    boosting = GradientBoostingClassifier(
        loss='deviance', learning_rate=0.2, n_estimators=450, subsample=0.7,
        min_samples_split=2, min_samples_leaf=5, min_weight_fraction_leaf=0.0,
        max_depth=6, init=None, random_state=101, max_features=None,
        verbose=0, max_leaf_nodes=None, warm_start=False)

    sgd = SGDClassifier(
        loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15,
        fit_intercept=True, n_iter=10, shuffle=True, verbose=0, epsilon=0.1,
        n_jobs=2, random_state=101, learning_rate='optimal', eta0=0.0,
        power_t=0.5, class_weight=None, warm_start=False, average=False)

    xgboost_stack = models.XGBoostClassifier(
        nthread=2, eta=.2, gamma=0, max_depth=6, min_child_weight=3,
        max_delta_step=0, subsample=0.7, colsample_bytree=0.7, silent=1,
        seed=101, l2_reg=1, l1_reg=0, n_estimators=450)

    clfs = knn_stack + [logistic, naive_bayes, svm, forest, extra_trees,
                        boosting, sgd, xgboost_stack]

    # ---- Stacked datasets ----
    train_blend, test_blend, train_probs, test_probs = utils.StackModels(
        train[features], test[features], train.signal.values, clfs, n_folds)

    # ---- Data for uniform boosting ----
    # One meta-feature column per stacked model, taken from the matching
    # column of the probability matrices.
    columns = ['p%s ' % (i) for i in range(0, n_stack)]
    meta_train = pd.DataFrame(
        {name: train_probs[:, idx] for idx, name in enumerate(columns)})
    meta_test = pd.DataFrame(
        {name: test_probs[:, idx] for idx, name in enumerate(columns)})
    train_ugb = pd.concat([train, meta_train], axis=1)
    test_ugb = pd.concat([test, meta_test], axis=1)
    # Original features + meta-features.
    features_ugb = features + columns

    # ---- Models of the final ensemble ----
    loss = BinFlatnessLossFunction(['mass'], n_bins=20, power=1,
                                   fl_coefficient=3, uniform_label=0)

    clf_ugb = UGradientBoostingClassifier(
        loss=loss, n_estimators=275, max_depth=11, min_samples_leaf=3,
        learning_rate=0.03, train_features=features_ugb, subsample=0.85,
        random_state=101)

    clf_xgb = models.XGBoostClassifier(
        nthread=6, eta=.0225, gamma=1.225, max_depth=11, min_child_weight=10,
        max_delta_step=0, subsample=0.8, colsample_bytree=0.3, silent=1,
        seed=101, l2_reg=1, l1_reg=0, n_estimators=1100)

    clf_rf = RandomForestClassifier(
        n_estimators=375, criterion='gini', max_depth=10, min_samples_split=6,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=0.6,
        max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=4,
        random_state=101, verbose=0, warm_start=False, class_weight=None)

    # ---- Training ----
    print("Training a Uniform Gradient Boosting model")
    # 'mass' is appended for fit() only: it is the variable of the flatness
    # loss, not one of the train_features.
    ugb_fit_columns = features_ugb + ['mass']
    clf_ugb.fit(train_ugb[ugb_fit_columns], train_ugb['signal'])
    preds_ugb = clf_ugb.predict_proba(test_ugb[features_ugb])[:, 1]

    print("Training a XGBoost model")
    clf_xgb.fit(train_blend, train['signal'])
    preds_xgb = clf_xgb.predict_proba(test_blend)

    print("Training a Random Forest model")
    clf_rf.fit(train_blend, train['signal'])
    preds_rf = clf_rf.predict_proba(test_blend)[:, 1]

    # ---- Ensemble ----
    # Weighted geometric mix of XGBoost and Random Forest (weight 0.3) plus
    # the uniform boosting prediction (weight 0.7).
    xgb_part = 0.3 * preds_xgb ** 0.65
    return xgb_part * preds_rf ** 0.35 + 0.7 * preds_ugb
def Model1():
    """
    Compute the test predictions of Model 1.

    Model 1 is an ensemble of XGBoost, Random Forest and Uniform Gradient
    Boosting classifiers which are trained using stacked data. Fifteen
    first-level classifiers are stacked through ``utils.StackModels``; the
    XGBoost and Random Forest models learn from the resulting blended set,
    while the uniform boosting model learns from the original features plus
    one meta-feature column per first-level classifier.

    Returns
    -------
    ``0.3 * xgb**0.65 * rf**0.35 + 0.7 * ugb``, computed from the three
    models' predictions for the test data.
    """
    model = 1     # model number for feature engineering
    n_folds = 3   # number of folds for generating meta-features
    n_stack = 15  # number of models used for stacking

    # Load data and obtain the list of features for estimation.
    train, test, features = utils.LoadData(model)

    # Settings common to all seven k-NN classifiers; only n_neighbors varies.
    knn_common = dict(weights='uniform', algorithm='auto', leaf_size=30,
                      p=2, metric='minkowski', metric_params=None)
    clfs = [KNeighborsClassifier(n_neighbors=n_neighbors, **knn_common)
            for n_neighbors in (5, 10, 20, 40, 80, 160, 320)]

    clfs.append(LogisticRegression(
        penalty='l2', dual=False, tol=0.0001, C=5.0, fit_intercept=True,
        intercept_scaling=1, class_weight=None, random_state=101,
        solver='lbfgs', max_iter=200, multi_class='ovr', verbose=0))

    clfs.append(GaussianNB())

    clfs.append(SVC(
        C=5.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.008, shrinking=True,
        probability=True, tol=0.001, cache_size=200, class_weight=None,
        verbose=False, max_iter=-1, random_state=101))

    # Random forest and extra-trees are configured identically.
    shared_tree_settings = dict(
        n_estimators=250, criterion='gini', max_depth=6, min_samples_split=2,
        min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=0.7,
        max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=2,
        random_state=101, verbose=0, warm_start=False, class_weight=None)
    clfs.append(RandomForestClassifier(**shared_tree_settings))
    clfs.append(ExtraTreesClassifier(**shared_tree_settings))

    clfs.append(GradientBoostingClassifier(
        loss='deviance', learning_rate=0.2, n_estimators=450, subsample=0.7,
        min_samples_split=2, min_samples_leaf=5, min_weight_fraction_leaf=0.0,
        max_depth=6, init=None, random_state=101, max_features=None,
        verbose=0, max_leaf_nodes=None, warm_start=False))

    clfs.append(SGDClassifier(
        loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15,
        fit_intercept=True, n_iter=10, shuffle=True, verbose=0, epsilon=0.1,
        n_jobs=2, random_state=101, learning_rate='optimal', eta0=0.0,
        power_t=0.5, class_weight=None, warm_start=False, average=False))

    clfs.append(models.XGBoostClassifier(
        nthread=2, eta=.2, gamma=0, max_depth=6, min_child_weight=3,
        max_delta_step=0, subsample=0.7, colsample_bytree=0.7, silent=1,
        seed=101, l2_reg=1, l1_reg=0, n_estimators=450))

    # Construct stacked datasets.
    stacked = utils.StackModels(train[features], test[features],
                                train.signal.values, clfs, n_folds)
    train_blend, test_blend, train_probs, test_probs = stacked

    # Construct data for uniform boosting: one meta-feature column per
    # stacked model, appended to the original tables.
    columns = ['p%s ' % (i) for i in range(0, n_stack)]
    meta_train = pd.DataFrame(
        {col: train_probs[:, pos] for pos, col in enumerate(columns)})
    meta_test = pd.DataFrame(
        {col: test_probs[:, pos] for pos, col in enumerate(columns)})
    train_ugb = pd.concat([train, meta_train], axis=1)
    test_ugb = pd.concat([test, meta_test], axis=1)
    # Features used for UGB training (original features + meta-features).
    features_ugb = features + columns

    # Initialize models for the ensemble.
    loss = BinFlatnessLossFunction(
        ['mass'],
        n_bins=20,
        power=1,
        fl_coefficient=3,
        uniform_label=0,
    )
    clf_ugb = UGradientBoostingClassifier(
        loss=loss,
        n_estimators=275,
        max_depth=11,
        min_samples_leaf=3,
        learning_rate=0.03,
        train_features=features_ugb,
        subsample=0.85,
        random_state=101,
    )
    clf_xgb = models.XGBoostClassifier(
        nthread=6, eta=.0225, gamma=1.225, max_depth=11, min_child_weight=10,
        max_delta_step=0, subsample=0.8, colsample_bytree=0.3, silent=1,
        seed=101, l2_reg=1, l1_reg=0, n_estimators=1100)
    clf_rf = RandomForestClassifier(
        n_estimators=375, criterion='gini', max_depth=10, min_samples_split=6,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=0.6,
        max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=4,
        random_state=101, verbose=0, warm_start=False, class_weight=None)

    # Train models.
    print("Training a Uniform Gradient Boosting model")
    # fit() additionally receives 'mass', the variable of the flatness loss.
    clf_ugb.fit(train_ugb[features_ugb + ['mass']], train_ugb['signal'])
    ugb_proba = clf_ugb.predict_proba(test_ugb[features_ugb])
    preds_ugb = ugb_proba[:, 1]

    print("Training a XGBoost model")
    clf_xgb.fit(train_blend, train['signal'])
    preds_xgb = clf_xgb.predict_proba(test_blend)

    print("Training a Random Forest model")
    clf_rf.fit(train_blend, train['signal'])
    rf_proba = clf_rf.predict_proba(test_blend)
    preds_rf = rf_proba[:, 1]

    # Compute ensemble predictions.
    preds = 0.3 * preds_xgb ** 0.65 * preds_rf ** 0.35 + 0.7 * preds_ugb

    return preds