Exemplo n.º 1
0
def ctr_gbdt(model='sklearn-clicklog', from_cache=False, train_dataset_length=100000, test_dataset_length=100000):
    """Fit a gradient-boosted classifier and print its train/test log loss.

    The two values returned by ``create_dataset`` are each passed through
    ``clean_data`` to obtain a features/labels pair. Fitting and evaluation
    are wrapped in ``Timer`` contexts. Nothing is returned; the two losses
    are printed.
    """
    train_source, test_source = create_dataset(
        model, from_cache, train_dataset_length, test_dataset_length)

    booster_settings = dict(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=30,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=5,
    )
    booster = GradientBoostingClassifier(**booster_settings)

    features_train, labels_train = clean_data(train_source)
    features_test, labels_test = clean_data(test_source)

    with Timer('fit model'):
        booster.fit(features_train, labels_train)

    with Timer('evaluate model'):
        proba_train = booster.predict_proba(features_train)
        proba_test = booster.predict_proba(features_test)

        loss_train = log_loss(labels_train, proba_train)
        loss_test = log_loss(labels_test, proba_test)

    print('loss_train: %s' % loss_train)
    print('loss_test: %s' % loss_test)
Exemplo n.º 2
0
def ensembleGBM(derived_data_path, X_train, Y_train, X_test, seed=60):
    """Fit seven gradient-boosting models and return their averaged class-1 probability.

    The fitted models are handed to ``saveObject`` as a list under the name
    'GBM_classifiers.obj' before the test predictions are computed.

    Returns the mean over the seven models of ``predict_proba(X_test)[:, 1]``.
    """
    random.seed(seed)

    # (n_estimators, learning_rate, min_samples_leaf, max_depth) for models 1..7;
    # max_features is 0.2 for every member.
    member_settings = [
        (1500, 0.008, 5, 7),
        (1700, 0.007, 5, 7),
        (1600, 0.0075, 5, 7),
        (1650, 0.007, 5, 8),
        (1750, 0.00725, 6, 7),
        (1550, 0.00775, 4, 7),
        (1850, 0.00725, 5, 6),
    ]
    members = [
        GradientBoostingClassifier(n_estimators=trees, learning_rate=rate,
                                   min_samples_leaf=leaf, max_features=0.2,
                                   max_depth=depth)
        for trees, rate, leaf, depth in member_settings
    ]

    for number, member in enumerate(members, 1):
        print("Running Model %d" % number)
        member.fit(X_train, Y_train)

    saveObject(derived_data_path, 'GBM_classifiers.obj', members)

    # Accumulate left to right so the floating-point sum matches model order.
    total = members[0].predict_proba(X_test)[:, 1]
    for member in members[1:]:
        total = total + member.predict_proba(X_test)[:, 1]

    return float(1) / 7 * total
Exemplo n.º 3
0
def predict(fea, df, t, t9):
    """Train a default GradientBoostingClassifier on selected columns and print AUCs.

    Parameters
    ----------
    fea : iterable of str
        Base feature names. A column is selected when its name equals a base
        name, or a base name followed by ``_x`` or ``_y``. The column
        ``'New_y'`` is always excluded.
    df : pandas.DataFrame
        Data with a ``label`` column.
    t, t9 : row selectors used as ``df[t]`` / ``df[t9]``
        ``t`` picks the rows used for the train/test split and ``t9`` the rows
        scored for the "September" AUC (presumably boolean row masks; confirm
        with the caller).

    Returns
    -------
    tuple
        ``(Un, clf)``: the boolean column mask and the fitted classifier.
    """
    columns = df.columns
    # Start from an all-False mask unless a column is literally named 'Blank'.
    Un = columns == 'Blank'
    # Fix: iterate the ``fea`` argument; the original looped over the
    # undefined name ``Fea`` and never used the parameter.
    for f in fea:
        Un = Un | (columns == f)
        Un = Un | (columns == (f + '_x'))
        Un = Un | (columns == (f + '_y'))
    Un = Un & (columns != 'New_y')

    clf = GradientBoostingClassifier()
    y = df[t].label
    X = df[t].ix[:, Un]
    # test_size=0.9: only 10% of the selected rows are used for fitting.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=1)
    clf.fit(X_train, y_train)

    test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print('Testing AUC: \t' + str(test_auc))
    september_auc = roc_auc_score(df[t9].label, clf.predict_proba(df[t9].ix[:, Un])[:, 1])
    print('September AUC: \t' + str(september_auc))
    print(X.columns)
    print(clf.feature_importances_)
    return Un, clf
Exemplo n.º 4
0
def ensembleGBMTest(derived_data_path, X_train, Y_train, X_test, Y_test):
    """Fit the seven-model GBM ensemble and print per-model and combined gini scores.

    Each model is scored with ``gini(model, X_test, Y_test)``. The combined
    score passes the plain (un-averaged) sum of the models' class-1
    probabilities to ``giniNoEstimator``. The fitted models are finally
    handed to ``saveObject`` as a list. Nothing is returned.
    """
    random.seed(60)

    # (n_estimators, learning_rate, min_samples_leaf, max_depth) for GBM1..GBM7;
    # max_features is 0.2 for every member.
    member_settings = [
        (1500, 0.008, 5, 7),
        (1700, 0.007, 5, 7),
        (1600, 0.0075, 5, 7),
        (1650, 0.007, 5, 8),
        (1750, 0.00725, 6, 7),
        (1550, 0.00775, 4, 7),
        (1850, 0.00725, 5, 6),
    ]
    members = [
        GradientBoostingClassifier(n_estimators=trees, learning_rate=rate,
                                   min_samples_leaf=leaf, max_features=0.2,
                                   max_depth=depth)
        for trees, rate, leaf, depth in member_settings
    ]

    for member in members:
        member.fit(X_train, Y_train)

    for number, member in enumerate(members, 1):
        print("GBM%d: %f" % (number, gini(member, X_test, Y_test)))

    # now combine! (left-to-right sum, same order as the models)
    combine = members[0].predict_proba(X_test)[:, 1]
    for member in members[1:]:
        combine = combine + member.predict_proba(X_test)[:, 1]
    print("With our powers combined: %f" % (giniNoEstimator(Y_test, combine)))

    saveObject(derived_data_path, 'GBM_classifiers.obj', members)
class TestGradientBoostingClassifierConverter(TestCase):
    """Checks on the PMML produced for a fitted GradientBoostingClassifier."""

    def setUp(self):
        np.random.seed(1)
        # Four two-feature samples; only [0, 0] belongs to class 0.
        samples = [[0, 0], [0, 1], [1, 0], [1, 1]]
        targets = [0, 1, 1, 1]
        self.est = GradientBoostingClassifier(max_depth=2, n_estimators=10)
        self.est.fit(samples, targets)

        def feature_pair():
            # Build fresh feature objects on every call so the INPUT and
            # MODEL schemas do not share instances.
            return [IntegerNumericFeature("x1"), StringCategoricalFeature("x2", ["zero", "one"])]

        schemas = {
            Schema.INPUT: feature_pair(),
            Schema.MODEL: feature_pair(),
            Schema.DERIVED: [],
            Schema.OUTPUT: [IntegerCategoricalFeature("output", [0, 1])],
        }
        self.ctx = TransformationContext(schemas)
        self.converter = GradientBoostingConverter(estimator=self.est, context=self.ctx)

    def _check_mining_model(self, document):
        """Run the assertions shared by both tests on the first MiningModel."""
        mm = document.MiningModel[0]
        assert mm.MiningSchema is not None, "Missing mining schema"
        assert len(mm.MiningSchema.MiningField) == 2, "Wrong number of mining fields"
        assert mm.Segmentation is not None, "Missing segmentation root"

    def test_transform(self):
        self._check_mining_model(self.converter.pmml())

    def test_transform_with_verification(self):
        # (x1 value, x2 category name, numeric code fed to the estimator for x2)
        cases = [(0, "zero", 0), (0, "one", 1), (1, "zero", 0), (1, "one", 1)]
        verification_rows = [
            {"x1": x1, "x2": x2_name, "output": self.est.predict_proba([[x1, x2_code]])[0, 1]}
            for x1, x2_name, x2_code in cases
        ]
        self._check_mining_model(self.converter.pmml(verification_rows))
def GB_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    """Fit a gradient-boosting classifier and return class probabilities for three sets.

    Prints the accuracy on the cross-validation split, a confusion table of
    actual vs. predicted labels (decoded through the module-level
    ``label_enc``), the log loss on the cross-validation split and the
    elapsed time.

    Parameters
    ----------
    X_train, Y_train : training features and labels.
    X_cv, Y_cv : cross-validation features and labels.
    X_test : test features (``Y_test`` is accepted but not used).
    Actual_DS : features of the data set to be scored.

    Returns
    -------
    tuple of pandas.DataFrame
        ``predict_proba`` output for ``X_cv``, ``X_test`` and ``Actual_DS``.
    """
    print("***************Starting Gradient Boosting***************")
    t0 = time()
    clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)

    print("Gradient Boosting - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    # Fix: divide each ROW by its own total (axis=0 aligns the row-sum Series
    # on the row index). The original axis=1 aligned the row totals on the
    # column labels, so 'pct' was not a per-row percentage.
    row_totals = Summary.sum(axis=1)
    Summary['pct'] = Summary.divide(row_totals, axis=0).max(axis=1) * 100
    print(Summary)

    # Check with log loss function
    epsilon = 1e-15
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=epsilon, normalize=True)
    print(ll_output2)

    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending Gradient Boosting***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
Exemplo n.º 7
0
def predict(fea1, fea2, df, t, t9):
    """Blend two gradient-boosting models trained on two feature sets and print AUCs.

    For each of ``fea1`` and ``fea2`` a column mask is built (a column is
    selected when its name equals a feature name, or a feature name followed
    by ``_x`` / ``_y``; ``'quarterly_attrition_rate_y'`` is always excluded),
    a default ``GradientBoostingClassifier`` is fitted on min-max-scaled
    training rows, and the rows selected by ``t9`` are scored. The two score
    vectors are blended with weights 0.73 and 0.27.

    Parameters
    ----------
    fea1, fea2 : iterables of str
        Base feature names of the two models.
    df : pandas.DataFrame
        Data with a ``label`` column.
    t, t9 : row selectors used as ``df[t]`` / ``df[t9]``
        (presumably boolean row masks; confirm with the caller).

    Returns
    -------
    tuple
        ``(Un, clf)`` of the LAST feature set only (``fea2``).
    """
    weight = [0.73, 0.27]
    df9 = df[t9]
    X_1 = df[t]
    y = X_1.label
    tave = np.zeros(len(df9))

    for model_weight, fea in zip(weight, [fea1, fea2]):
        # Start from an all-False mask unless a column is literally named 'Blank'.
        Un = df.columns == 'Blank'
        for f in fea:
            Un = Un | (df.columns == f)
            Un = Un | (df.columns == (f + '_x'))
            Un = Un | (df.columns == (f + '_y'))
        Un = Un & (df.columns != 'quarterly_attrition_rate_y')

        clf = GradientBoostingClassifier()
        X = X_1.ix[:, Un]
        # test_size=0.9: only 10% of the selected rows are used for fitting.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=1)
        min_max_scaler = preprocessing.MinMaxScaler()
        clf.fit(min_max_scaler.fit_transform(X_train), y_train)

        test_scores = clf.predict_proba(min_max_scaler.transform(X_test))[:, 1]
        print('Testing AUC: \t' + str(roc_auc_score(y_test, test_scores)))

        # Fix: reuse the scaling learned on the training rows. The original
        # called fit_transform here, re-fitting the scaler on the September
        # rows so the model saw features on a different scale than it was
        # trained on. It also stored the scores in ``t``, overwriting the
        # parameter.
        september_scores = clf.predict_proba(min_max_scaler.transform(df9.ix[:, Un]))[:, 1]
        print('September AUC: \t' + str(roc_auc_score(df9.label, september_scores)))

        tave = september_scores * model_weight + tave

    print('-' * 30)
    print(weight)
    print('Total AUC')
    print('September AUC: \t' + str(roc_auc_score(df9.label, tave)))
    return Un, clf
Exemplo n.º 8
0
def gbdt_solver(train_data, train_label, validation, test, unlabel, dimreduce=decomposition.undo):
    """Train a fixed-parameter GBDT classifier and return class-1 probabilities for ``test``.

    ``dimreduce`` is called with all five inputs and must return the
    transformed train, validation, test and unlabelled data, in that order.
    The fitted model is dumped to ``ROOT + "/result/gbdt.pkl"`` and the
    validation class-1 probabilities are passed to ``evaluate.get_auc``.
    """
    logging.info("begin to train the gbdt classifier")
    reduced = dimreduce(train_data, train_label, validation, test, unlabel)
    new_train_data, new_val, new_test, new_unlabel = reduced
    logging.info("finished feature extracting")

    # Hyper-parameters are hard-coded. An earlier (disabled) search over
    # n_estimators in [100, 200, 500, 1000], learning_rate in
    # [0.02, 0.03, 0.05, 0.1] and max_depth in [3, 5, 7, 9], scored by
    # roc_auc with cv=3, used to live here.
    classifier = GradientBoostingClassifier(
        n_estimators=600,
        learning_rate=0.03,
        random_state=1000000007,
        max_depth=2,
        warm_start=True,
    )
    classifier.fit(new_train_data, train_label)
    joblib.dump(classifier, ROOT + "/result/gbdt.pkl")

    validation_scores = classifier.predict_proba(new_val)[:, 1]
    evaluate.get_auc(validation_scores)

    test_scores = classifier.predict_proba(new_test)[:, 1]
    return test_scores
Exemplo n.º 9
0
def main():
    """Train a GBM on the training CSV, report validation metrics, write test probabilities.

    The shuffled data is split 95% / 5% into train and validation parts.
    Validation accuracy and a hand-computed mean negative log-likelihood are
    printed, then class probabilities for the test CSV are passed to
    ``write_results_prob``.
    """
    train_frame = pd.read_csv(train_path, header=0, parse_dates=['Dates'])
    print(train_frame.dtypes)

    X, Y = get_feature(train_frame, "training_set")

    ### TRAINING
    model = GradientBoostingClassifier(n_estimators=50)

    X, Y = shuffle_XY(X, Y)
    # Floor division: the first 95% of the shuffled rows are used for training.
    split_at = len(X) * 95 // 100
    X_train, X_val = X[:split_at], X[split_at:]
    Y_train, Y_val = Y[:split_at], Y[split_at:]

    model = model.fit(X_train, Y_train)
    print("Training done")

    val_acc = model.score(X_val, Y_val)
    print("Val acc: %s" % (val_acc,))

    val_pred = model.predict_proba(X_val)

    # Mean negative log of the probability given to the true class. Each label
    # is used directly as a column index of predict_proba's output; the small
    # constant keeps log() away from zero.
    log_sum = 0.0
    for row, label in enumerate(Y_val):
        log_sum += math.log(val_pred[row, label] + 0.0000001)
    val_log = -log_sum / len(Y_val)
    print("Val log loss: %s" % (val_log,))

    ### Testing
    test_frame = pd.read_csv(test_path, header=0, parse_dates=['Dates'])

    X_test, _ = get_feature(test_frame, "test_set")
    Y_test = model.predict_proba(X_test)

    ### Write results
    write_results_prob(Y_test)
Exemplo n.º 10
0
class MyGradientBoost(MyClassifier):
    """Wrapper around GradientBoostingClassifier with feature-importance helpers."""

    def __init__(self, params=None):
        """Create the wrapped classifier from ``params`` (constructor keyword arguments).

        ``params`` is copied. The previous ``params=dict()`` default was a
        single dict shared by every instance and modified in place by
        ``update_params``, so settings leaked between instances and into the
        caller's dict.
        """
        self._params = dict(params) if params is not None else {}
        self._gb = GradientBoostingClassifier(**self._params)

    def update_params(self, updates):
        """Merge ``updates`` into the stored parameters and build a fresh, unfitted classifier."""
        self._params.update(updates)
        self._gb = GradientBoostingClassifier(**self._params)

    def fit(self, Xtrain, ytrain):
        """Fit the wrapped classifier."""
        self._gb.fit(Xtrain, ytrain)

    def predict_proba(self, Xtest, option=None):
        """Return column 1 of the wrapped ``predict_proba`` output; ``option`` is ignored."""
        return self._gb.predict_proba(Xtest)[:, 1]

    def predict_proba_multi(self, Xtest, option=None):
        """Return the full ``predict_proba`` output; ``option`` is ignored."""
        return self._gb.predict_proba(Xtest)

    @staticmethod
    def _resolve_rank_range(f_range, n_features):
        """Return ``f_range`` as a list, or every rank when it is None or empty."""
        if f_range is None or len(f_range) == 0:
            return list(range(n_features))
        return list(f_range)

    def plt_feature_importance(self, fname_list, f_range=None):
        """Show a horizontal bar chart of feature importances.

        ``f_range`` holds positions in the importance ranking (0 = most
        important); None or empty selects all features. The error bars are
        the standard deviation of the importances of the first estimator of
        each boosting stage.
        """
        importances = self._gb.feature_importances_

        std = np.std([stage[0].feature_importances_ for stage in self._gb.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        fname_array = np.array(fname_list)
        ranks = self._resolve_rank_range(f_range, indices.shape[0])
        selected = indices[ranks]
        n_f = len(ranks)

        plt.figure()
        plt.title("Gradient Boost Feature importances")
        plt.barh(range(n_f), importances[selected],
                 color="b", xerr=std[selected], ecolor='k', align="center")
        plt.yticks(range(n_f), fname_array[selected])
        plt.ylim([-1, n_f])
        plt.show()

    def list_feature_importance(self, fname_list, f_range=None, return_list=False):
        """Print the feature ranking.

        ``f_range`` holds positions in the importance ranking; None or empty
        selects all features. When ``return_list`` is true the feature
        indices at those positions are returned, otherwise None.
        """
        importances = self._gb.feature_importances_
        indices = np.argsort(importances)[::-1]

        print('Gradient Boost feature ranking:')

        ranks = self._resolve_rank_range(f_range, indices.shape[0])

        for f in ranks:
            print('{0:d}. feature[{1:d}]  {2:s}  ({3:f})'.format(
                f + 1, indices[f], fname_list[indices[f]], importances[indices[f]]))

        if return_list:
            return [indices[f] for f in ranks]
def do_gbdt4(train_x, train_y, test_x=None, test_y=None, learning_rate=0.03, max_depth=8, max_features=25,
            n_estimators=600, load=False, save=True, outfile=None, search=False, log=False):
    """Train (or load from a pickle cache) a GradientBoostingClassifier.

    With ``search`` false, one model is trained with the given
    hyper-parameters, or loaded when a pickle with the derived file name
    already exists. If both ``test_x`` and ``test_y`` are given its ROC AUC
    is printed. The model is returned.

    With ``search`` true, a fixed grid over max_depth, n_estimators,
    learning_rate and max_features is trained (or loaded) and scored on
    ``test_x`` / ``test_y``; one line per combination is printed and None is
    returned.

    Parameters
    ----------
    load : unused; kept for call compatibility.
    save : whether to pickle a newly trained model (non-search mode).
    outfile : optional file-like object that also receives the save-failure message.
    log : only changes the cache file-name prefix in non-search mode.

    Raises
    ------
    ValueError
        If ``search`` is true but ``test_x`` or ``test_y`` is missing.

    Notes
    -----
    The non-search cache file name does not include ``max_features``, so two
    runs differing only in that value share one pickle.
    """
    import itertools  # local import: the file's top-level imports are not visible from here

    # ``is not None`` instead of ``!= None``: comparing an array with ``!=``
    # is element-wise and has no single truth value.
    have_test = test_x is not None and test_y is not None

    if not search:
        prefix = 'gbdt_log_train_lr' if log else 'gbdt_train_lr'
        mdl_name = prefix + str(learning_rate) + '_n' + str(n_estimators) + '_maxdep' + str(max_depth) + '.pkl'
        if os.path.exists(mdl_name):
            clf_gbdt = joblib.load(mdl_name)
        else:
            clf_gbdt = GradientBoostingClassifier(learning_rate=learning_rate, max_depth=max_depth,
                                                  max_features=max_features, n_estimators=n_estimators)
            clf_gbdt.fit(train_x, train_y)
            if save:
                # Saving is best-effort: a failed dump must not lose the trained model.
                try:
                    joblib.dump(clf_gbdt, mdl_name, compress=1)
                except Exception:
                    print("*** Save GBM model to pickle failed!!!")
                    if outfile is not None:
                        # The message previously said "RF model" for this GBM.
                        outfile.write("*** Save GBM model to pickle failed!!!")
        if have_test:
            probas_gbdt = clf_gbdt.predict_proba(test_x)[:, 1]
            score_gbdt = roc_auc_score(test_y, probas_gbdt)
            print("GBDT ROC score %s" % (score_gbdt,))
        return clf_gbdt

    if not have_test:
        raise ValueError("search=True requires both test_x and test_y")

    max_depth_list = [6, 7, 8, 9, 10]
    n_list = [2000]
    lr_list = [0.005, 0.003]
    max_feat_list = [15, 16, 17, 18, 20]
    info = {}
    for md, n, lr, mf in itertools.product(max_depth_list, n_list, lr_list, max_feat_list):
        print('max_depth = %s' % md)
        print('n = %s' % n)
        print('learning rate = %s' % lr)
        print('max feature = %s' % mf)
        mdl_name = 'gbdt_n' + str(n) + '_lr' + str(lr) + '_md' + str(md) + 'mf' + str(mf) + '.pkl'
        if os.path.exists(mdl_name):
            # NOTE: pickles written before this fix were trained with the
            # function-level learning_rate / n_estimators despite their file
            # name; delete them before re-running the search.
            clf_gbdt = joblib.load(mdl_name)
        else:
            # Fix: train with the grid values ``lr`` and ``n``. The original
            # passed the function arguments ``learning_rate`` and
            # ``n_estimators`` here, so every grid point differing only in
            # lr / n was the same model saved under a different name.
            clf_gbdt = GradientBoostingClassifier(learning_rate=lr, max_depth=md,
                                                  max_features=mf, n_estimators=n)
            clf_gbdt.fit(train_x, train_y)
            joblib.dump(clf_gbdt, mdl_name, compress=1)
        probas_gbdt = clf_gbdt.predict_proba(test_x)[:, 1]
        info[md, n, lr, mf] = roc_auc_score(test_y, probas_gbdt)

    for (md, n, lr, mf), score in info.items():
        # Each entry is a single score, so the bracketed spread is always 0;
        # it is kept to leave the report line format unchanged.
        print('GBDT max_depth = %d, n = %d, lr = %.5f, max_feature = %d, ROC score = %.5f(%.5f)' % (
            md, n, lr, mf, score, 0.0))
Exemplo n.º 12
0
def gb(train_data, train_label, val_data, val_label, test_data, name="GradientBoosting_submission.csv"):
    """Fit a default GradientBoostingClassifier, print validation log loss, save test probabilities.

    Validation probabilities are scored with ``preprocess.evaluation`` and
    the test-set probabilities are handed to ``preprocess.saveResult`` with
    ``filename=name``.
    """
    print("start training GradientBoosting...")
    booster = GradientBoostingClassifier()  # all parameters left at their defaults
    booster.fit(train_data, train_label)

    # Evaluate on the validation set.
    val_probabilities = booster.predict_proba(val_data)
    logloss = preprocess.evaluation(val_label, val_probabilities)
    print("logloss of validation set: %s" % (logloss,))

    print("Start classify test set...")
    test_probabilities = booster.predict_proba(test_data)
    preprocess.saveResult(test_probabilities, filename=name)
Exemplo n.º 13
0
def gb_predictedValue():
    """Fit a GBM on the module-level ``train_df`` and return class-1 probabilities for ``test_df``.

    Uses the module-level ``features`` column list and ``NoOfEstimators``.
    The target column is 'SeriousDlqin2yrs'. Feature importances are printed.
    """
    print('----------GradientBoosting----------')
    booster = GradientBoostingClassifier(n_estimators=NoOfEstimators)
    booster.fit(train_df[features], train_df['SeriousDlqin2yrs'])
    test_probabilities = booster.predict_proba(test_df[features])
    print('Feature Importance = %s' % booster.feature_importances_)
    positive_column = test_probabilities[:, 1]
    return positive_column
Exemplo n.º 14
0
def machineLearning(X, Y_parameters,  predict_value, writer):
    """Average logistic-regression and gradient-boosting class-1 probabilities and write them out.

    Parameters
    ----------
    X, Y_parameters : training features and labels.
    predict_value : iterable of rows whose first element is an identifier and
        whose remaining elements are the features.
    writer : object with a ``writerow`` method (e.g. ``csv.writer``); receives
        ``[identifier, averaged_probability]`` for every row.

    Changes from the previous version: four estimators that were constructed
    but never fitted are gone, and prediction is done in one batched call on
    a 2-D input instead of one call per 1-D row, which recent scikit-learn
    versions reject.
    """
    logistic = LogisticRegression()
    boosted = GradientBoostingClassifier()
    logistic.fit(X, Y_parameters)
    boosted.fit(X, Y_parameters)
    print("finish fitting")

    rows = list(predict_value)
    if rows:  # predict_proba cannot be called on an empty batch
        feature_rows = [row[1:] for row in rows]
        logistic_positive = logistic.predict_proba(feature_rows)[:, 1]
        boosted_positive = boosted.predict_proba(feature_rows)[:, 1]
        for row, p_logistic, p_boosted in zip(rows, logistic_positive, boosted_positive):
            writer.writerow([row[0], (p_logistic + p_boosted) / 2])
    print("finish learning")
Exemplo n.º 15
0
def plot_PrecisionRecall (X,y):
    """Fit a GBM on an 80/20 split and plot the precision-recall curve of the held-out part.

    The area under the curve is printed and also shown in the plot title.
    """
    # Unpacking checks that X is two-dimensional; the values are not used.
    _n_samples, _n_features = X.shape

    # Hold out 20% of the rows for evaluation.
    split = train_test_split(X, y, test_size=.2, random_state=1)
    X_train, X_test, y_train, y_test = split

    model = GradientBoostingClassifier(n_estimators=400, learning_rate=0.4, max_depth=6)
    model.fit(X_train, y_train)

    positive_scores = model.predict_proba(X_test)[:, 1]

    # Precision-recall curve and the area under it.
    precision, recall, thresholds = precision_recall_curve(y_test, positive_scores)
    area = auc(recall, precision)
    print("Area Under Curve: %0.2f" % area)

    pl.clf()
    pl.plot(recall, precision, label='Precision-Recall curve')
    pl.xlabel('Recall')
    pl.ylabel('Precision')
    pl.ylim([0.0, 1.05])
    pl.xlim([0.0, 1.0])
    pl.title('Precision-Recall: AUC=%0.2f' % area)
    pl.legend(loc="lower left")
    pl.show()
def train_gbt(filename, color, name):
    """Train a gradient-boosted trees classifier on a CSV file and plot its ROC curve.

    The first CSV column is skipped, the last column is the target and the
    columns in between are the features. Precision, recall, F1 and elapsed
    time are written into row 1 of the module-level ``metrics`` array.

    Parameters
    ----------
    filename : path of the CSV file.
    color, name : line colour and legend label for the ROC curve.

    Returns
    -------
    tuple
        ``(importances, indices)``: the feature importances and the feature
        indices sorted by decreasing importance.
    """
    # Read data
    data2 = pd.read_csv(filename, encoding="utf")
    X = data2.ix[:, 1:-1]
    y = data2.ix[:, -1]

    # Split into train and validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define model
    clf1 = GradientBoostingClassifier(learning_rate=0.05, max_depth=5, random_state=42)

    # Fit model
    t0 = time()
    clf1.fit(X_train, y_train)
    pred_probas = clf1.predict_proba(X_val)
    predictions = clf1.predict(X_val)

    print("Score %s" % (clf1.score(X_val, y_val),))

    importances = clf1.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Metrics & Plotting
    metrics[1, 0] = precision_score(y_val, predictions)
    metrics[1, 1] = recall_score(y_val, predictions)
    metrics[1, 2] = f1_score(y_val, predictions)
    metrics[1, 3] = time() - t0

    # Fix: build the ROC curve from the class-1 probabilities. The original
    # passed the hard 0/1 predictions, which yields a single operating point,
    # and left ``pred_probas`` unused.
    fpr, tpr, _ = roc_curve(y_val, pred_probas[:, 1])
    plt.plot(fpr, tpr, color=color, label=name)

    return importances, indices
Exemplo n.º 17
0
def test_staged_predict_proba():
    """Staged predictions must end at the same result as the final predict calls."""
    data, target = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    X_train, y_train = data[:200], target[:200]
    X_test, y_test = data[200:], target[200:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # An unfitted estimator must raise NotFittedError once the staged
    # generator is consumed.
    def consume_staged_proba(samples):
        return np.fromiter(clf.staged_predict_proba(samples), dtype=np.float64)

    assert_raises(NotFittedError, consume_staged_proba, X_test)

    clf.fit(X_train, y_train)

    # Every stage has the shape of the labels; the last stage equals ``predict``.
    for stage_labels in clf.staged_predict(X_test):
        assert_equal(y_test.shape, stage_labels.shape)

    assert_array_equal(clf.predict(X_test), stage_labels)

    # Every stage is (n_samples, 2); the last stage equals ``predict_proba``.
    for stage_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], stage_proba.shape[0])
        assert_equal(2, stage_proba.shape[1])

    assert_array_almost_equal(clf.predict_proba(X_test), stage_proba)
Exemplo n.º 18
0
def ada_boost():
    """Train a gradient-boosting classifier on pickled data and write a probability CSV.

    Reads ``(x_train, y_train, _)`` from 'traindata.pkl' and
    ``(x_test, _, name1)`` from 'testdata.pkl'. Labels are shifted down by one
    before fitting. The test-set class probabilities are written through
    ``kcsv.print_csv`` with the nine 'Class_N' column names.

    Despite its name, the function uses GradientBoostingClassifier. AdaBoost,
    random forest, extra trees and k-nearest-neighbours were earlier
    alternatives that are no longer active.
    """
    # NOTE: unpickling runs arbitrary code; only load files produced locally.
    # ``with`` closes the handles even if loading fails (previously leaked).
    with open('traindata.pkl', 'rb') as savefile:
        x_train, y_train, _ = cPickle.load(savefile)
    with open('testdata.pkl', 'rb') as savefile:
        x_test, _, name1 = cPickle.load(savefile)

    x_train = np.asarray(x_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype='int32') - 1

    nest = 190
    lr = .1
    md = 6
    clf = GradientBoostingClassifier(n_estimators=nest, learning_rate=lr, max_depth=md, random_state=0)

    # Named switches replace the former ``if 1:`` / ``if 0:`` toggles.
    make_submission = True
    run_cross_validation = False

    if make_submission:
        clf.fit(x_train, y_train)
        ypred = clf.predict_proba(x_test)
        y_str = ['Class_%d' % k for k in range(1, 10)]
        kcsv.print_csv(ypred, name1, y_str, indexname='id')
        print((nest, lr, md))

    if run_cross_validation:
        # NOTE(review): greater_is_better=True looks inverted if logloss_mc
        # returns a loss; confirm before enabling this branch.
        multiclass_log_loss = make_scorer(score_func=logloss_mc, greater_is_better=True, needs_proba=True)
        scores = cross_val_score(clf, x_train, y_train, n_jobs=8, cv=5, scoring=multiclass_log_loss)
        print(scores)
        print((nest, lr, md, scores.mean()))
Exemplo n.º 19
0
def train():
    """Fit a random forest and a GBM, sum their probabilities, map the best column to a name.

    Features and targets come from ``merge_feature(feature_str)``. Only
    training rows whose target is below 32 are used. For every test row the
    column with the highest summed probability is passed to
    ``dl.get_num_position``.

    Returns
    -------
    dict
        Maps each test id to the value returned by ``dl.get_num_position``.

    Fixes: the body mixed snake_case and camelCase spellings of the same
    variables (``trainTarList``, ``trainFeature``, ``allProb``, ``prob``,
    ``allPreds``) and therefore raised NameError. The target filter is now
    also applied to the feature rows so both have the same length.
    """
    posi_result = {}
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(feature_str)

    keep = np.array([m < 32 for m in train_tar_list])
    target_list = np.array(train_tar_list)[keep]

    c_feature = train_feature.columns[:]
    # Apply the same row filter as the targets (a no-op when every target is < 32).
    train_rows = train_feature[c_feature][keep]

    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_rows, target_list)
    rf_prob = clf1.predict_proba(test_feature)

    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_rows, target_list)
    gbdt_prob = gbdt1.predict_proba(test_feature)

    all_prob = rf_prob + gbdt_prob
    print(all_prob.shape)

    # Column index of the highest summed probability per row (first one on ties).
    # NOTE(review): this is a column index, not a class label; the two only
    # coincide if the classes are exactly 0..k-1. Confirm that
    # dl.get_num_position expects the index.
    all_preds = [int(k) for k in np.argmax(all_prob, axis=1)]

    for test_id, best_column in zip(test_id_list, all_preds):
        posi_result[test_id] = dl.get_num_position(best_column)
    return posi_result
Exemplo n.º 20
0
def gradientboost_prediction(features_train, labels_train, features_test, ids):
    """Fit a GBM initialised from a random forest and write class-1 probabilities to CSV.

    Parameters
    ----------
    features_train, labels_train : training data.
    features_test : rows to score.
    ids : identifiers written next to each probability.

    Writes "data/rf_prediction.csv" with the header ``ID,TARGET``. Nothing is
    returned.
    """

    class RandomForestClassifier_compability(RandomForestClassifier):
        # Used as the boosting ``init`` estimator: ``predict`` returns the
        # class-1 probability as a column vector instead of class labels.
        def predict(self, X):
            return self.predict_proba(X)[:, 1][:, np.newaxis]

    base_estimator = RandomForestClassifier_compability()

    # Fix: the removed ``learn_rate`` keyword is no longer passed. It was a
    # deprecated alias of ``learning_rate`` and raises TypeError on
    # scikit-learn releases that dropped it; passing None had no effect.
    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
                                     n_estimators=5, subsample=0.3,
                                     min_samples_split=2,
                                     min_samples_leaf=1,
                                     max_depth=3,
                                     init=base_estimator,
                                     random_state=None,
                                     max_features=None,
                                     verbose=2)

    clf = clf.fit(features_train, labels_train)

    pred = clf.predict_proba(features_test)[:, 1]

    # ``with`` closes the file even if writing fails. Binary mode is what the
    # Python 2 csv module expects.
    with open("data/rf_prediction.csv", "wb") as predictions_file:
        predictions_writer = csv.writer(predictions_file)
        predictions_writer.writerow(["ID", "TARGET"])
        predictions_writer.writerows(zip(ids, pred))
Exemplo n.º 21
0
def main():
    """Cross-validate a GBM on the stacked training features and write a submission.

    Training features are columns 1-8 of '../data/train.csv' stacked with
    every column of '../data/cprobTrain15NA.csv'. The target is the ACTION
    column. Three local switches select which steps run.
    """
    make_submission = True
    show_importance = False
    run_cv = True

    extra_train = pd.read_csv('../data/cprobTrain15NA.csv')

    base_columns = range(1, 9)
    X = np.array(pd.read_csv('../data/train.csv', usecols=base_columns))
    y = np.array(pd.read_csv('../data/train.csv').ACTION)
    X = np.hstack((X, np.array(extra_train)))

    clf = GradientBoostingClassifier(
        max_depth=4, subsample=0.5, verbose=0, random_state=1337,
        min_samples_split=10, min_samples_leaf=10, max_features=10,
        n_estimators=350, learning_rate=0.05)
    prefix = 'lib/gbm350d4m10c15'

    if run_cv:
        validator = classifier.Classifier(X, y)
        validator.validate(clf, nFolds=10, out=prefix + 'Train.csv')

    if make_submission:
        Xt = np.array(pd.read_csv('../data/test.csv', usecols=range(1, 9)))
        Xt = np.hstack((Xt, np.array(pd.read_csv('../data/cprobTest15NA.csv'))))
        clf.fit(X, y)
        test_scores = clf.predict_proba(Xt)[:, 1]
        out = pd.read_csv('subs/nbBaseTest.csv')
        out.ACTION = test_scores
        out.to_csv(prefix + 'Test.csv', index=False)

    if show_importance:
        print("Feature ranking:")
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        np.savetxt('indices.txt', indices, delimiter=',')
        # NOTE(review): the indices refer to columns of the stacked X (train.csv
        # columns first), but the names are looked up in the extra-feature
        # frame only; confirm before enabling this branch.
        for rank, column in enumerate(indices[:extra_train.shape[1]], 1):
            print("%d. feature (%s,%f)" % (rank, extra_train.columns[column], importances[column]))
Exemplo n.º 22
0
def gbc_gp_predict(train_x, train_y, test_x):
    """Two-stage prediction: a GBM selects test rows, a second predictor fills in their values.

    Test rows whose class-1 probability exceeds 0.55 get the value returned
    by ``gbc_gp_predict_part``. Every other row stays at 0.

    Returns an array with one entry per row of ``test_x``.
    """
    feature_indexs = getTopFeatures(train_x, train_y)

    def build_features(raw):
        # Same derived-feature recipe for the train and test matrices.
        return get_data(
            raw,
            feature_indexs[:16],
            features.feature_pair_sub_list,
            features.feature_pair_plus_list,
            features.feature_pair_mul_list,
            features.feature_pair_divide_list[:20],
        )

    sub_x_Train = build_features(train_x)
    sub_x_Test = build_features(test_x)

    stage_one = GradientBoostingClassifier(n_estimators=3000, max_depth=9)
    stage_one.fit(sub_x_Train, toLabels(train_y))
    positive_scores = stage_one.predict_proba(sub_x_Test)[:, 1]

    selected = np.where(positive_scores > 0.55)[0]
    selected_values = gbc_gp_predict_part(sub_x_Train, train_y, sub_x_Test[selected])

    gp_preds = np.zeros(len(test_x))
    gp_preds[selected] = selected_values
    return gp_preds
Exemplo n.º 23
0
def main(args):
    """Train the GBM with fixed parameters and write class-1 probabilities.

    Parameters
    ----------
    args : object with ``verbose``, ``train_file``, ``test_file`` and
        ``output`` attributes.

    Side effects: sets the module-level ``verbose`` flag and writes one
    probability per line to ``args.output``.

    Returns
    -------
    int
        Always 0.
    """
    global verbose
    verbose = args.verbose

    # Load files
    if verbose: logger.info('Loading {}'.format(args.train_file))
    train_X, train_y = load_file(args.train_file)
    if verbose: logger.info('Loading {}'.format(args.test_file))
    test_X, test_y = load_file(args.test_file)

    # These parameters came from an earlier grid search (now removed) over
    # learning_rate = 2**i for i in np.arange(-10, -9, .25), with
    # n_estimators=50000, max_features='log2' and max_depth=7, scored by
    # roc_auc.

    # Best parameters for the competition data.
    # Fixes: the exponent was written ``(-9,5)`` (a tuple, TypeError) instead
    # of -9.5, and the comma after ``max_depth=7`` was missing (SyntaxError).
    method = GradientBoostingClassifier(n_estimators=50000, learning_rate=2 ** (-9.5),
                                        max_features='log2', max_depth=7,
                                        random_state=1, verbose=1)
    method.fit(train_X.toarray(), train_y)
    pred = method.predict_proba(test_X.toarray())

    np.savetxt(args.output, pred[:, 1], fmt='%.6f')
    if verbose: logger.info('Wrote preds to {file}'.format(file=args.output))

    return 0
Exemplo n.º 24
0
def classify2(dis_data, numeric_data, t_label):
    """Cross-validate a gradient boosting classifier and average AUC and F1.

    Args:
        dis_data: Feature table; rows are selected positionally via ``.iloc``.
        numeric_data: Not used by this function; kept so existing callers
            keep working.
        t_label: Labels; rows are selected positionally via ``.iloc``.
            Label ``1`` is treated as the positive class.

    Returns:
        tuple: ``(mean ROC AUC, mean F1)`` over the 5 stratified folds.
    """
    fold = 5
    skf = StratifiedKFold(t_label, fold)
    roc_auc = 0
    f1_score_value = 0

    clf3 = GradientBoostingClassifier()

    for train, test in skf:
        clf3 = clf3.fit(dis_data.iloc[train], t_label.iloc[train])

        # compute auc
        probas_ = clf3.predict_proba(dis_data.iloc[test])
        # Score with the probability of the positive class so the AUC agrees
        # with the pos_label=1 convention used for F1 below. Column 0 (used
        # previously) is the other class and yields an inverted curve.
        # NOTE(review): assumes binary labels where 1 is the second entry of
        # clf3.classes_ (e.g. {0, 1} or {-1, 1}) -- confirm against callers.
        fpr, tpr, thresholds = roc_curve(t_label.iloc[test], probas_[:, 1])
        roc_auc += auc(fpr, tpr)

        # compute f1_score
        label_pred = clf3.predict(dis_data.iloc[test])
        f1_score_value += f1_score(t_label.iloc[test], label_pred, pos_label=1)

    return roc_auc / fold, f1_score_value / fold
	 def test(self):
                 """Fit GBC, RF and SVC models and print diagnostics for weighted blends.

                 Reads ``self.dataMat`` (features) and ``self.labelMat`` (labels).
                 Nothing is returned; results are printed.
                 """
                 #iris = datasets.load_iris()
                 #X, y = iris.data, iris.target
                 X, y = self.dataMat,self.labelMat
                 # 60% of the rows are held out as the test part.
                 X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.6, random_state=12)
                 #clf = RandomForestClassifier(max_depth=3,min_samples_split=9,min_samples_leaf=15,n_estimators=5)
                 #for w1 in arange(0.342, 0.347, 0.001):
                 params = {'n_estimators': 1200, 'max_depth': 4, 'subsample': 0.5,'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3};
                 # Each model is fitted on the training part and, separately,
                 # scored with 3-fold ROC AUC cross-validation on the full (X, y).
                 clf_GBC = GradientBoostingClassifier(**params);
                 clf_GBC.fit(X_train, y_train);
                 scores_GBC = cross_val_score(clf_GBC,X,y,cv=3,scoring='roc_auc')
                 clf_RFC = RandomForestClassifier(max_depth=6,min_samples_split=7, min_samples_leaf=9,n_estimators=12);
                 clf_RFC.fit(X_train, y_train);
                 scores_RFC = cross_val_score(clf_RFC,X,y,cv=3,scoring='roc_auc')
                 clf_SVC = SVC(kernel='linear', C= 0.001, probability=True);
                 clf_SVC.fit(X_train, y_train);
                 scores_SVC = cross_val_score(clf_SVC,X,y,cv=3,scoring='roc_auc')
                 # Grid over blend weights: GBC gets w1, RFC gets (1-w1)*(1-w2),
                 # SVC gets w2*(1-w1); the three weights sum to 1.
                 for w1 in arange(0.01, 0.99, 0.01):
                   for w2 in arange(0.01, 0.99, 0.01):
                       y_predprob = clf_GBC.predict_proba(X_test)*w1+clf_RFC.predict_proba(X_test)*(1-w2)*(1-w1)+clf_SVC.predict_proba(X_test)*w2*(1-w1);
                       # Same weights applied to the mean cross-validation scores.
                       scoremean = scores_GBC.mean()*w1+scores_RFC.mean()*(1-w2)*(1-w1)+scores_SVC.mean()*w2*(1-w1)
                       # Only blends whose weighted mean CV score exceeds 0.9 are reported.
                       if scoremean>0.9:
                          print '***********************************************************'
                          print 'GBC-weight =', w1, 'RFC =',(1-w1)*(1-w2), 'SVC =',w2*(1-w1)
                          print 'The log loss is:', log_loss(y_test, y_predprob)
                          print 'The ROC score is:', roc_auc_score(y_test,y_predprob[:,1])
                          # Root-sum-of-squares of the three CV standard deviations
                          # (the blend weights are not applied here).
                          scorestd = math.sqrt(scores_GBC.std()**2+scores_RFC.std()**2+scores_SVC.std()**2)
                          print ("Accuracy: %0.5f (+/- %0.5f)" % (scores_GBC.mean()*w1+scores_RFC.mean()*(1-w2)*(1-w1)+scores_SVC.mean()*w2*(1-w1), scorestd*2))
def calc_prob(df_features_driver, df_features_other):
    """Fit a gradient boosting model and return probabilities for one driver.

    The driver's rows are stacked on top of the other rows, a classifier is
    fitted on the ``Driver`` column, and the second probability column for
    the first 200 stacked rows is returned next to the identifiers produced
    by ``create_first_column``.

    Args:
        df_features_driver: Feature rows of the driver being scored.
        df_features_other: Feature rows appended after the driver's rows.

    Returns:
        DataFrame with the columns ``driver_trip`` and ``prob``.
    """
    stacked = df_features_driver.append(df_features_other)
    stacked.reset_index(inplace=True)
    stacked.Driver = stacked.Driver.astype(int)

    # Original author's note: so far, the best result was achieved by using
    # a RandomForestClassifier with Bagging; the boosted model is what runs.
    classifier = GradientBoostingClassifier(n_estimators=10000)

    # Everything from the fifth column onwards (after reset_index) is model input.
    inputs = stacked.iloc[:, 4:]
    classifier.fit(inputs, stacked.Driver)

    submission = pd.DataFrame()
    submission['driver_trip'] = create_first_column(df_features_driver)

    # Probabilities for the first 200 stacked rows only.
    head_proba = classifier.predict_proba(inputs[:200])
    submission['prob'] = np.array(pd.DataFrame(head_proba).iloc[:, 1])

    return submission
Exemplo n.º 27
0
def classify():
    """Fit a GBDT on the first stratified fold and dump ranked predictions.

    Prints the feature importances (descending) with their column names, then
    writes tag / label / probability rows, ordered by ascending probability,
    to ``foo.csv``. Only the first of the 4 folds is processed.
    """
    hyper_params = {'n_estimators': 155, 'learning_rate': 0.01673821514381137, 'max_depth': 4}
    features, labels, tags, columns = get_data('/home/rodion/facebids/train/join.count.proba.time.csv')

    model = GradientBoostingClassifier(**hyper_params)
    folds = StratifiedKFold(labels, 4)

    for fit_idx, held_idx in folds:
        labels_fit, labels_held = labels[fit_idx], labels[held_idx]
        features_fit, features_held = features[fit_idx], features[held_idx]
        tags_held = tags[held_idx]

        model.fit(features_fit, labels_fit)

        # Most important feature first.
        ranking = np.argsort(np.array(model.feature_importances_))[::-1]
        print(np.asarray(model.feature_importances_)[ranking])
        print(np.asarray(columns)[ranking])

        positive_proba = model.predict_proba(features_held)[:, 1]

        # Rows are written in order of increasing predicted probability.
        ascending = np.argsort(positive_proba)
        table = np.array([tags_held[ascending], labels_held[ascending], positive_proba[ascending]]).T
        np.savetxt("foo.csv", table, delimiter=",", fmt="%s")

        # Deliberately stop after the first fold.
        break
Exemplo n.º 28
0
def gbPredict(LOSS, N_EST, L_RATE, M_DEPT, SUB_S, W_START, N_FOLD, EX_F, TRAIN_DATA_X, TRAIN_DATA_Y, TEST__DATA_X, isProb):
    """Cross-validate a gradient boosting classifier, refit on all data, predict.

    Args:
        LOSS, N_EST, L_RATE, M_DEPT, SUB_S, W_START: Passed to
            ``GradientBoostingClassifier`` as ``loss``, ``n_estimators``,
            ``learning_rate``, ``max_depth``, ``subsample`` and ``warm_start``.
        N_FOLD: Number of cross-validation folds.
        EX_F: Only echoed in the log line; the feature-pruning step that
            consumed it is disabled.
        TRAIN_DATA_X, TRAIN_DATA_Y: Training features and labels, indexed
            with the fold index arrays.
        TEST__DATA_X: Features to predict for.
        isProb: If true return ``predict_proba`` output, else ``predict``.

    Returns:
        tuple: ``(predictions, mean cross-validation error)`` where the error
        of a fold is ``1 - score``.
    """
    def _fit(features, targets):
        # A fresh estimator per fit, so folds never share fitted state.
        return GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(features, targets)

    # k-fold validation
    kf = KFold(TRAIN_DATA_Y.shape[0], n_folds=N_FOLD)
    tesV = 0.0
    for train_index, test_index in kf:
        fold_clf = _fit(TRAIN_DATA_X[train_index], TRAIN_DATA_Y[train_index])
        tesV += 1 - fold_clf.score(TRAIN_DATA_X[test_index], TRAIN_DATA_Y[test_index])
    eVal = tesV / N_FOLD

    # train all data
    clf = _fit(TRAIN_DATA_X, TRAIN_DATA_Y)
    # The test matrix is used with all of its columns: the training matrix is
    # no longer column-filtered, and the index array `extA` that used to be
    # applied here is never defined, so indexing with it raised NameError.
    if isProb:
        data = clf.predict_proba(TEST__DATA_X)
    else:
        data = clf.predict(TEST__DATA_X)

    # One pre-formatted string prints the same text under Python 2 and 3.
    print("Eval = %s with n_esti = %s l_rate = %s m_dep = %s sub_s = %s ex_num = %s and loss is %s"
          % (eVal, N_EST, L_RATE, M_DEPT, SUB_S, EX_F, LOSS))

    return (data, eVal)
Exemplo n.º 29
0
def do_all_study(X,y):
    """Plot learning and validation curves and print hold-out ROC AUC scores.

    NOTE(review): the parameters ``X`` and ``y`` are not read; the body uses
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, which are not defined
    here and are presumably module-level names -- confirm.
    """
    # Five labels but four estimators: zip() stops at the shorter sequence.
    labels = ["Decision Tree", "Gradient Boosting",
              "Random Forest", "AdaBoost", "Naive Bayes"]
    estimators = [
        DecisionTreeClassifier(max_depth=10),
        GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
        RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
        AdaBoostClassifier(),
    ]
    for _label, candidate in zip(labels, estimators):
        _estimator, _score = plot_learning_curve(candidate, X_train, y_train, scoring='roc_auc')

    # Validation curve over n_estimators, then a hold-out ROC AUC, for each
    # boosted model in turn.
    studies = [
        (GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
         "ROC AUC GradientBoostingClassifier: %0.4f"),
        (AdaBoostClassifier(), "ROC AUC AdaBoost: %0.4f"),
    ]
    for booster, report in studies:
        swept_values = [1, 5, 10, 20, 40]
        plot_validation_curve(booster, X_train, y_train,
                              'n_estimators', swept_values, scoring='roc_auc')
        booster.fit(X_train, y_train)
        positive_proba = booster.predict_proba(X_test)[:, 1]
        print(report % roc_auc_score(y_test, positive_proba))
Exemplo n.º 30
0
def main():
    """Train a gradient boosting model on Numerai data and write a submission.

    Reads the training, feature-selection and tournament CSV files from
    ``./datasets``, fits the classifier on ``target_bernie``, prints a few
    validation metrics, saves the fitted model under ``./models`` and writes
    the tournament predictions to ``./results/bernie_submission_gbc.csv``.
    """
    # Set seed for reproducibility. np.random.seed() returns None, so the
    # seed value is kept in its own variable to be passed on as random_state.
    seed = 42
    np.random.seed(seed)

    print("# Loading data...")
    train = pd.read_csv('./datasets/numerai_training_data.csv', header=0)
    selected_features = pd.read_csv('./datasets/x_new.csv', header=0)
    tournament = pd.read_csv('./datasets/numerai_tournament_data.csv',
                             header=0)
    validation = tournament[tournament['data_type'] == 'validation']

    # Only columns whose name contains "feature" are used as inputs.
    features = [f for f in list(selected_features) if "feature" in f]

    X = train[features]
    Y = train['target_bernie']
    x_prediction = validation[features]

    ids = tournament['id']

    #CONFIGURE YOUR MODELS:
    #Stochastic Gradient Boosting Classification
    num_trees = 25
    # One fold per distinct era. shuffle=True is required for random_state to
    # have any effect (recent scikit-learn rejects a seed without shuffling).
    kfold = model_selection.KFold(n_splits=len(train['era'].unique()),
                                  shuffle=True,
                                  random_state=seed)
    #Configure model
    modelGBC = GradientBoostingClassifier(n_estimators=num_trees,
                                          random_state=seed,
                                          verbose=2)
    #Train and test with kfold model iterations
    #results = model_selection.cross_val_score(modelGBC, X, Y, cv=kfold)
    #print(results.mean())
    #UNCOMMENT IF WANT TO LOAD THE TRAINED MODEL
    # modelGBC = joblib.load('gradient_classifier.joblib')
    modelGBC.fit(X, Y)
    #COMMENT IF YOU DON'T WANT TO SAVE THE TRAINED MODEL
    # Saved after fit(): dumping before it would store an untrained estimator.
    joblib.dump(modelGBC, './models/gradient_boosting_classifier.joblib')

    #USED TRAINED MODELS AND TEST THEM AGAINST THE TEST SET (x_prediction is the validation set)
    y_prediction = modelGBC.predict_proba(x_prediction)
    probabilities = y_prediction[:, 1]
    print(probabilities)
    print("- probabilities GBC:", probabilities[1:6])
    print("- target:\n", validation['target_bernie'][1:6])
    print("- rounded probability:", [round(p) for p in probabilities][1:6])
    correct = [
        round(x) == y
        for (x, y) in zip(probabilities, validation['target_bernie'])
    ]
    print("- accuracy: ", sum(correct) / float(validation.shape[0]))
    print("- validation logloss:",
          metrics.log_loss(validation['target_bernie'], probabilities))

    # # To submit predictions from your model to Numerai, predict on the entire tournament data.
    print("PREDICTIONS FOR THE TOURNAMENT *******************")

    x_prediction = tournament[features]
    print("\nPREDICTIONS USING GBC")
    y_prediction = modelGBC.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    #results = np.round_(results)
    results_GBC = pd.DataFrame(data={'probability_bernie': results})
    # Joined on the row index of the two frames.
    joined = pd.DataFrame(ids).join(results_GBC)
    print("- joined:", joined.head())
    # The message now names the file that is actually written below.
    print("# Writing predictions to bernie_submission_gbc.csv...")
    # Save the predictions out to a CSV file.
    joined.to_csv("./results/bernie_submission_gbc.csv", index=False)
Exemplo n.º 31
0
    loss_train_by_iter = []
    loss_test_by_iter = []
    
    for predict in predict_train_by_iter:
        loss_value = log_loss(y_train, sigmoid(predict))
        loss_train_by_iter.append(loss_value)

    for predict in predict_test_by_iter:
        loss_value = log_loss(y_test, sigmoid(predict))
        loss_test_by_iter.append(loss_value)

    min_loss_index = np.argmin(loss_test_by_iter)
    print('learning_rate=%s, min_loss_value=%s, iteration(from 1)=%s' % (
        learning_rate,
        loss_test_by_iter[min_loss_index],
        min_loss_index + 1
    ))

    plt.title(learning_rate)
    plt.plot(loss_train_by_iter)
    plt.plot(loss_test_by_iter)
    plt.show()

# Random forest (37 trees, fixed seed) fitted on the training split and
# scored with log loss on (X_test, y_test) using its class probabilities.
clf = RandomForestClassifier(n_estimators=37, random_state=241)
clf.fit(X_train, y_train)
prediction = clf.predict_proba(X_test)
loss_value = log_loss(y_test, prediction)
print('Random forest classifier min loss value = ', loss_value)

Exemplo n.º 32
0
# Blending script: every model in `clfs` is fitted fold by fold; its
# out-of-fold predictions fill column j of `dataset_blend_train`, and the mean
# of its per-fold predictions on X_predict fills column j of
# `dataset_blend_test`. A gradient boosting model is then fitted on the
# blended training features.
n_folds = 5
skf = list(StratifiedKFold(y, n_folds))
for j, clf in enumerate(clfs):
    # Train each base model in turn.
    '''依次训练各个单模型'''
    # print(j, clf)
    # One column per fold of this model's predictions on X_predict.
    dataset_blend_test_j = np.zeros((X_predict.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        # Fold i is predicted by a model trained on the remaining folds; the
        # predictions become the new feature for the rows of fold i.
        '''使用第i个部分作为预测,剩余的部分来训练模型,获得其预测的输出作为第i部分的新特征。'''
        # print("Fold", i)
        X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    # For the prediction set, the mean over the k fold models is the new feature.
    '''对于测试集,直接用这k个模型的预测值均值作为新的特征。'''
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("val auc Score: %f" %
          roc_auc_score(y_predict, dataset_blend_test[:, j]))
# clf = LogisticRegression()
# Second-stage model fitted on the blended features.
clf = GradientBoostingClassifier(learning_rate=0.02,
                                 subsample=0.5,
                                 max_depth=6,
                                 n_estimators=30)
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print("Linear stretch of predictions to [0,1]")
# Min-max rescaling of the second-stage probabilities.
y_submission = (y_submission - y_submission.min()) / (y_submission.max() -
                                                      y_submission.min())
print("blend result")
print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission)))
Exemplo n.º 33
0
# In[ ]:


# Hard class predictions of the fitted GBC model on the test split.
predictions_GBC=GBC.predict(X_test)


# In[ ]:


print(classification_report(y_test,predictions_GBC))


# In[ ]:


# Sweep decision thresholds 0.00 .. 0.99 (step 0.01) over the second
# probability column and record the F1 score obtained at each threshold.
predictions_GBC_prob=GBC.predict_proba(X_test)
prob_list_GBC= [x[1] for x in predictions_GBC_prob]
d_GBC={}
for threshold in np.arange(0.0, 1.0, 0.01):
    # 1 where the probability reaches the threshold, else 0.
    list_for_check_GBC=np.int_([y>=threshold for y in prob_list_GBC])
    d_GBC[threshold]=f1_score(y_test,list_for_check_GBC)
# One row per threshold; the F1 values end up in column 0.
df_GBC=pd.DataFrame.from_dict(d_GBC,orient='index')


# In[ ]:


# Row(s) holding the maximum F1 score, i.e. the best threshold(s).
df_GBC[df_GBC[0]==df_GBC[0].max()]


# In[ ]:
Exemplo n.º 34
0
# Number of weak learners
n_estimator = 10
# Gradient boosting (GBDT) classifier
grd = GradientBoostingClassifier(n_estimators=n_estimator)

# One-hot encoder for the leaf indices produced by the GBDT
grd_enc = OneHotEncoder()

# Logistic-regression classifier
grd_lm = LogisticRegression()

# Fit the GBDT on X_train; it is used further down to construct features
grd.fit(X_train, y_train)

# Predict directly with the GBDT and look at the AUC
y_pred_grd = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = metrics.roc_curve(y_test, y_pred_grd)
roc_auc = metrics.auc(fpr_grd, tpr_grd)
# The statement used to be split over two lines (a bare `print` followed by
# a discarded tuple expression), so the AUC was never shown. A single
# pre-formatted string prints the same text under Python 2 and Python 3.
print('predict %s' % roc_auc)

# Fit the one-hot encoder on the leaf indices of the training samples
tmp = grd.apply(X_train)

grd_enc.fit(tmp[:, :, 0])

# Build features with the trained GBDT, one-hot encode them and train the
# logistic regression on the encoded features (using the X_train_lr split).
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

# Use the trained LR model to predict on X_test
Exemplo n.º 35
0
# Report the outcome of the hyper-parameter search.
print(search.best_estimator_)
print(search.best_score_)

#evaluation
# Confusion matrices on the scaled training and test features.
prediction_train = GBDT.predict(feature_train_scaled)
cm_train = confusion_matrix(y_train_decode, prediction_train)
prediction_test = GBDT.predict(feature_test_scaled)
cm_test = confusion_matrix(y_test_decode, prediction_test)

print(
    "Confusion matrix for training dataset is \n%s\n for testing dataset is \n%s.\n"
    % (cm_train, cm_test))

# Display names for the nine classes in the classification report.
target_names = [
    'class 1', 'class 2', 'class 3', 'class 4', 'class 5', 'class 6',
    'class 7', 'class 8', 'class 9'
]
print(
    classification_report(y_test_decode,
                          prediction_test,
                          target_names=target_names))

y_score = GBDT.predict_proba(feature_test_scaled)

# Compute the micro-averaged AUC
# print('AUC via roc_auc_score:', roc_auc_score(y_test, y_score, average='micro'))

# Both arrays are flattened so every (sample, class) pair is one binary decision.
# NOTE(review): presumably y_test is the one-hot form of y_test_decode -- confirm.
fpr, tpr, thresholds = roc_curve(y_test.ravel(), y_score.ravel())
micro_auc = auc(fpr, tpr)
print('micro_auc:', micro_auc)
Exemplo n.º 36
0

if __name__ == '__main__':
    # Load the data and convert features / labels to NumPy arrays.
    x_data, y_data = load_data()
    X = np.array(x_data)
    Y = np.array(y_data)

    # 75% / 25% train / test split with a fixed seed.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=5)

    # Build and train the gradient boosting classifier
    clf = GradientBoostingClassifier(random_state=0)
    clf.fit(x_train, y_train)

    # Predict class labels and class probabilities
    y_predict = clf.predict(x_test)
    y_pred_probability = clf.predict_proba(x_test)

    df2 = pd.DataFrame(y_pred_probability)
    proba_pred_y = np.array(df2[1])     # probability column 1: predicted probability of the positive class

    # Report the metrics computed by the Get_* helpers.
    score = clf.score(x_test, y_test)
    print("Gradient Boosting 模型打分: Score = %f" % score)
    accuracy = Get_Accuracy(y_test, y_predict)
    print("Gradient Boosting Accuracy_Score = %f" % accuracy)
    precision = Get_Precision_score(y_test, y_predict)
    print("Gradient Boosting Precision = %f" % precision)
    recall = Get_Recall(y_test, y_predict)
    print("Gradient Boosting Recall = %f" % recall)
    f1_score = Get_f1_score(y_test, y_predict)
    print("Gradient Boosting F1-Score  = %f" % f1_score)
    # The AUC helper receives the positive-class probabilities, not the labels.
    auc = Get_Auc_value(y_test, proba_pred_y)
Exemplo n.º 37
0
def compare_assessors(X, y):
    """Plot ROC curves of five tree-based classifiers on one hold-out split.

    Compared models: random-trees embedding + logistic regression, a random
    forest, random-forest leaves + logistic regression, gradient boosting, and
    gradient-boosting leaves + logistic regression. 10% of the data is held
    out for testing; a further 10% of the remainder is reserved for fitting
    the logistic regressions that sit on top of the leaf encodings.

    Args:
        X: Feature matrix.
        y: Labels; column 1 of ``predict_proba`` is used as the score.
    """
    tree_count = 20
    X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.1)
    # The tree ensembles and the logistic-regression stage are fitted on
    # different subsets of the training data to avoid overfitting, in
    # particular when the number of leaves is close to the number of samples.
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        X_rest, y_rest, test_size=0.1)

    curves = []

    def record(label, scores):
        # Store the ROC curve of `scores` against the held-out labels.
        fpr, tpr, _ = roc_curve(y_test, scores)
        curves.append((fpr, tpr, label))

    # Unsupervised transformation based on totally random trees
    embedding = RandomTreesEmbedding(n_estimators=tree_count, random_state=0)
    embedding_lr = make_pipeline(embedding, LogisticRegression())
    embedding_lr.fit(X_train, y_train)
    record('RT + LR', embedding_lr.predict_proba(X_test)[:, 1])

    # Supervised transformation based on random forests
    forest = RandomForestClassifier(n_estimators=tree_count)
    forest.fit(X_train, y_train)
    record('RF', forest.predict_proba(X_test)[:, 1])

    # RF + LR: one-hot encode the leaf indices, then fit the linear model
    forest_encoder = OneHotEncoder()
    forest_encoder.fit(forest.apply(X_train))
    forest_lr = LogisticRegression()
    forest_lr.fit(forest_encoder.transform(forest.apply(X_train_lr)), y_train_lr)
    forest_test_leaves = forest_encoder.transform(forest.apply(X_test))
    record('RF + LR', forest_lr.predict_proba(forest_test_leaves)[:, 1])

    # GBT
    booster = GradientBoostingClassifier(n_estimators=tree_count)
    booster.fit(X_train, y_train)
    record('GBT', booster.predict_proba(X_test)[:, 1])
    # Accuracy on both splits is computed but the values are not used.
    booster.score(X_train, y_train)
    booster.score(X_test, y_test)

    # GBT + LR: apply() has a trailing axis here, of which index 0 is taken
    booster_encoder = OneHotEncoder()
    booster_encoder.fit(booster.apply(X_train)[:, :, 0])
    booster_lr = LogisticRegression()
    booster_lr.fit(
        booster_encoder.transform(booster.apply(X_train_lr)[:, :, 0]),
        y_train_lr)
    booster_test_leaves = booster_encoder.transform(
        booster.apply(X_test)[:, :, 0])
    record('GBT + LR', booster_lr.predict_proba(booster_test_leaves)[:, 1])

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    for fpr, tpr, label in curves:
        plt.plot(fpr, tpr, label=label)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
                    fake_test_data = np.squeeze(images[bdt_train_size:])

                    real_training_labels = np.ones(bdt_train_size)

                    fake_training_labels = np.zeros(bdt_train_size)

                    total_training_data = np.concatenate(
                        (real_training_data, fake_training_data))

                    total_training_labels = np.concatenate(
                        (real_training_labels, fake_training_labels))

                    clf.fit(total_training_data, total_training_labels)

                    out_real = clf.predict_proba(real_test_data)

                    out_fake = clf.predict_proba(fake_test_data)

                    if mode != "ROC_testing":
                        plt.hist([out_real[:, 1], out_fake[:, 1]],
                                 bins=100,
                                 label=['real', 'gen'],
                                 histtype='step')
                        plt.xlabel('Output of BDT')
                        plt.legend(loc='upper right')
                        plt.savefig('%s%s/BDT_out.png' %
                                    (working_directory, saving_directory),
                                    bbox_inches='tight')
                        plt.close('all')
Exemplo n.º 39
0
IDcol = 'ID'
# Count how many rows have each value of the Disbursed field (how many 0s, how many 1s)
train['Disbursed'].value_counts()
# Select every column that is neither the target nor the ID
x_columns = [x for x in train.columns if x not in [target, IDcol]]
# X is the feature matrix
X = train[x_columns]
# y is the outcome column
y = train['Disbursed']
# gbm0 is a gradient boosting model with default settings apart from the seed
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X, y)
# Predict class labels for X (the training data itself)
y_pred = gbm0.predict(X)
# Probability column 1 for every sample in X
y_predprob = gbm0.predict_proba(X)[:, 1]
# Print the accuracy (fraction classified correctly)
print "Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred)
# Compute the AUC directly from the true values (must be binary) and the predictions (0/1 labels or probabilities); the intermediate ROC computation is skipped
print "AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob)
# range(start, stop, step): n_estimators candidates 20, 30, ..., 80
param_test1 = {'n_estimators': range(20, 81, 10)}
gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(
    learning_rate=0.1,
    min_samples_split=300,
    min_samples_leaf=20,
    max_depth=8,
    max_features='sqrt',
    subsample=0.8,
    random_state=10),
                        param_grid=param_test1,
Exemplo n.º 40
0
# Split features, named labels and encoded labels with one call so the three
# stay aligned row by row.
X_train, X_test, y_train_named, y_test_named, y_train, y_test = train_test_split(
    X, y_named, y, random_state=0)

# Build the gradient boosting model
gbrt = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

print("X_test.shape: {}".format(X_test.shape))
print("Decision function shape: {}".format(
    gbrt.decision_function(X_test).shape))
df = gbrt.decision_function(X_test)
# Thresholding the decision function at 0 gives a boolean per sample.
print("Thresholded decision function:\n{}".format(
    gbrt.decision_function(X_test) > 0))
# Same thresholding, stored as 0/1 integers.
greater_zero = (gbrt.decision_function(X_test) > 0).astype(int)

# Predicting probabilities
print("Shape of probabilities: {}".format(gbrt.predict_proba(X_test).shape))
pp = gbrt.predict_proba(X_test)
# Sum of the first two probability columns for the first test sample.
print(pp[0, 0] + pp[0, 1])

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
mglearn.tools.plot_2d_separator(gbrt,
                                X,
                                ax=axes[0],
                                alpha=.4,
                                fill=True,
                                cm=mglearn.cm2)
scores_image = mglearn.tools.plot_2d_scores(gbrt,
                                            X,
                                            ax=axes[1],
                                            alpha=.4,
Exemplo n.º 41
0
    'home_ownership', 'verification_status', 'desc_clean', 'purpose',
    'zip_code', 'addr_state', 'pub_rec_bankruptcies_clean'
]

# Vectorise the categorical columns from their per-row dict records.
v = DictVectorizer(sparse=False)
X1 = v.fit_transform(trainData[cat_features].to_dict('records'))
# Put the one-hot encoded and the numeric variables together for model training
X2 = np.matrix(trainData[num_features])
X = np.hstack([X1, X2])
y = trainData['y']
# Train the GBDT model without any hyper-parameter tuning
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X, y)

# Both metrics below are computed on the training data itself.
y_pred = gbm0.predict(X)
y_predprob = gbm0.predict_proba(X)[:, 1].T
print "Accuracy : %.4g" % metrics.accuracy_score(y, y_pred)
print "AUC Score (Train): %f" % metrics.roc_auc_score(np.array(y.T),
                                                      y_predprob)
# Step 4: evaluate the model's performance on the test set
'''
第四步:在测试集上测试模型的性能
'''
# Convert percentage strings containing % into floats
testData['int_rate_clean'] = testData['int_rate'].map(
    lambda x: float(x.replace('%', '')) / 100)
# Convert the employment length, otherwise the ordering is affected
testData['emp_length_clean'] = testData['emp_length'].map(CareerYear)
# Treat a missing desc as one state and a non-missing desc as another
testData['desc_clean'] = testData['desc'].map(DescExisting)
# Process dates. The format of earliest_cr_line is inconsistent, so it has to be unified and converted to Python dates
testData['app_date_clean'] = testData['issue_d'].map(
Exemplo n.º 42
0
# 5-fold stratified cross-validation of a gradient boosting classifier on the
# standardised data; per-fold metrics are collected in `sepscores`.
shu = data_21
X = shu
# This assignment replaces the one above: X is the scaled data.
X = scale(shu)
y = label
sepscores = []
cv_clf = GradientBoostingClassifier(n_estimators=2000,
                                    max_depth=6,
                                    learning_rate=0.01)
skf = StratifiedKFold(n_splits=5)
# Placeholder first rows; the per-fold targets and scores are stacked below them.
ytest = np.ones((1, 2)) * 0.5
yscore = np.ones((1, 2)) * 0.5
for train, test in skf.split(X, y):
    # y_train and hist are assigned but not read again in this snippet.
    y_train = utils.to_categorical(y[train])
    hist = cv_clf.fit(X[train], y[train])
    y_score = cv_clf.predict_proba(X[test])
    yscore = np.vstack((yscore, y_score))
    y_test = utils.to_categorical(y[test])
    ytest = np.vstack((ytest, y_test))
    # ROC is computed on column 0 of both the targets and the scores.
    fpr, tpr, _ = roc_curve(y_test[:, 0], y_score[:, 0])
    roc_auc = auc(fpr, tpr)
    y_class = utils.categorical_probas_to_classes(y_score)
    y_test_tmp = y[test]
    acc, precision, npv, sensitivity, specificity, mcc, f1 = utils.calculate_performace(
        len(y_class), y_class, y_test_tmp)
    sepscores.append(
        [acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc])
    print(
        'GTB:acc=%f,precision=%f,npv=%f,sensitivity=%f,specificity=%f,mcc=%f,f1=%f,roc_auc=%f'
        % (acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc))
# One row per fold, one column per metric.
scores = np.array(sepscores)
Exemplo n.º 43
0
# Inspect the dense forms of X.
print(type(X.toarray()))
print(X.todense().shape)

# In[11]:

# create dataframe
import pandas as pd
# Dense copy of X with the names in `cols` as column labels, plus the labels.
df = pd.DataFrame(X.toarray())
df.columns = cols
df["label"] = y
df.head()

# In[12]:

# test
# Second probability column of sk_gbt for every row.
y_pred = [x[1] for x in sk_gbt.predict_proba(df[cols])]
df["pred"] = y_pred
df.head()

# In[13]:


# auc
def auc(y_true, y_pred):
    """
    calculate auc
    Args:
        y_true: label
        y_pred: predict
    Return:
        auc
Exemplo n.º 44
0
# AdaBoost: first predicted probabilities, then the error and weight of the
# first estimator.
print('\nPredicted probabilities:')
print(adaboost_y_pred_prob[:5, ])
print("Error: {0:.2f}".format(adaboost.estimator_errors_[0]))
print("Tree importance: {0:.2f}".format(adaboost.estimator_weights_[0]))

# GradientBoosting Trees
# 1000 depth-1 trees.
gbc = GradientBoostingClassifier(max_depth=1,
                                 n_estimators=1000,
                                 warm_start=True,
                                 random_state=seed)

gbc.fit(x_train, y_train)

# predictions
gbc_y_pred = gbc.predict(x_test)
gbc_y_pred_prob = gbc.predict_proba(x_test)

# accuracy (from the hard labels) and log loss (from the probabilities)
gbc_accuracy = accuracy_score(y_test, gbc_y_pred)
gbc_logloss = log_loss(y_test, gbc_y_pred_prob)

print("== Gradient Boosting ==")
print("Accuracy: {0:.2f}".format(gbc_accuracy))
print("Log loss: {0:.2f}".format(gbc_logloss))

# First five rows of the true labels, predicted labels and probabilities.
print("True labels:")
print(y_test[:5, ])
print('\nPredicted labels:')
print(gbc_y_pred[:5, ])
print('\nPredicted probabilities:')
print(gbc_y_pred_prob[:5, ])
        # find patients with a certain disease in target domain
        target_train_feature_true = train_ori.loc[:, disease_list.iloc[disease_num, 0]] > 0
        target_train_meaningful_sample = train_ori.loc[target_train_feature_true]

        # get patients with small disease in test dataset (target domain's test sample)
        target_test_feature_true = test_ori.loc[:, disease_list.iloc[disease_num, 0]] > 0
        target_test_meaningful_sample = test_ori.loc[target_test_feature_true]
        X_test = target_test_meaningful_sample.drop(['Label'], axis=1)
        y_test = target_test_meaningful_sample['Label']
        # # transfer to X_test
        # fit_test = X_test * Weight_importance_source_data
        # fit_test = fit_test * Weight_importance_from_middle_data

        # use source model to predict each group disease's AUC
        y_predict_by_source_model = gbm_All.predict_proba(X_test)[: , 1]
        auc_by_source_model = roc_auc_score(y_test , y_predict_by_source_model)
        auc_source_dataframe.loc[disease_list.iloc[disease_num , 0] , auc_global_dataframe_columns[data_num - 1]] = auc_by_source_model

        # use middle model to predict each group disease's AUC
        y_predict_by_middle_model = gbm_large_group.predict_proba(X_test)[:, 1]
        auc_by_middle_model = roc_auc_score(y_test, y_predict_by_middle_model)
        auc_middle_dataframe.loc[disease_list.iloc[disease_num, 0], auc_global_dataframe_columns[data_num - 1]] = auc_by_middle_model

        # 按不同的sample_size,df.sample进行随机抽样
        for frac in sample_size:
            auc_list = []
            i = 0
            while i < 10:
                # random sampling for test auc
                random_sampling_train_meaningful_sample = target_train_meaningful_sample.sample(frac=frac, axis=0)
Exemplo n.º 46
0
# Random forest with 5 trees and a fixed seed
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)

#----------------
# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(learning_rate=0.01,random_state=0)
gbrt.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
# Class probabilities for the first 16 test samples
print(gbrt.predict_proba(X_test[:16]))
#----------------
# SVM - preprocessing matters
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)

print("Accuracy on training set: {:.2f}".format(svc.score(X_train, y_train)))
print("Accuracy on test set: {:.2f}".format(svc.score(X_test, y_test)))

#----------------
# MLPClassifier - sklearn's multi-layer perceptron - preprocessing matters
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='lbfgs', random_state=0, hidden_layer_sizes=[10])
Exemplo n.º 47
0
def main():
    """Fit a boosted model on the OASIS sub-scores and compare it with ``oasis_prob``.

    Reads ``oasis.csv``, trains a gradient boosting classifier on the ten
    component-score columns to predict ``icustay_expire_flag``, prints
    accuracy, AUROC, SMR and Brier scores, and plots the ROC curve of the
    model next to the ROC curve of the ``oasis_prob`` column from the file.
    """
    # load csv files
    df = pd.read_csv('/home/clintone/MIMICmaterialized/oasis.csv')

    # Copy of the stay id / age group / outcome columns (not read again below).
    df_flag = df[['icustay_id', 'icustay_age_group',
                  'icustay_expire_flag']].copy()

    # Target: the ICU-stay expire flag.
    y = df['icustay_expire_flag'].copy()

    # Inputs: the ten component score columns.
    score_columns = [
        'age_score', 'preiculos_score', 'gcs_score', 'heartrate_score',
        'meanbp_score', 'resprate_score', 'temp_score', 'urineoutput_score',
        'mechvent_score', 'electivesurgery_score',
    ]
    X = df[score_columns].copy()

    # Stratified 67% / 33% train / test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.33, random_state=0, stratify=y)

    booster = GradientBoostingClassifier(
        random_state=0, learning_rate=0.01, max_features='sqrt',
        max_leaf_nodes=12, n_estimators=1250)

    # Full rows of the test samples; their 'oasis_prob' column serves as the
    # baseline the fitted model is compared against.
    baseline_rows = df.loc[X_test.index]

    booster.fit(X_train, y_train)

    # Hard predictions and accuracy on the test split.
    pred = booster.predict(X_test)
    print('Accuracy score: {:.3}'.format(booster.score(X_test, y_test)))

    # Probability column 1 (flag == 1) for every test sample.
    y_proba = booster.predict_proba(X_test)[:, 1]

    print('AUROC: {:.3}'.format(roc_auc_score(y_test, y_proba)))

    # Standard Mortality Rate: observed positives over predicted positives.
    SMR = sum(y_test) / sum(pred)
    print('SMR: {:.3}'.format(SMR))

    # Brier score computed by hand, then cross-checked against scikit-learn.
    Brier = np.mean(np.square(y_proba - y_test))
    print('Brier Score: {:.3}'.format(Brier))
    print('Brier Score [SKLEARN]: {:.3}'.format(
        brier_score_loss(y_test, y_proba)))

    # Brier score of the 'oasis_prob' baseline.
    baseline_brier = np.mean(np.square(baseline_rows['oasis_prob'] - y_test))
    print('Brier Score [IND]: {:.3}'.format(baseline_brier))

    # ROC curve of the fitted model.
    fpr, tpr, threshold = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    # ROC curve of the 'oasis_prob' baseline.
    fpr_IND, tpr_IND, threshold = roc_curve(y_test,
                                            baseline_rows['oasis_prob'])
    roc_auc_IND = auc(fpr_IND, tpr_IND)

    # Plot both ROC curves with the chance diagonal.
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC_OASIS = %0.3f' % roc_auc)
    plt.plot(fpr_IND, tpr_IND, 'g', label='AUC_IND = %0.3f' % roc_auc_IND)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
Exemplo n.º 48
0
# Make predictions for the test data.

y_pred_xgb = xgb_clf.predict(X_test)
# NOTE(review): the rounding presumably guards against float-valued
# predictions; it is kept so `predictions` stays a plain list.
predictions = [round(value) for value in y_pred_xgb]

# Evaluate predictions.
from sklearn.metrics import accuracy_score, roc_auc_score

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# In[38]:

# Plot ROC curves.
# The legend entries are labelled "auc", so the reported number must be the
# area under the ROC curve of the positive-class scores, not the accuracy of
# the hard predictions.
y_prob_gb = gb_clf.predict_proba(X_test)
y_score_gb = y_prob_gb[:, 1]
fpr_gb, tpr_gb, threshold_gb = roc_curve(y_test, y_score_gb)
auc = roc_auc_score(y_test, y_score_gb)
plt.plot(fpr_gb,
         tpr_gb,
         label='Gradient Boosting Classifier,auc  = %0.2f' % auc)

y_prob_xgb = xgb_clf.predict_proba(X_test)
y_score_xgb = y_prob_xgb[:, 1]
fpr_xgb, tpr_xgb, threshold_xgb = roc_curve(y_test, y_score_xgb)
auc = roc_auc_score(y_test, y_score_xgb)
plt.plot(fpr_xgb, tpr_xgb, label='XGBoosting Classifier,auc  = %0.2f' % auc)

# Diagonal reference line of the ROC plot.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
Exemplo n.º 49
0
def GB(x_train, y_train, x_test):
    """Fit a gradient-boosting classifier and score ``x_test``.

    The classifier is built with 200 estimators, a learning rate of 0.1 and a
    maximum tree depth of 6, fitted on ``(x_train, y_train)``, and the result
    of its ``predict_proba`` on ``x_test`` is returned.
    """
    booster = GradientBoostingClassifier(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
    )
    booster.fit(x_train, y_train)
    test_probabilities = booster.predict_proba(x_test)
    return test_probabilities
Exemplo n.º 50
0
    clf.fit(X_bags, y)
    score = cross_val_score(estimator=clf, X=X_bags, y=y,
                            cv=folds, scoring='roc_auc').mean()
    print('Logistic Regression with bag-of-words, ROC-AUC Score: {0:.6f}'.format(score), '\n')

# What is the minimum\maximum value of the forecast on
# the test sample came from the best of the algorithms?

if not Bags:
    clf.fit(X_bags, y)

# Read the test features once: the raw frame keeps the hero columns needed for
# the bag-of-words, the derived frame is the numeric model input.
features_test_raw = read_csv('features_test.csv', index_col='match_id')
features_test = features_test_raw.drop(heroes + lt + st, axis=1)
features_test.fillna(0, inplace=True)
X_test_no_bag = scale(features_test)

# Bag-of-words for the heroes of the test data: +1 marks a hero picked in an
# `r*_hero` column, -1 a hero picked in a `d*_hero` column. Hero ids are
# 1-based, hence the `- 1`.
# Column-wise NumPy indexing replaces the per-cell `.ix` lookups (`.ix` no
# longer exists in current pandas) and yields the same matrix.
X_pick = np.zeros((len(features_test_raw), N_words))
match_rows = np.arange(len(features_test_raw))
for p in range(5):
    X_pick[match_rows, features_test_raw['r%d_hero' % (p + 1)].values - 1] = 1
    X_pick[match_rows, features_test_raw['d%d_hero' % (p + 1)].values - 1] = -1
X_test = np.concatenate([X_test_no_bag, X_pick], axis=1)

# Min/max values of the predicted probability of class 1 on the test sample:
proba = clf.predict_proba(X_test)[:, 1]
pmax, pmin = np.amax(proba), np.amin(proba)
print('Max proba: {0:.6f},\nMin proba: {1:.6f}'.format(pmax, pmin))
Exemplo n.º 51
0
# Report the columns whose non-null count is smaller than the number of rows.
# The single-argument print(...) form runs on both Python 2 and Python 3.
print('attributes with gaps \n{}'.format(
    train_X.count()[lambda x: x < len(train_X)]))

# `.values` returns the same ndarray as the removed `.as_matrix()`.
train_X = train_X.fillna(0).values

# NOTE(review): `KFold(n, n_folds=...)` and iterating the object directly is
# the legacy scikit-learn cross-validation API; confirm which version this
# script targets before porting it.
cv = KFold(len(train_y), n_folds=5, shuffle=True, random_state=241)
for estimators in range(10, 31, 10):
    clf = GradientBoostingClassifier(n_estimators=estimators, random_state=241)

    start_time = datetime.datetime.now()
    auc_score = []

    # One ROC-AUC value per fold, computed on the positive-class probability.
    for traincv, testcv in cv:
        clf.fit(train_X[traincv], train_y[traincv])
        pred = clf.predict_proba(train_X[testcv])[:, 1]
        auc_score.append(metrics.roc_auc_score(train_y[testcv], pred))

    elapsed_time = datetime.datetime.now() - start_time
    print('estimators: {0} , auc score: {1:.2f}, time elapsed: {2}'.format(
        estimators, np.mean(auc_score), elapsed_time))

# Part 2 logistic regression
print("Logistic regression")

heroes_columns = [
    'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero',
    'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'
]

Exemplo n.º 52
0
class MembershipInferenceBlackBox(MembershipInferenceAttack):
    """
    Implementation of a learned black-box membership inference attack.

    This implementation can use as input to the learning process probabilities/logits or losses,
    depending on the type of model and provided configuration.
    """

    attack_params = MembershipInferenceAttack.attack_params + [
        "input_type",
        "attack_model_type",
        "attack_model",
    ]
    _estimator_requirements = (BaseEstimator, (ClassifierMixin, RegressorMixin))

    def __init__(
        self,
        estimator: Union["CLASSIFIER_TYPE", "REGRESSOR_TYPE"],
        input_type: str = "prediction",
        attack_model_type: str = "nn",
        attack_model: Optional[Any] = None,
    ):
        """
        Create a MembershipInferenceBlackBox attack instance.

        :param estimator: Target estimator.
        :param input_type: the type of input to train the attack on. Can be one of: 'prediction' or 'loss'. Default is
                           `prediction`. Predictions can be either probabilities or logits, depending on the return type
                           of the model. If the model is a regressor, only `loss` can be used.
        :param attack_model_type: the type of default attack model to train, optional. Should be one of `nn` (for neural
                                  network, default), `rf` (for random forest) or `gb` (gradient boosting). If
                                  `attack_model` is supplied, this option will be ignored.
        :param attack_model: The attack model to train, optional. If none is provided, a default model will be created.
        :raises ValueError: If `input_type` or `attack_model_type` holds an unsupported value.
        :raises TypeError: If a supplied `attack_model` is not a classifier.
        """

        super().__init__(estimator=estimator)
        self.input_type = input_type
        self.attack_model_type = attack_model_type
        self.attack_model = attack_model

        self._regressor_model = RegressorMixin in type(self.estimator).__mro__

        self._check_params()

        # Test against None rather than truthiness: a model object may define `__len__`/`__bool__`, in which case
        # a perfectly valid user-supplied model could evaluate as false (or fail to evaluate at all).
        if self.attack_model is not None:
            self.default_model = False
            self.attack_model_type = "None"
        else:
            self.default_model = True
            if self.attack_model_type == "nn":
                import torch  # lgtm [py/repeated-import] lgtm [py/import-and-import-from]
                from torch import nn  # lgtm [py/repeated-import]

                class MembershipInferenceAttackModel(nn.Module):
                    """
                    Implementation of a pytorch model for learning a membership inference attack.

                    The features used are probabilities/logits or losses for the attack training data along with
                    its true labels.
                    """

                    def __init__(self, num_classes, num_features=None):

                        self.num_classes = num_classes
                        # Without an explicit feature count the feature branch takes one input per class.
                        if num_features:
                            self.num_features = num_features
                        else:
                            self.num_features = num_classes

                        super().__init__()

                        self.features = nn.Sequential(
                            nn.Linear(self.num_features, 512),
                            nn.ReLU(),
                            nn.Linear(512, 100),
                            nn.ReLU(),
                            nn.Linear(100, 64),
                            nn.ReLU(),
                        )

                        self.labels = nn.Sequential(
                            nn.Linear(self.num_classes, 256),
                            nn.ReLU(),
                            nn.Linear(256, 64),
                            nn.ReLU(),
                        )

                        self.combine = nn.Sequential(
                            nn.Linear(64 * 2, 1),
                        )

                        self.output = nn.Sigmoid()

                    def forward(self, x_1, label):
                        """Forward the model."""
                        out_x1 = self.features(x_1)
                        out_l = self.labels(label)
                        is_member = self.combine(torch.cat((out_x1, out_l), 1))
                        return self.output(is_member)

                if self.input_type == "prediction":
                    num_classes = estimator.nb_classes  # type: ignore
                    self.attack_model = MembershipInferenceAttackModel(num_classes)
                elif self._regressor_model:
                    # Loss input of a regressor: one loss feature and one scalar label.
                    self.attack_model = MembershipInferenceAttackModel(1, num_features=1)
                else:
                    # Loss input of a classifier: one loss feature and one-hot labels.
                    num_classes = estimator.nb_classes  # type: ignore
                    self.attack_model = MembershipInferenceAttackModel(num_classes, num_features=1)
                self.epochs = 100
                self.batch_size = 100
                self.learning_rate = 0.0001
            elif self.attack_model_type == "rf":
                self.attack_model = RandomForestClassifier()
            elif self.attack_model_type == "gb":
                self.attack_model = GradientBoostingClassifier()

    def fit(  # pylint: disable=W0613
        self,
        x: np.ndarray,
        y: np.ndarray,
        test_x: np.ndarray,
        test_y: np.ndarray,
        pred: Optional[np.ndarray] = None,
        test_pred: Optional[np.ndarray] = None,
        **kwargs
    ) -> None:
        """
        Train the attack model.

        The attack model is trained in place; nothing is returned.

        :param x: Records that were used in training the target estimator.
        :param y: True labels for `x`.
        :param test_x: Records that were not used in training the target estimator.
        :param test_y: True labels for `test_x`.
        :param pred: Estimator predictions for the records, if not supplied will be generated by calling the estimators'
                     `predict` function. Only relevant for input_type='prediction'.
        :param test_pred: Estimator predictions for the test records, if not supplied will be generated by calling the
                          estimators' `predict` function. Only relevant for input_type='prediction'.
        :raises ValueError: If the shapes of the inputs are inconsistent or `input_type` is not supported.
        """
        if self.estimator.input_shape is not None:
            if self.estimator.input_shape[0] != x.shape[1]:  # pragma: no cover
                raise ValueError("Shape of x does not match input_shape of estimator")
            if self.estimator.input_shape[0] != test_x.shape[1]:  # pragma: no cover
                raise ValueError("Shape of test_x does not match input_shape of estimator")

        if not self._regressor_model:
            y = check_and_transform_label_format(y, len(np.unique(y)), return_one_hot=True)
            test_y = check_and_transform_label_format(test_y, len(np.unique(test_y)), return_one_hot=True)

        if y.shape[0] != x.shape[0]:  # pragma: no cover
            raise ValueError("Number of rows in x and y do not match")
        if test_y.shape[0] != test_x.shape[0]:  # pragma: no cover
            raise ValueError("Number of rows in test_x and test_y do not match")

        # Create attack dataset
        # uses final probabilities/logits
        if self.input_type == "prediction":
            # members
            if pred is None:
                features = self.estimator.predict(x).astype(np.float32)
            else:
                features = pred.astype(np.float32)
            # non-members
            if test_pred is None:
                test_features = self.estimator.predict(test_x).astype(np.float32)
            else:
                test_features = test_pred.astype(np.float32)
        # only for models with loss
        elif self.input_type == "loss":
            # members
            features = self.estimator.compute_loss(x, y).astype(np.float32).reshape(-1, 1)
            # non-members
            test_features = self.estimator.compute_loss(test_x, test_y).astype(np.float32).reshape(-1, 1)
        else:  # pragma: no cover
            raise ValueError("Illegal value for parameter `input_type`.")

        # members
        labels = np.ones(x.shape[0])
        # non-members
        test_labels = np.zeros(test_x.shape[0])

        x_1 = np.concatenate((features, test_features))
        x_2 = np.concatenate((y, test_y))
        y_new = np.concatenate((labels, test_labels))

        if self._regressor_model:
            x_2 = x_2.astype(np.float32).reshape(-1, 1)

        if self.default_model and self.attack_model_type == "nn":
            from torch import nn  # lgtm [py/repeated-import]
            from torch import optim  # lgtm [py/repeated-import]
            from torch.utils.data import DataLoader  # lgtm [py/repeated-import]
            from art.utils import to_cuda

            loss_fn = nn.BCELoss()
            optimizer = optim.Adam(self.attack_model.parameters(), lr=self.learning_rate)  # type: ignore

            attack_train_set = self._get_attack_dataset(f_1=x_1, f_2=x_2, label=y_new)
            train_loader = DataLoader(attack_train_set, batch_size=self.batch_size, shuffle=True, num_workers=0)

            self.attack_model = to_cuda(self.attack_model)  # type: ignore
            self.attack_model.train()  # type: ignore

            for _ in range(self.epochs):
                for (input1, input2, targets) in train_loader:
                    # The batches are already tensors; the former `torch.autograd.Variable` wrappers were removed
                    # (the wrapped `input1` was discarded anyway, so the model always received the plain tensor).
                    input1, input2, targets = to_cuda(input1), to_cuda(input2), to_cuda(targets)

                    optimizer.zero_grad()
                    outputs = self.attack_model(input1, input2)  # type: ignore
                    loss = loss_fn(outputs, targets.unsqueeze(1))  # lgtm [py/call-to-non-callable]

                    loss.backward()
                    optimizer.step()
        else:
            y_ready = check_and_transform_label_format(y_new, len(np.unique(y_new)), return_one_hot=False)
            self.attack_model.fit(np.c_[x_1, x_2], y_ready)  # type: ignore

    def infer(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
        """
        Infer membership in the training set of the target estimator.

        :param x: Input records to attack.
        :param y: True labels for `x`.
        :param probabilities: (passed through `kwargs`) a boolean indicating whether to return the predicted
                              probabilities per class, or just the predicted class. Defaults to `False`.
        :return: An array holding the inferred membership status, 1 indicates a member and 0 indicates non-member,
                 or class probabilities.
        :raises ValueError: If `y` is missing, the input shapes are inconsistent, `input_type` is not supported or
                            there is no data to infer on.
        """
        if y is None:  # pragma: no cover
            raise ValueError("MembershipInferenceBlackBox requires true labels `y`.")

        if self.estimator.input_shape is not None:  # pragma: no cover
            if self.estimator.input_shape[0] != x.shape[1]:
                raise ValueError("Shape of x does not match input_shape of estimator")

        probabilities = kwargs.get("probabilities", False)

        if not self._regressor_model:
            y = check_and_transform_label_format(y, len(np.unique(y)), return_one_hot=True)

        if y.shape[0] != x.shape[0]:  # pragma: no cover
            raise ValueError("Number of rows in x and y do not match")

        if self.input_type == "prediction":
            features = self.estimator.predict(x).astype(np.float32)
        elif self.input_type == "loss":
            features = self.estimator.compute_loss(x, y).astype(np.float32).reshape(-1, 1)
        else:  # pragma: no cover
            # Same guard as in `fit`; without it `features` would be unbound below.
            raise ValueError("Illegal value for parameter `input_type`.")

        if self._regressor_model:
            y = y.astype(np.float32).reshape(-1, 1)

        if self.default_model and self.attack_model_type == "nn":
            import torch  # lgtm [py/repeated-import] lgtm [py/import-and-import-from]
            from torch.utils.data import DataLoader  # lgtm [py/repeated-import]
            from art.utils import to_cuda, from_cuda

            self.attack_model.eval()  # type: ignore
            test_set = self._get_attack_dataset(f_1=features, f_2=y)
            test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False, num_workers=0)

            # Collect the per-batch outputs and stack them once at the end instead of re-stacking on every batch.
            batch_outputs = []
            for input1, input2, _ in test_loader:
                input1, input2 = to_cuda(input1), to_cuda(input2)
                outputs = self.attack_model(input1, input2)  # type: ignore
                predicted = outputs if probabilities else torch.round(outputs)
                predicted = from_cuda(predicted)
                batch_outputs.append(predicted.detach().numpy())

            if not batch_outputs:  # pragma: no cover
                raise ValueError("No data available.")

            inferred = np.vstack(batch_outputs)
            inferred_return = inferred if probabilities else np.round(inferred)
        elif not self.default_model:
            # assumes the predict method of the supplied model returns probabilities
            pred = self.attack_model.predict(np.c_[features, y])  # type: ignore
            if probabilities:
                inferred_return = pred
            else:
                inferred_return = np.round(pred)
        else:
            pred = self.attack_model.predict_proba(np.c_[features, y])  # type: ignore
            # Keep column 1 as a 2-D column vector.
            if probabilities:
                inferred_return = pred[:, [1]]
            else:
                inferred_return = np.round(pred[:, [1]])

        return inferred_return

    def _get_attack_dataset(self, f_1, f_2, label=None):
        """
        Wrap the attack features, true labels and (optional) membership labels in a pytorch dataset.

        :param f_1: Attack features (predictions or losses of the target estimator).
        :param f_2: True labels of the attacked records.
        :param label: Membership labels, optional. When omitted the dataset yields zeros in their place.
        :return: A dataset yielding `(feature, true label, membership label)` triples.
        """
        from torch.utils.data.dataset import Dataset

        class AttackDataset(Dataset):
            """
            Implementation of a pytorch dataset for membership inference attack.

            The features are probabilities/logits or losses for the attack training data (`x_1`) along with
            its true labels (`x_2`). The labels (`y`) are a boolean representing whether this is a member.
            """

            def __init__(self, x_1, x_2, y=None):
                import torch  # lgtm [py/repeated-import] lgtm [py/import-and-import-from]

                self.x_1 = torch.from_numpy(x_1.astype(np.float64)).type(torch.FloatTensor)
                # NOTE(review): the true labels pass through int32 first, which truncates non-integer values
                # (e.g. regression targets) - confirm this is intended.
                self.x_2 = torch.from_numpy(x_2.astype(np.int32)).type(torch.FloatTensor)

                if y is not None:
                    self.y = torch.from_numpy(y.astype(np.int8)).type(torch.FloatTensor)
                else:
                    self.y = torch.zeros(x_1.shape[0])

            def __len__(self):
                return len(self.x_1)

            def __getitem__(self, idx):
                if idx >= len(self.x_1):  # pragma: no cover
                    raise IndexError("Invalid Index")

                return self.x_1[idx], self.x_2[idx], self.y[idx]

        return AttackDataset(x_1=f_1, x_2=f_2, y=label)

    def _check_params(self) -> None:
        """
        Validate the attack configuration.

        :raises ValueError: If `input_type` or `attack_model_type` holds an unsupported value, or if a regressor is
                            attacked with anything other than `loss` input.
        :raises TypeError: If a supplied `attack_model` is not a classifier.
        """
        if self.input_type not in ["prediction", "loss"]:
            raise ValueError("Illegal value for parameter `input_type`.")

        if self._regressor_model:
            if self.input_type != "loss":
                raise ValueError("Illegal value for parameter `input_type` when estimator is a regressor.")

        if self.attack_model_type not in ["nn", "rf", "gb"]:
            raise ValueError("Illegal value for parameter `attack_model_type`.")

        # Explicit None check so that every supplied model is validated, whatever its truthiness.
        if self.attack_model is not None:
            if ClassifierMixin not in type(self.attack_model).__mro__:
                raise TypeError("Attack model must be of type Classifier.")
# Score the test set once and build both the AUC and the ROC curve from the
# same positive-class probabilities. Feeding hard `predict()` labels to
# `roc_auc_score` collapses the curve to a single threshold, so the reported
# AUC would not match the plotted curve.
model_proba = model.predict_proba(X_test)[:, 1]
model_roc_auc = roc_auc_score(y_test, model_proba)
fprB_1, tprB_1, thresholdsB_1 = roc_curve(y_test, model_proba)


# Modeling Gradient Boosting Algorithm
GBbaseline = GradientBoostingClassifier()
GBbaseline.fit(X_train, y_train)
y_pred_GB = GBbaseline.predict(X_test)
predictions_GB = [round(value) for value in y_pred_GB]

accuracy_GB = accuracy_score(y_test, predictions_GB)
print(accuracy_GB)

### ROC Curve for Gradient Boosting Algorithm
GBbaseline_proba = GBbaseline.predict_proba(X_test)[:, 1]
GBbaseline_roc_auc = roc_auc_score(y_test, GBbaseline_proba)
fpr1_1, tpr1_1, thresholds1_1 = roc_curve(y_test, GBbaseline_proba)

### Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred_log = logreg.predict(X_test)
from sklearn.metrics import confusion_matrix
# NOTE(review): this rebinds `confusion_matrix` from the sklearn function to
# the resulting array, so the function cannot be called again afterwards.
# The name is kept because code outside this view may read the array.
confusion_matrix = confusion_matrix(y_test, y_pred_log)
print(confusion_matrix)

accuracy_LOG = accuracy_score(y_test, y_pred_log)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_log))
Exemplo n.º 54
0
#clf4 = svm.SVC(kernel='poly', probability=True, C=1.0)
#clf4.fit(X_train, y_train)
#Broken

#from sklearn.linear_model import LogisticRegression

#rand = RandomTreesEmbedding(n_jobs=-1, n_estimators=1000, min_samples_split=1)
#clf3 = rt_lm = LogisticRegression()
#pipeline = make_pipeline(rand, clf3)
#pipeline.fit(X_train, y_train)

#clf5 = svm.SVC(kernel='rbf', probability=True, C=1.0)
#clf5.fit(X_train, y_train)
#.560

def _export_predictions(classifier, csv_name):
    """Score X_test with `classifier`, store the class-1 column in `sample` and write it to `csv_name`."""
    positive_proba = classifier.predict_proba(X_test)[:, 1]
    sample['WnvPresent'] = positive_proba
    sample.to_csv(csv_name, index=False)
    return positive_proba


# One CSV per classifier; `sample['WnvPresent']` is overwritten each time.
# NOTE(review): the only definitions of clf3 and clf4 visible nearby are
# commented out - confirm they are defined earlier in the script.
predictions1 = _export_predictions(clf1, 'Visualization1.csv')

predictions2 = _export_predictions(clf2, 'Visualization2.csv')

predictions3 = _export_predictions(clf3, 'Visualization3.csv')

predictions4 = _export_predictions(clf4, 'Visualization4.csv')
Exemplo n.º 55
0
    # Testing step: gather, for every modality, the predicted probability of
    # class 1 on its testing data. These columns are the input of the
    # second-stage classifier `cgb`.
    rf_data_answer = []
    # The MRSI modality (crf_mrsi / mrsi_testing_data) is currently disabled.
    for modality_clf, modality_data in ((crf_t2w, t2w_testing_data),
                                        (crf_adc, adc_testing_data),
                                        (crf_dce, dce_testing_data)):
        pred_proba = modality_clf.predict_proba(modality_data)
        # Column of `predict_proba` that corresponds to the class labelled 1
        pos_class_arg = np.ravel(np.argwhere(modality_clf.classes_ == 1))[0]
        rf_data_answer.append(pred_proba[:, pos_class_arg])

    # Stack the per-modality probabilities so that each one becomes a column
    rf_data_answer = np.vstack(rf_data_answer).T

    # Keep the stacked classifier's probabilities together with its class order
    pred_prob = cgb.predict_proba(rf_data_answer)
    result_cv.append([pred_prob, cgb.classes_])

# Persist the collected results, creating the output directory if it is missing
path_store = '/data/prostate/results/mp-mri-prostate/exp-5/stacking'
store_is_missing = not os.path.exists(path_store)
if store_is_missing:
    os.makedirs(path_store)
results_file = os.path.join(path_store, 'results.pkl')
joblib.dump(result_cv, results_file)
Exemplo n.º 56
0
    print 'Output : ', outfile

# Evaluation of the training results, run only when `args.evaluate` is set
if args.evaluate:
    # All helpers below are verbose unless `args.quiet` is set
    be_verbose = not args.quiet

    # Classifier result plot for the train and test samples
    util.plot_clf_results_sklearn(
        bdt,
        x_train, y_train, w_train,
        x_test, y_test, w_test,
        figname=args.outdir + "bdtoutput.png",
        verbose=be_verbose,
    )

    # Variable ranking written next to the plots
    util.print_variables_rank(
        bdt,
        var,
        outname=args.outdir + 'ranks.txt',
        verbose=be_verbose,
    )

    # Class-1 probabilities for both samples, drawn as one ROC figure
    y_pred_test = bdt.predict_proba(x_test)[:, 1]
    y_pred_train = bdt.predict_proba(x_train)[:, 1]
    datalist = [
        (y_train, y_pred_train, w_train, 'train'),
        (y_test, y_pred_test, w_test, 'test'),
    ]
    util.plot_rocs(
        datalist,
        figname=args.outdir + 'roc.png',
        verbose=be_verbose,
        title='',
    )
Exemplo n.º 57
0
# Merge the two samples and hold out a third of the rows for testing
data = pandas.concat([sig, bkg])
train, test = train_test_split(data, test_size=0.33, random_state=42)

# Gradient-boosted trees; every leaf must hold at least 1% of the training rows
clf = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    max_features=len(features),
    min_samples_leaf=int(0.01 * len(train)),
    subsample=0.8,
    random_state=13,
    verbose=1,
)

clf.fit(train[features], train.target)
joblib.dump(clf, 'classifier.pkl', compress=True)

# Class-1 probability on the held-out rows; `bdt` is an independent copy
pred = clf.predict_proba(test[features])[:, 1]
bdt = pred.copy()

import itertools

# Points 1e-8 ... 8e-1 (four per decade) plus 1, used to draw the y = x
# reference line
decades = [10.**exponent for exponent in range(-8, 0)]
xy = [
    decade * step
    for decade, step in itertools.product(decades, [1, 2, 4, 8])
]
xy = xy + [1]
plt.plot(xy, xy, color='grey', linestyle='--')
plt.xlim([10**-5, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#draw baseline point
Exemplo n.º 58
0
# Hold out 25% of the rows of df_cp / train_Y for testing (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(df_cp,
                                                    train_Y,
                                                    test_size=0.25,
                                                    random_state=4)

########## model start
from sklearn.ensemble import GradientBoostingClassifier
gdbt = GradientBoostingClassifier(learning_rate=0.01)

# Train the model
gdbt.fit(x_train, y_train)

# Predict the test set
y_pred = gdbt.predict(x_test)

# Second column of predict_proba for every test row
y_pred_proba = gdbt.predict_proba(x_test)[:, 1]

########## model end

########## model evaluation start
from sklearn import datasets, metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Predicted probability next to the true label, sorted by ascending probability
check_view = pd.DataFrame({'pred_poi': y_pred_proba, 'poi': y_test})
check_view = check_view.sort_values(by=['pred_poi'])

acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
var_confusion_matrix = confusion_matrix(y_test,
                                        y_pred,
Exemplo n.º 59
0
        subsample=0.6,
        max_depth=4,
        learning_rate=0.04,
        max_features=50,
    )
    # Fit every model on the same (X, y)
    model_rf.fit(X, y)
    model_extraTrees.fit(X, y)
    rbf_model.fit(X, y)
    linear_model.fit(X, y)
    boost_model.fit(X, y)

    # Get the predictions of every fitted model for org1

    extrees_prob = model_extraTrees.predict_proba(org1)
    rftrees_prob = model_rf.predict_proba(org1)
    grb_prob = boost_model.predict_proba(org1)
    rbf_prob = rbf_model.predict_proba(org1)
    preds_linear = linear_model.predict_proba(org1)

    grb_prob = grb_prob[:, 1]  # Keep only column 1 of the probabilities
    rbf_prob = rbf_prob[:, 1]
    preds_linear = preds_linear[:, 1]
    extrees_prob = extrees_prob[:, 1]
    rftrees_prob = rftrees_prob[:, 1]

    # Cut used for the gradient-boosting scores; the random-forest and RBF
    # calls below pass their own literal cuts (0.75 and 0.6)
    threshhold = 0.9
    # Hold the index locations of the high-scoring samples:
    #    indexs_bestPreds_GRB = np.where(grb_prob>threshhold)
    # NOTE(review): get_threshholdLocs is defined outside this view; it
    # presumably returns the positions above the cut - confirm
    indexs_bestPreds_RF = get_threshholdLocs(rftrees_prob, 0.75)
    indexs_bestPreds_GRB = get_threshholdLocs(grb_prob, threshhold)
    indexs_bestPreds_RBF = get_threshholdLocs(rbf_prob, 0.6)
Exemplo n.º 60
-1
def train_gb():
    """Fit a 100-tree gradient-boosting model, save a submission and print its CV scores.

    Reads the module-level `train_features`, `train_labels`, `test_features`,
    `outfile` and `ids`. The class-1 probabilities for `test_features` are
    handed to `save_submission` under the name `outfile + "_gb"`.
    """
    gb = GradientBoostingClassifier(n_estimators=100)
    gb.fit(train_features, train_labels)
    probs = gb.predict_proba(test_features)[:, 1]
    save_submission(outfile + "_gb", ids, probs)
    # Single-argument print(...) calls run on both Python 2 and Python 3.
    print("created submission for gb")
    # NOTE(review): "log_loss" is the scorer name this script was written
    # against; confirm it is still accepted by the installed scikit-learn.
    print(cross_val_score(gb, train_features, train_labels, scoring="log_loss"))