def train_classifiers(X_data, y_data):
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel = 'linear')
    clf_LSVM.fit(X_data, y_data)
    
    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)
    
    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)
    
    ############ Extra Tree: 0.915 ##################
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_ETC.fit(X_data, y_data)
    
    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)
    
    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)
    
    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)
    
    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC    
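# A minimal evaluation sketch, assuming the same imports as above plus cross_val_score,
# and that X_data/y_data are the feature matrix and labels passed to train_classifiers().
# The helper name and the 5-fold choice are illustrative, not from the original code.
from sklearn.model_selection import cross_val_score

def report_cv_accuracy(X_data, y_data, cv=5):
    # Re-estimate the accuracies quoted in the section headers above with k-fold CV.
    candidates = {
        'Linear SVM': svm.SVC(kernel='linear'),
        'MultinomialNB': MultinomialNB(),
        'Random Forest': RandomForestClassifier(n_estimators=200, criterion='entropy'),
        'Gradient Boosting': GradientBoostingClassifier(),
    }
    for name, clf in candidates.items():
        scores = cross_val_score(clf, X_data, y_data, cv=cv)
        print('%s: %.3f' % (name, scores.mean()))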
def test_plot_partial_dependence_multiclass(pyplot):
    # Test partial dependence plot function on multi-class input.
    iris = load_iris()
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, iris.target)

    grid_resolution = 25
    plot_partial_dependence(clf, iris.data, [0, 1],
                            target=0,
                            grid_resolution=grid_resolution)
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 2
    assert all(ax.has_data() for ax in axs)

    # now with symbol labels
    target = iris.target_names[iris.target]
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, target)

    grid_resolution = 25
    plot_partial_dependence(clf, iris.data, [0, 1],
                            target='setosa',
                            grid_resolution=grid_resolution)
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 2
    assert all(ax.has_data() for ax in axs)
def test_check_inputs_predict():
    # X has wrong shape
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y)

    x = np.array([1.0, 2.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)

    x = np.array([[]])
    assert_raises(ValueError, clf.predict, x)

    x = np.array([1.0, 2.0, 3.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)

    clf = GradientBoostingRegressor(n_estimators=100, random_state=1)
    clf.fit(X, rng.rand(len(X)))

    x = np.array([1.0, 2.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)

    x = np.array([[]])
    assert_raises(ValueError, clf.predict, x)

    x = np.array([1.0, 2.0, 3.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)
Example 4
def ctr_gbdt(model='sklearn-clicklog', from_cache=False, train_dataset_length=100000, test_dataset_length=100000):
    TRAIN_FILE, TEST_FILE = create_dataset(model, from_cache, train_dataset_length, test_dataset_length)

    prediction_model = GradientBoostingClassifier(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=30,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=5,
    )

    x_train, y_train = clean_data(TRAIN_FILE)
    x_test, y_test = clean_data(TEST_FILE)

    with Timer('fit model'):
        prediction_model.fit(x_train, y_train)

    with Timer('evaluate model'):
        y_prediction_train = prediction_model.predict_proba(x_train)
        y_prediction_test = prediction_model.predict_proba(x_test)

        loss_train = log_loss(y_train, y_prediction_train)
        loss_test = log_loss(y_test, y_prediction_test)

    print 'loss_train: %s' % loss_train
    print 'loss_test: %s' % loss_test
def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200,
                                     random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)
    # test raise NotFittedError if not fitted
    assert_raises(NotFittedError, lambda X: np.fromiter(
        clf.staged_predict_proba(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
def test_gradient_boosting_early_stopping():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=1000,
                                     n_iter_no_change=10,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)

    gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    # Check if early_stopping works as expected
    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24), (gbr, 1e-1, 13),
                                              (gbc, 1e-3, 36),
                                              (gbr, 1e-3, 28)):
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                     max_depth=3, random_state=42)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                    max_depth=3, random_state=42)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check that n_estimators_ increases monotonically with n_iter_no_change
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
def test_plot_partial_dependence_input():
    # Test partial dependence plot function input checks.
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)

    # not fitted yet
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, [0])

    clf.fit(X, y)

    assert_raises(ValueError, plot_partial_dependence,
                  clf, np.array(X)[:, :0], [0])

    # first argument must be an instance of BaseGradientBoosting
    assert_raises(ValueError, plot_partial_dependence,
                  {}, X, [0])

    # must be larger than -1
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, [-1])

    # too large feature value
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, [100])

    # str feature but no feature_names
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, ['foobar'])

    # not valid features value
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, [{'foo': 'bar'}])
def test_plot_partial_dependence_multiclass():
    # Test partial dependence plot function on multi-class input.
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, iris.target)

    grid_resolution = 25
    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
                                       label=0,
                                       grid_resolution=grid_resolution)
    assert len(axs) == 2
    assert all(ax.has_data() for ax in axs)

    # now with symbol labels
    target = iris.target_names[iris.target]
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, target)

    grid_resolution = 25
    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
                                       label='setosa',
                                       grid_resolution=grid_resolution)
    assert len(axs) == 2
    assert all(ax.has_data() for ax in axs)

    # label not in gbrt.classes_
    assert_raises(ValueError, plot_partial_dependence,
                  clf, iris.data, [0, 1], label='foobar',
                  grid_resolution=grid_resolution)

    # label not provided
    assert_raises(ValueError, plot_partial_dependence,
                  clf, iris.data, [0, 1],
                  grid_resolution=grid_resolution)
Example 10
def test_classification_synthetic():
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    for loss in ('deviance', 'exponential'):

        gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=1,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.09, \
            "GB(loss={}) failed with error {}".format(loss, error_rate)

        gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=1,
                                          max_depth=1,
                                          learning_rate=1.0, subsample=0.5,
                                          random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.08, ("Stochastic GradientBoostingClassifier(loss={}) "
                                   "failed with error {}".format(loss, error_rate))
def test_partial_dependence_input():
    # Test input validation of partial dependence.
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    assert_raises(ValueError, partial_dependence,
                  clf, [0], grid=None, X=None)

    assert_raises(ValueError, partial_dependence,
                  clf, [0], grid=[0, 1], X=X)

    # first argument must be an instance of BaseGradientBoosting
    assert_raises(ValueError, partial_dependence,
                  {}, [0], X=X)

    # Gradient boosting estimator must be fit
    assert_raises(ValueError, partial_dependence,
                  GradientBoostingClassifier(), [0], X=X)

    assert_raises(ValueError, partial_dependence, clf, [-1], X=X)

    assert_raises(ValueError, partial_dependence, clf, [100], X=X)

    # wrong ndim for grid
    grid = np.random.rand(10, 2, 1)
    assert_raises(ValueError, partial_dependence, clf, [0], grid=grid)
Example 12
def train_GBDT(self):
    samples = self.trainset.values
    target = self.trainlabel.values
    classifier_GB = GradientBoostingClassifier(n_estimators=1000)
    classifier_GB.fit(samples, target)

    return classifier_GB
def model_train_ensemble(X1,Y1,Save = False, modelname = None):
    
    X1,Y1 = DowmSample(X1,Y1,9)
    
#     model = RandomForestClassifier(n_estimators=100,random_state=1)
    model = GradientBoostingClassifier(n_estimators=100,max_leaf_nodes=5, subsample=0.7, learning_rate=0.1, random_state=1)
#     model = LogisticRegression('l2')
    model.fit(X1, Y1.ravel())
    
    # Save the model
    if Save:
        with open(modelname, 'wb') as f:
            pickle.dump(model, f)
    
    print '\n -------------- Training is over ----------------------'    
    return model
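# A minimal counterpart sketch (the helper name is illustrative): load a model pickled by
# model_train_ensemble() above and return predicted probabilities for the positive class.
# Assumes `modelname` is the path used when saving and `X_new` has the same feature columns.
def model_load_and_predict(modelname, X_new):
    with open(modelname, 'rb') as f:
        model = pickle.load(f)
    return model.predict_proba(X_new)[:, 1]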





    

    
Example 14
def main():
	makeSub = True
	featureImportance = False
	cvfold = True
	df = pd.read_csv('../data/cprobTrain15NA.csv')

	X, y = np.array(pd.read_csv('../data/train.csv',usecols=range(1,9))), np.array(pd.read_csv('../data/train.csv').ACTION)
	X = np.hstack((X,np.array(df)))

	params = {'max_depth':4, 'subsample':0.5, 'verbose':0, 'random_state':1337,
		'min_samples_split':10, 'min_samples_leaf':10, 'max_features':10,
		'n_estimators': 350, 'learning_rate': 0.05}	

	clf = GradientBoostingClassifier(**params)
	prefix = 'lib/gbm350d4m10c15'
	if cvfold:
		c = classifier.Classifier(X,y)
		c.validate(clf,nFolds=10,out=prefix+'Train.csv')

	if makeSub:
		Xt = np.array(pd.read_csv('../data/test.csv',usecols=range(1,9)))
		Xt = np.hstack((Xt,np.array(pd.read_csv('../data/cprobTest15NA.csv'))))
		clf.fit(X,y)
		y_ = clf.predict_proba(Xt)[:,1]
		out = pd.read_csv('subs/nbBaseTest.csv')
		out.ACTION = y_
		out.to_csv(prefix+'Test.csv',index=False)

	if featureImportance:
		print "Feature ranking:"
		importances = clf.feature_importances_
		indices = np.argsort(importances)[::-1]
		np.savetxt('indices.txt',indices,delimiter=',')
		for f in xrange(df.shape[1]):
			print "%d. feature (%s,%f)" % (f + 1, df.columns[indices[f]], importances[indices[f]])
Example 15
def PlotFeaturesImportance(X,y,featureNames,dataName):
    '''
    Plot the relative contribution/importance of the features.
    Best to reduce to top X features first - for interpretability
    Code example from:
    http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
    '''
    gbc = GradientBoostingClassifier(n_estimators=40)
    gbc.fit(X, y)
    # Get Feature Importance from the classifier
    feature_importance = gbc.feature_importances_
    # Normalize The Features
    feature_importance = 100 * (feature_importance / feature_importance.max())
    sorted_idx = numpy.argsort(feature_importance)
    pos = numpy.arange(sorted_idx.shape[0]) + 4.5
    # pos = numpy.arange(sorted_idx.shape[0])
    # plt.figure(figsize=(16, 12))
    plt.figure(figsize=(14, 9), dpi=250)
    plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
    #plt.yticks(pos, numpy.asanyarray(df.columns.tolist())[sorted_idx]) #ORIG
    plt.yticks(pos, numpy.asanyarray(featureNames)[sorted_idx])

    plt.xlabel('Relative Importance')
    plt.title('%s: Top Features' %(dataName))
    plt.grid(False)
    plt.ion()
    plt.show()
    plt.savefig(str(dataName)+'TopFeatures.png',dpi=200)
Example 16
def train():
    posi_result = {}
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(feature_str)
    tmp1 = [m < 32 for m in train_tar_list]
    tmp1 = np.array(tmp1)
    # train_feature = train_feature[tmp1]
    target_list = np.array(train_tar_list)
    target_list = target_list[tmp1]
    # train_id_list = np.array(train_id_list)
    # train_id_list = train_id_list[tmp1]
    c_feature = train_feature.columns[:]
    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_feature[c_feature], target_list)
    # rf_preds = clf1.predict(test_feature)
    rf_prob = clf1.predict_proba(test_feature)
    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_feature[c_feature], target_list)
    # gbdt_preds = gbdt1.predict(test_feature)
    gbdt_prob = gbdt1.predict_proba(test_feature)
    all_prob = rf_prob + gbdt_prob
    all_preds = []
    print all_prob.shape
    for k in range(all_prob.shape[0]):
        prob1 = list(all_prob[k, :])
        ind1 = prob1.index(max(prob1))
        all_preds.append(ind1)
    for j in range(len(all_preds)):
        all_pre_name = dl.get_num_position(all_preds[j])
        posi_result[test_id_list[j]] = all_pre_name
    return posi_result
def test_max_feature_auto():
    """Test if max features is set properly for floats and str. """
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    _, n_features = X.shape

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=1, max_features='auto')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.sqrt(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='auto')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, n_features)

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.3)
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(n_features * 0.3))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='sqrt')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.sqrt(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='log2')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.log2(n_features)))
Example 18
def gbc_gp_predict(train_x, train_y, test_x):
    feature_indexs = getTopFeatures(train_x, train_y)
    sub_x_Train = get_data(
        train_x,
        feature_indexs[:16],
        features.feature_pair_sub_list,
        features.feature_pair_plus_list,
        features.feature_pair_mul_list,
        features.feature_pair_divide_list[:20],
    )
    sub_x_Test = get_data(
        test_x,
        feature_indexs[:16],
        features.feature_pair_sub_list,
        features.feature_pair_plus_list,
        features.feature_pair_mul_list,
        features.feature_pair_divide_list[:20],
    )
    labels = toLabels(train_y)
    gbc = GradientBoostingClassifier(n_estimators=3000, max_depth=9)
    gbc.fit(sub_x_Train, labels)
    pred_probs = gbc.predict_proba(sub_x_Test)[:, 1]
    ind_test = np.where(pred_probs > 0.55)[0]
    gp_preds_part = gbc_gp_predict_part(sub_x_Train, train_y, sub_x_Test[ind_test])
    gp_preds = np.zeros(len(test_x))
    gp_preds[ind_test] = gp_preds_part
    return gp_preds
Example 19
def ada_boost():
    savefile = open('traindata.pkl', 'rb')
    (x_train, y_train, t1) = cPickle.load(savefile)
    savefile.close()
    savefile = open('testdata.pkl', 'rb')
    (x_test, t1, name1) = cPickle.load(savefile)
    savefile.close()
    
#    X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
#    X, y, test_size=0.1, random_state=42)
    
    x_train = np.asarray(x_train,dtype=np.float32)
    y_train = np.asarray(y_train, dtype='int32')-1   
    
    nest = 190
    lr = .1
    md = 6
#    clf1 = DecisionTreeClassifier(max_depth=2)
#    clf = AdaBoostClassifier(clf1, n_estimators=200, learning_rate=.25)
    clf = GradientBoostingClassifier(n_estimators=nest, learning_rate=lr, max_depth=md, random_state=0)
#    clf = RandomForestClassifier(n_estimators=200) #.81
#    clf = ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=10, random_state=0,n_jobs=8) #.81
#    clf = KNeighborsClassifier(15)
    if 1:
        clf.fit(x_train, y_train)
        ypred = clf.predict_proba(x_test)
        y_str = ['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']
        kcsv.print_csv(ypred, name1, y_str,indexname='id')
        print (nest, lr, md) 
    
    if 0:
        multiclass_log_loss = make_scorer(score_func=logloss_mc, greater_is_better=True, needs_proba=True)
        scores = cross_val_score(clf, x_train, y_train, n_jobs=8, cv=5,scoring=multiclass_log_loss)
        print scores
        print (nest, lr, md, scores.mean())  
Example 20
def GradBoost(X_DS, Y_DS, X_train, X_test, y_train, y_test, Cl_Names = 'None', mask='None',Max_Depth=3):
#******************************************************************************

	from sklearn.ensemble import GradientBoostingClassifier as GBC #import library for machine learning analysis
	from sklearn.metrics import classification_report

	print 'Gradient Boosting: Training...' #notify the user about the status of the process 

	Gradient_Boosting_obj = GBC(max_depth=Max_Depth) #call the Gradient Boosting routine built in
	Gradient_Boosting_obj.fit(X_train, y_train) #fit the gradient boosting model to the train data set
	Pred_Train = Gradient_Boosting_obj.predict(X_train) #apply the gradient boosting model to the train dataset
	Pred_Test = Gradient_Boosting_obj.predict(X_test) #apply the gradient boosting model to the test dataset

	print 'Gradient Boosting: Completed!' #notify the user about the status of the process

	labels = len(np.unique(Y_DS)) #extract the labels from the classification classes
	Conf_M = np.zeros((labels,labels), dtype='int') #initialize the confusion matrix for the classification problem
	
	if Cl_Names != 'None':
		target_names = Cl_Names
	else:
		target_names = np.arange(len(np.unique(Y_DS))).astype(str).tolist()
	#end

	Conf_M = CM(y_test, Pred_Test,np.unique(Y_DS)) #calls the confusion matrix routine with the test set and prediction set

	print(classification_report(y_test, Pred_Test, target_names=target_names))  #print the performance indicators on the console

	return Gradient_Boosting_obj, Conf_M
def get_n_fold_validation_score(self, fold=10):
    features = data.get_features()
    lables = data.get_lables()
    length = len(features)
    jump = length // fold
    index = 0
    k = 0
    scores = list()
    while k < fold:
        feature_test = features.iloc[index : (index + jump), :]
        lable_test = lables.iloc[index : (index + jump), :]
        feature_train_1, feature_train_2 = (
            features.iloc[0 : index - 1, :] if index != 0 else pd.DataFrame(),
            features.iloc[index + jump + 1 : length - 1],
        )
        feature_train = pd.concat([feature_train_1, feature_train_2])
        lable_train_1, lable_train_2 = (
            lables.iloc[0 : index - 1, :] if index != 0 else pd.DataFrame(),
            lables.iloc[index + jump + 1 : length - 1],
        )
        lable_train = pd.concat([lable_train_1, lable_train_2])
        index += jump
        k += 1
        classifier = GradientBoostingClassifier()
        classifier.fit(feature_train, lable_train["lable"].values)
        scores.append(accuracy_score(lable_test, classifier.predict(feature_test)))
    return sum(scores) / float(len(scores))
Example 22
    def gbdt_train(self, data, task_id, window=DEFAULT_WINDOW):
        """
        Train a gbdt model.

        :param data: Training dataset.
        :param task_id: The id of the training task.
        :param window: the length of window
        """
        X_train = []
        y_train = []
        features = self.__calculate_features(data, window)
        if not features:
            return TSD_LACK_SAMPLE
        for index in features:
            X_train.append(index[0])
            y_train.append(index[1])
        X_train = np.array(X_train)
        y_train = np.array(y_train)
        try:
            grd = GradientBoostingClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth, learning_rate=self.learning_rate)
            grd.fit(X_train, y_train)
            model_name = MODEL_PATH + task_id + "_model"
            joblib.dump(grd, model_name)
        except Exception as ex:
            return TSD_TRAIN_ERR, str(ex)
        return TSD_OP_SUCCESS, ""
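    def gbdt_predict(self, X, task_id):
        """
        A hypothetical companion sketch (names are illustrative, not from the original
        class): load the model saved by gbdt_train() for `task_id` and return predicted
        probabilities for a prepared feature matrix X. Assumes MODEL_PATH and joblib
        are available as above.
        """
        model_name = MODEL_PATH + task_id + "_model"
        grd = joblib.load(model_name)
        return grd.predict_proba(X)[:, 1]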
Example 23
class Blender(BaseEstimator, ClassifierMixin):
    def __init__(self, trained_clfs):
        self.clfs = trained_clfs
        # self.classifier = make_pipeline(OneHotEncoder(), DenseTransformer(),
        #                                 GradientBoostingClassifier())
        self.classifier = GradientBoostingClassifier()
        # self.classifier = make_pipeline(
        #     OneHotEncoder(), LogisticRegression(class_weight='auto'))

    def fit(self, data, target):
        # self.enc = LabelEncoder().fit(target)
        probs = self.transform_input(data)
        # self.classifier.fit(predictions, target)
        self.classifier.fit(probs, target)

    def predict(self, data):
        predictions = self.transform_input(data)
        return self.classifier.predict(predictions)

    def transform_input(self, data):
        probabilities = [clf.predict_proba(data) for clf in self.clfs]

        probabilities = np.array(probabilities)
        # features, samples = probabilities.shape
        n_clfs, samples, features = probabilities.shape
        probabilities = np.reshape(probabilities, (samples, n_clfs * features))
        probabilities[np.isnan(probabilities)] = 0
        return probabilities
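# A hedged usage sketch for Blender on synthetic data; the dataset and base models below are
# illustrative assumptions, not taken from the original code. The meta-features fed to the
# blending GradientBoostingClassifier are the concatenated predict_proba outputs of the bases.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=500, random_state=0)
X_base, X_blend, y_base, y_blend = train_test_split(X_demo, y_demo, random_state=0)
base_models = [RandomForestClassifier(random_state=0).fit(X_base, y_base),
               GradientBoostingClassifier(random_state=0).fit(X_base, y_base)]
blender = Blender(base_models)
blender.fit(X_blend, y_blend)
print(blender.predict(X_blend[:5]))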
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    testing_file = file('test.p', 'r')
    training_file = file('train.p', 'r')

    train = pickle.load(training_file)
    test = pickle.load(testing_file)

    testing_file.close()
    training_file.close()
    
    trainX = train[:,:-1]
    trainy = train[:,-1]
    
    testX = test[:,:-1]
    testy = test[:,-1]

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'GradientBoostingClassifier(n_estimators=1000)')
    clf = GradientBoostingClassifier(n_estimators=1000)
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'),accuracy_score(testy, prediction))


    model_save_file = file('gradient_1000.p', 'w')
    pickle.dump(clf, model_save_file)
    model_save_file.close()
    print 'All done'
Example 25
def partial_dependence(df, y):
    '''
    INPUT: df = feature DataFrame
           y = binary target variable (imbalanced classes)
    OUTPUT: partial dependence plots for the six most important features
            of a GradientBoostingClassifier.

    Oversamples the training split to balance the classes, applies the
    feature-engineering pipeline, fits the model, and plots partial
    dependence for the top features.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    X_test = feature_engineering.fit_transform(X_test.copy(), y_test)

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
Example 26
def predict(fea, df, t, t9):
    Un = df.columns == 'Blank'
    for f in fea:
        '''        
        try:
            df[(f+'_y')] = df[(f+'_x')] - df[(f+'_y')]
            print(1)
        except:
            pass
        '''
        Un = Un | (df.columns == f)
        Un = Un | (df.columns == (f+'_x'))
        Un = Un | (df.columns == (f+'_y'))
    Un = Un & (df.columns != 'New_y')    
    clf = GradientBoostingClassifier()
    y = df[t].label
    X = df[t].loc[:, Un]
    X_train, X_test, y_train, y_test=train_test_split(X, y, test_size = 0.9, random_state = 1)
    clf.fit(X_train, y_train)
    re = 'Testing AUC: \t' + str(roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))  
    print re
    re = 'September AUC: \t' + str(roc_auc_score(df[t9].label, clf.predict_proba(df[t9].loc[:, Un])[:, 1]))
    print re
    print(X.columns)
    print(clf.feature_importances_)
    return Un, clf
def run_gradient_boosting_classifier(data, _max_depth):
    (feature_train, feature_test, label_train, label_test) = train_test_split(data[:, 0:-1], data[:, -1].astype(int),
                                                                              test_size=0.25)
    # TODO: Vary Number of Estimators and Learning Rate
    gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50, max_depth=_max_depth, verbose = True)
    gbc.fit(feature_train, label_train)
    training_error = gbc.score(feature_train, label_train)
    #cross_validation_score = cross_val_score(gbc, feature_train, label_train, cv=10)
    testing_error = gbc.score(feature_test, label_test)

    print "Gradient Boosting Results for Max Depth:", _max_depth
    print "Training Accuracy:", training_error
    #print "10-fold Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (cross_validation_score.mean(), cross_validation_score.std() * 2)
    print "Testing Accuracy:", testing_error

    feature_importance = gbc.feature_importances_
    stddev = np.std([tree[0].feature_importances_ for tree in gbc.estimators_], axis=0)
    indices = np.argsort(feature_importance)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(len(feature_importance)):
        print("%d. feature %d (%f)" % (f + 1, indices[f], feature_importance[indices[f]]))

    plot_feature_importance(feature_importance, indices, stddev, "gradient-boosted-classifier-feature-importance-depth-" + str(_max_depth))
def test_oob_improvement():
    """Test if oob improvement has correct shape and regression test. """
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1, subsample=0.5)
    clf.fit(X, y)
    assert clf.oob_improvement_.shape[0] == 100
    # hard-coded regression test - change if modification in OOB computation
    assert_array_almost_equal(clf.oob_improvement_[:5], np.array([0.19, 0.15, 0.12, -0.12, -0.11]), decimal=2)
Example 29
    def __init__(self, estimator,
                 phase, 
                 n_jobs, cv_k_fold, parameters,
                 X_train, y_train,
                 X_test, y_test):
        # estimator : ensemble learner

        # cv : if train : get best parameter
        if phase == "train":
            clf = GradientBoostingClassifier()
            gscv = GridSearchCV(clf, parameters, 
                                verbose = 10, 
                                scoring = "f1",#scoring = "precision" or "recall"
                                n_jobs = n_jobs, cv = cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_
            
            clf.set_params(**gscv.best_params_)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

        estimator.set_params(**gscv.best_params_)
        self.estimator = estimator
        self.one_hot_encoding = None
Example 30
def mse_sklearn(x_train, x_test, y_train, y_test, n_estimators):
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     min_samples_leaf=MIN_SAMPLES_LEAF,
                                     max_depth=MAX_DEPTH)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    return f1_score(y_test, pred)
Example 31
def callback(*args):
    clear_text()
    algorithm = str(variable.get())
    if (algorithm == "K-Means Classifier"):
        knn = KNeighborsClassifier(n_neighbors=9)
        knn.fit(X_train, Y_train)
        acc_train = str(round(knn.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(knn.score(X_test, Y_test) * 100, 5))
        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + " %",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)
        for n_neighbors in neighbors_settings:
            # build the model
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(X_train, Y_train)
            # record training set accuracy
            training_accuracy.append(knn.score(X_train, Y_train))
            # record test set accuracy
            test_accuracy.append(knn.score(X_test, Y_test))
        plt.title("K-Means")
        #plt.figure(figsize=(8,8))
        plt.plot(neighbors_settings,
                 training_accuracy,
                 label="training accuracy")
        #plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()
        plt.savefig('knn_compare_model')

    elif (algorithm == "Decision Tree Classifier"):

        tree = DecisionTreeClassifier()
        tree.fit(X_train, Y_train)
        acc_train = str(round(tree.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(tree.score(X_test, Y_test) * 100, 5))
        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(tree)
        plt.savefig('feature_importance')
        plt.show()
        #plt.set_position(self, bottom, which='both')

    elif (algorithm == "Random Forest Classifier"):

        rf = RandomForestClassifier()
        rf.fit(X_train, Y_train)
        acc_train = str(round(rf.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(rf.score(X_test, Y_test) * 100, 5))
        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(rf)
        plt.savefig('feature_importance_rf')
        plt.show()

    elif (algorithm == "Gradient Boosting Classifier"):
        #clear_text()
        gb = GradientBoostingClassifier()
        gb.fit(X_train, Y_train)
        acc_train = str(round(gb.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(gb.score(X_test, Y_test) * 100, 5))
        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(gb)
        plt.savefig('feature_importance_gb')
        plt.show()

    elif (algorithm == "SV Classifier"):

        svc = SVC()
        svc.fit(X_train, Y_train)
        acc_train = str(round(svc.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(svc.score(X_test, Y_test) * 100, 5))
        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(svc)
        plt.savefig('feature_importance_svc')
        plt.show()

    elif (algorithm == "Logistics Regression"):

        logreg = LogisticRegression()
        logreg.fit(X_train, Y_train)
        acc_train = str(round(logreg.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(logreg.score(X_test, Y_test) * 100, 5))
        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plt.figure(figsize=(8, 6))
        plt.plot(logreg.coef_.T, 'o', label="C=1")
        plt.xticks(range(diabetes.shape[1]), diabetes_features, rotation=90)
        plt.hlines(0, 0, diabetes.shape[1])
        plt.ylim(-5, 5)
        plt.xlabel("Feature")
        plt.ylabel("Coefficient magnitude")
        plt.legend()
        plt.savefig('log_coef')
        plt.show()

    elif (algorithm == "Logistics Regression(C=150)"):

        logreg100 = LogisticRegression(C=150).fit(X_train, Y_train)
        acc_train = str(round(logreg100.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(logreg100.score(X_test, Y_test) * 100, 5))

        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plt.figure(figsize=(8, 6))
        plt.plot(logreg100.coef_.T, 'o', label="C=1")
        plt.xticks(range(diabetes.shape[1]), diabetes_features, rotation=90)
        plt.hlines(0, 0, diabetes.shape[1])
        plt.ylim(-5, 5)
        plt.xlabel("Feature")
        plt.ylabel("Coefficient magnitude")
        plt.legend()
        plt.savefig('log_coef100')
        plt.show()

    elif (algorithm == "Logistics Regression(C=0.01)"):

        logreg001 = LogisticRegression(C=0.01).fit(X_train, Y_train)
        acc_train = str(round(logreg001.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(logreg001.score(X_test, Y_test) * 100, 5))
        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plt.figure(figsize=(8, 6))
        plt.plot(logreg001.coef_.T, 'o', label="C=1")
        plt.xticks(range(diabetes.shape[1]), diabetes_features, rotation=90)
        plt.hlines(0, 0, diabetes.shape[1])
        plt.ylim(-5, 5)
        plt.xlabel("Feature")
        plt.ylabel("Coefficient magnitude")
        plt.legend()
        plt.savefig('log_coef001')
        plt.show()

    elif (algorithm == "Decision Tree Classifier(Depth=3)"):
        tree = DecisionTreeClassifier(max_depth=3, random_state=0)
        tree.fit(X_train, Y_train)
        acc_train = str(round(tree.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(tree.score(X_test, Y_test) * 100, 5))

        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(tree)
        plt.savefig('feature_importance')
        plt.show()

    elif (algorithm == "Random Forest Classifier(n_estimators=150)"):

        rf = RandomForestClassifier(n_estimators=150, random_state=0)
        rf.fit(X_train, Y_train)
        acc_train = str(round(rf.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(rf.score(X_test, Y_test) * 100, 5))

        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(rf)
        plt.savefig('feature_importance_rf')
        plt.show()

    elif (algorithm == "Gradient Boosting Classifier(Depth=1)"):

        gb1 = GradientBoostingClassifier(random_state=0, max_depth=1)
        gb1.fit(X_train, Y_train)
        acc_train = str(round(gb1.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(gb1.score(X_test, Y_test) * 100, 5))

        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(gb1)
        plt.savefig('feature_importance_gb1')
        plt.show()

    elif (algorithm == "Gradient Boosting Classifier(Learning_rate=0.01)"):

        gb2 = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
        gb2.fit(X_train, Y_train)
        acc_train = str(round(gb2.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(gb2.score(X_test, Y_test) * 100, 5))
        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(gb2)
        plt.savefig('feature_importance_gb2')
        plt.show()

    elif (algorithm == "SV Classifier(Random_state=42)"):

        svc = SVC(kernel='linear', random_state=42)
        svc.fit(X_train, Y_train)
        acc_train = str(round(svc.score(X_train, Y_train) * 100, 5))
        acc_test = str(round(svc.score(X_test, Y_test) * 100, 5))

        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(svc)
        plt.savefig('feature_importance_svc(42)')
        plt.show()

    elif (algorithm == "Min Max Scalar"):

        scaler = MinMaxScaler()
        svc = SVC()

        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        svc.fit(X_train_scaled, Y_train)
        acc_train = str(round(svc.score(X_train_scaled, Y_train) * 100, 5))
        acc_test = str(round(svc.score(X_test_scaled, Y_test) * 100, 5))
        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(svc)
        plt.savefig('feature_importance_svc(42)')
        plt.show()

    elif (algorithm == "SV Classifier(C=1000)"):

        svc = SVC(C=1000)
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        svc.fit(X_train_scaled, Y_train)
        acc_train = str(round(svc.score(X_train_scaled, Y_train) * 100, 5))
        acc_test = str(round(svc.score(X_test_scaled, Y_test) * 100, 5))

        label1 = tk.Label(root,
                          text=algorithm + "\n\nTraining Set: " + acc_train +
                          "%\n\nTest Set: " + acc_test + "%",
                          font=("arial", 13, "bold"))
        label1.place(x=300, y=180, width=700, height=400)

        plot_feature_importances_diabetes(svc)
        plt.savefig('feature_importance_svc(42)')
        plt.show()
# 
# Here are partial plots from a very simple model on the Titanic data.

# In[ ]:



titanic_data = pd.read_csv('../input/titanic/train.csv')
titanic_y = titanic_data.Survived
clf = GradientBoostingClassifier()
titanic_X_colns = ['PassengerId','Age', 'Fare',]
titanic_X = titanic_data[titanic_X_colns]
my_imputer = Imputer()
imputed_titanic_X = my_imputer.fit_transform(titanic_X)

clf.fit(imputed_titanic_X, titanic_y)
titanic_plots = plot_partial_dependence(clf, features=[1,2], X=imputed_titanic_X, 
                                        feature_names=titanic_X_colns, grid_resolution=8)


# These might seem surprising at first glance.  But they show some interesting insights:
# * Being young increased your odds of survival. This is consistent with historical accounts that women and children were taken off the Titanic first.
# * People who paid more had better odds of survival. Higher fares bought a cabin closer to the top of the boat, which may have meant better odds of reaching a lifeboat.
# 
# # Conclusion
# Partial dependence plots are a great way (though not the only way) to extract insights from complex models.  These can be incredibly powerful for communicating those insights to colleagues or non-technical users. 
# 
# There are a variety of opinions on how to interpret these plots when they come from non-experimental data.  Some claim you can conclude nothing about cause-and-effect relationships from data unless it comes from experiments. Others are more positive about what can be learned from non-experimental data (also called observational data). It's a divisive topic in the data science world, beyond the scope of this tutorial.
# 
# However, most agree that these plots are useful for understanding your model. Given the messiness of most real-world data sources, they are also a good sanity check that your model is capturing realistic patterns.
# 
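# A hedged aside: in recent scikit-learn releases the ensemble-specific `plot_partial_dependence`
# used above has been superseded by the `sklearn.inspection` module. A minimal sketch of the same
# two Titanic plots with that API, assuming a recent scikit-learn is installed:

# In[ ]:


from sklearn.inspection import PartialDependenceDisplay

PartialDependenceDisplay.from_estimator(clf, imputed_titanic_X, features=[1, 2],
                                        feature_names=titanic_X_colns,
                                        grid_resolution=8)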
Example 33
if LabelEncoder_mapping is not None:
    df_val = input_val_data.apply(lambda x: LabelEncoder_mapping[x.name].transform(x))
else:
    df_val = input_val_data[:]

independent_variable_val = df_val[independent_variable_name].values
dependent_variable_val = df_val[dependent_variable_name].values

############################################################################################################
### Build Random Forest - Development #######################

### Default [add later #oob_score=True, ]

#clf = 	GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(independent_variable, dependent_variable)
clf = 	GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf = 	clf.fit(independent_variable, dependent_variable)


#Selecting good features by Mean decrease impurity. 
model 	= GradientBoostingClassifier()
scores 	= selecting_good_features(independent_variable, dependent_variable, independent_variable_name, 100, 0.3)
print sorted([(round(np.mean(score), 4), feat) for feat, score in scores.items()], reverse=True)

#Running a different number of trees and see the effect of that on the accuracy of the prediction
iterate_tree_in_Gradient_Boosting_Classifier(25, independent_variable, dependent_variable)
plt.savefig('Number of Trees vs Accuracy.png')
plt.show()

#Generate a simple plot of training learning curve
title 		= "Learning Curves - Gradient Boosting Classifier"
cv 			= model_selection.ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)	#n_splits:30 (Try)
Example 34
clt_ext.fit(X, y)
score_ext = cross_val_score(clt_ext, X, y, cv=5).mean()
print(score_ext)

# In[ ]:

#Gradient Boost
import warnings
warnings.filterwarnings('ignore')

clf_gb = GradientBoostingClassifier(n_estimators=1000,
                                    learning_rate=0.1,
                                    max_depth=3,
                                    subsample=0.5,
                                    random_state=0)
clf_gb = clf_gb.fit(X, y)
score_gb = cross_val_score(clf_gb, X, y, cv=5).mean()
print(score_gb)

# In[ ]:

#Ada Boost
clf_ada = AdaBoostClassifier(n_estimators=400, learning_rate=0.1)
clf_ada.fit(X, y)
score_ada = cross_val_score(clf_ada, X, y, cv=5).mean()
print(score_ada)

# In[ ]:

#Extreme Gradient Boosting
clf_xgb = xgb.XGBClassifier(max_depth=2,
sgd_clf = SGDClassifier()
results = cross_val_score(sgd_clf, X_train_smote, y_train_smote, cv=kfold, scoring='accuracy')
print("Training Accuracy: %.3f" % (results.mean()*100.0))
sgd_clf.fit(X_train_smote, y_train_smote)
predictions = sgd_clf.predict(X_test)
print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0))
print('Confusion Matrix:\n',confusion_matrix(y_test, predictions))
print('Classification report\n', classification_report(y_test, predictions))

"""###**Gradient Boosting Classifier**"""

kfold = KFold(n_splits=10)
gb_clf = GradientBoostingClassifier()
results = cross_val_score(gb_clf, X_train_smote, y_train_smote, cv=kfold, scoring='accuracy')
print("Training Accuracy: %.3f" % (results.mean()*100.0))
gb_clf.fit(X_train_smote, y_train_smote)
predictions = gb_clf.predict(X_test)
print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0))
print('Confusion Matrix:\n',confusion_matrix(y_test, predictions))
print('Classification report\n', classification_report(y_test, predictions))

"""###**Random Forest**"""

from sklearn.ensemble import RandomForestClassifier
kfold = KFold(n_splits=10)
rf_clf = RandomForestClassifier()
results = cross_val_score(rf_clf, X_train_smote, y_train_smote, cv=kfold, scoring='accuracy')
print("Training Accuracy: %.3f" % (results.mean()*100.0))
rf_clf.fit(X_train_smote, y_train_smote)
predictions = rf_clf.predict(X_test)
print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0))
Example 36
        sys.exit(-1)
    else:
        dataset = sys.argv[1]
        store_dir = sys.argv[2]

    # Create several array for the data
    if dataset == 'random':
        X, y, T, valid = misc.generate_samples(N_SAMPLES, N_FEATURES, RND_SEED)
    else:
        raise ValueError('The dataset is not known. The possible choices are:'
                         ' random')

    # Fit the sklearn gradient boosting
    clf = GradientBoostingClassifier()
    clf.set_params(**params_sklearn)
    clf.fit(X, y)

    # Fit the xgboost gradient boosting
    xgb_training = xgb.DMatrix(
        X,
        label=y,
        missing=None,
        weight=None,
        silent=False,
        feature_names=None,
        feature_types=None)
    n_est = params_xgboost.pop('n_estimators')
    bst = xgb.train(params_xgboost, xgb_training, n_est)

    # Fit the LightGBM gradient boosting
    max_bin = params_lgbm.pop('max_bin')
Example 37
# 1. Train and predict with a single decision tree
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()  # initialize the decision tree classifier with default settings
dtc.fit(X_train, y_train)  # fit the model on the training split
dtc_y_predict = dtc.predict(X_test)  # predict on the test features with the trained decision tree

# 2. Train and predict with a random forest ensemble
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

# 3. Train and predict with gradient boosted decision trees
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

# Predictive performance of the ensemble models on Titanic survival
from sklearn.metrics import classification_report

# Accuracy, precision, recall and F1 of the single decision tree on the test set
print('Decision tree accuracy:', dtc.score(X_test, y_test))
print(classification_report(dtc_y_predict, y_test))

# Accuracy, precision, recall and F1 of the random forest on the test set
print('Random forest accuracy:', rfc.score(X_test, y_test))
print(classification_report(rfc_y_pred, y_test))

# Accuracy, precision, recall and F1 of the gradient boosted trees on the test set
print('Gradient boosting accuracy:', gbc.score(X_test, y_test))
Example 38
]

modelnum = 0
for elo_name, elo_df in train_df.groupby(train_df['elo_groups']):
    msg('working on elo group %s, of size %i' % (elo_name, elo_df.shape[0]))

    msg('computing perfect-move model')
    gbc = GradientBoostingClassifier(min_samples_split=500,
                                     min_samples_leaf=300,
                                     n_estimators=NUM_ESTIMATORS,
                                     verbose=1,
                                     subsample=0.5,
                                     learning_rate=0.2)
    X = elo_df[features]
    y = (elo_df['clipped_movergain'] == 0)
    gbc.fit(X, y)
    joblib.dump([elo_name, 1.0, gbc], '%s%i.p' % (blundermodel_dir, modelnum))
    modelnum = modelnum + 1

    for mg_quant in mg_quants:
        msg('computing mg_quant %f' % mg_quant)
        gbr = GradientBoostingRegressor(loss='quantile',
                                        alpha=mg_quant,
                                        min_samples_split=500,
                                        min_samples_leaf=300,
                                        n_estimators=NUM_ESTIMATORS,
                                        verbose=1,
                                        subsample=0.5,
                                        learning_rate=0.2)
        imperfect_df = elo_df[elo_df['clipped_movergain'] < 0]
        X = imperfect_df[features]
Example 39
def gradBoost(X, y):
    from sklearn.ensemble import GradientBoostingClassifier
    gradientBoosting = GradientBoostingClassifier()
    gradientBoosting.fit(X, y)
    return gradientBoosting
Example 40
# List of comments
comments = []

# https://stackoverflow.com/questions/49100615/nltk-detecting-whether-a-sentence-is-interogative-or-not
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()
posts_text = [post.text for post in posts]
#divide train and test in 80 20
train_text = posts_text[:int(len(posts_text) * 0.8)]
test_text = posts_text[int(len(posts_text) * 0.8):]
#Get TFIDF features
vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             min_df=0.001,
                             max_df=0.7,
                             analyzer='word')
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
y = [post.get('class') for post in posts]
y_train = y[:int(len(posts_text) * 0.8)]
y_test = y[int(len(posts_text) * 0.8):]
gb = GradientBoostingClassifier(n_estimators=400, random_state=0)
gb.fit(X_train, y_train)

question_comments = []
for comment in comments:
    type_of_comment = gb.predict(vectorizer.transform([comment]))
    if (type_of_comment == 'ynQuestion' or type_of_comment == 'whQuestion'
            or '?' in comment):
        question_comments.append(comment)
question_comments
Example 41
    'U_behaviors_sum10', 'Item_sale10', 'Item_sale5', 'Item_sale3',
    'Item_sale1', 'car5', 'car4', 'car3', 'car2', 'car1', 'buy5', 'buy4',
    'buy3', 'buy2', 'buy1', 'I_order10', 'I_order5', 'I_order3', 'I_order1',
    'I_buyer10', 'I_buyer5', 'I_buyer3', 'I_buyer1', 'behav1', 'behav2',
    'behav3', 'behav4', 'last_time'
]

df_train = pd.read_csv("train_feature.csv")
df_validation = pd.read_csv("validation_feature.csv")
# Data normalization

ui = df_train[["user_id", "item_id"]]
samples = df_train[features]
target = df_train["tag"]
classifier = GradientBoostingClassifier(n_estimators=200,
                                        learning_rate=1.0,
                                        max_depth=5,
                                        random_state=0)
classifier.fit(samples, target)  # learn from the training data; no return value needed

validation_feature = df_validation[features]
x = classifier.predict(validation_feature)  # predict labels for the validation data
print x
validation_ui = df_validation[["user_id", "item_id"]]
validation_ui["tag"] = x
validation_result = validation_ui[validation_ui.tag == 1][[
    "user_id", "item_id"
]]
os.chdir('..')
validation_result.to_csv("predict_v_Gbrt.csv", index=False)
from time import time

fold1_df = load_dataframe(filename='fold1_NA_features.dat')
fold2_df = load_dataframe(filename='fold2_NA_features.dat')

del fold1_df['id']
del fold2_df['id']

n_features = int(len(fold1_df.columns) / 4)
p0 = time()
clf = GradientBoostingClassifier('deviance',
                                 learning_rate=0.05,
                                 n_estimators=100,
                                 max_features=n_features)

clf.fit(fold1_df.iloc[:, 1:], fold1_df.iloc[:, 0])
preds_ens = clf.predict_proba(fold2_df.iloc[:, 1:])[:, 1]
print(time() - p0)

## Ensemble the predictions
true_values = fold2_df['label']
df, best_index = f1_scores_plot(preds_ens, true_values)
df['f1_score'][best_index]  #Li

### Check performance on fold3
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']
dw_cols = [x for x in fold1_df.columns if x[-2:] == 'dw' and x[:3] == 'pca']
fold3_df[dw_cols] = np.log(np.array(fold3_df[dw_cols]))
fold3_df = fold3_df.replace([-np.inf], 0)
x_test = fold3_df[[x for x in fold3_df.columns if x != 'label']]
Exemplo n.º 43
0
import math
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors


class qbc_dcw:

    def __init__(self, distance_scope=2, each_size=30):
        assert distance_scope >= 2, "distance_scope must be at least 2"
        assert each_size > 0, "each_size must be positive"
        self.distance_scope = distance_scope
        self.each_size = each_size
        self._X_train = None
        self._y_train = None
        
    def fit(self,X_train,y_train):
        assert X_train.shape[0]==y_train.shape[0],"the size of X_train must be equal to the size of y_train"
        
        self._X_train = X_train
        self._y_train = y_train

        choice_list = [i for i in range(len(self._X_train))]
        random.shuffle(choice_list)
    
        num1 = int(len(self._X_train)/3)
        num2 = int(len(self._X_train)*2/3)
    
        x_init1 = pd.concat([self._X_train.iloc[choice_list[:num1]]])
        x_init2 = pd.concat([self._X_train.iloc[choice_list[num1:num2]]])
        x_init3 = pd.concat([self._X_train.iloc[choice_list[num2:]]])
    
        y_init1 = pd.concat([self._y_train.iloc[choice_list[:num1]]])
        y_init2 = pd.concat([self._y_train.iloc[choice_list[num1:num2]]])
        y_init3 = pd.concat([self._y_train.iloc[choice_list[num2:]]])
    
        self.gb_clf1 = GradientBoostingClassifier()
        self.gb_clf2 = GradientBoostingClassifier()
        self.gb_clf3 = GradientBoostingClassifier()

        self.gb_clf1.fit(x_init1,y_init1)
        self.gb_clf2.fit(x_init2,y_init2)
        self.gb_clf3.fit(x_init3,y_init3)
        
        return self
    
    def predict(self,X_predict):
        assert self._X_train is not None and self._y_train is not None,"must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1],"the feature number of X_predict must be equal to X_train"
        
        scores_sort = self.__predict(X_predict)
        scores_sort = np.array(scores_sort)
        scores_sorted = np.argsort(scores_sort)
        
        X_output = X_predict.iloc[scores_sorted[-self.each_size:]]
        
        return X_output
                                    
    def __scores_func(self,proba):
        scores_sort = []
        # proba looks like [[0.1, 0.9], [0.4, 0.6], [0.7, 0.3]]
        for sc in proba:
            col = 0
            for p in sc:
                # avoid log(0)
                if p in [0,1]:
                    col += 0
                else:
                    col += -p*math.log(p,math.e)
            scores_sort.append(col)
        return scores_sort
    
    def __predict(self,x_choice):
                                
        proba1 = self.gb_clf1.predict_proba(x_choice)
        proba2 = self.gb_clf2.predict_proba(x_choice)
        proba3 = self.gb_clf3.predict_proba(x_choice)
        
        scores1_sort = self.__scores_func(proba1)  
        scores2_sort = self.__scores_func(proba2)
        scores3_sort = self.__scores_func(proba3)
        
        x_all = pd.concat([self._X_train,x_choice])                                 
        neigh = NearestNeighbors()
        neigh.fit(x_all)
        distance_number = neigh.kneighbors([x_choice.iloc[i] for i in range(len(x_choice))], self.distance_scope, return_distance=False)
                                           
        score_weight1 = self.gb_clf1.score(self._X_train,self._y_train)
        score_weight2 = self.gb_clf2.score(self._X_train,self._y_train)
        score_weight3 = self.gb_clf3.score(self._X_train,self._y_train)
        
        scores_sort = []
        for i in range(len(scores1_sort)):
            diversity = pairwise_distances([x_choice.iloc[i]],x_all.iloc[distance_number[i][1:2]],metric="cosine").sum()
            col = max(scores1_sort[i]*score_weight1,scores2_sort[i]*score_weight2,scores3_sort[i]*score_weight3)+100*diversity
            scores_sort.append(col)

        return scores_sort
    
    def __repr__(self):
        return "qbc_ddbcw(distance_scope=%d,each_size=%d)"%(self.distance_scope,self.each_size)
Exemplo n.º 44
0
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting the Gradient Boosting classifier to the Training set
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier(loss='deviance',
                                        n_estimators=500,
                                        learning_rate=0.001,
                                        criterion='friedman_mse',
                                        random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
# y_pred1 duplicates y_pred; both come from the same classifier
y_pred1 = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
Exemplo n.º 45
0
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

data = load_breast_cancer()
X = data.data
y = data.target

skf = StratifiedKFold(n_splits = 5  )
p_rf = np.zeros(y.shape[0])


for n in range(10, 100, 10):
    for train, test in skf.split(X, y):
        x_train = X[train]
        x_test = X[test]
        y_train = y[train]
        y_test = y[test]

        # use the outer-loop value so the number of trees actually varies
        clf = GradientBoostingClassifier(n_estimators=n, learning_rate=1.0, max_depth=1, random_state=0)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        print(cm)

    # accuracy on the last fold for this number of estimators
    print('score for', n, ':', accuracy_score(y_test, y_pred))
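
# A more direct way to get a per-n mean cross-validated accuracy
# (a hypothetical addition, not part of the original snippet):
from sklearn.model_selection import cross_val_score
for n in range(10, 100, 10):
    clf = GradientBoostingClassifier(n_estimators=n, learning_rate=1.0, max_depth=1, random_state=0)
    print(n, cross_val_score(clf, X, y, cv=skf).mean())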



    max_depth=1,
    #     random_state=0,
    # max_leaf_nodes=10
)

q = 10
scores = cross_val_score(clfGB, x_treino, y_treino.ravel(), cv=q)

print(
    #         f'k = {k}',
    f'scores:{scores}',
    f'mean accuracy = {round(100*mean(scores),2)} %')

clfRF.fit(x_treino, y_treino.ravel())

clfGB.fit(x_treino, y_treino.ravel())

atributos_selecionados = [
    'sexo_ ',
    'sexo_M',
    'sexo_F',
    'sexo_N',
    'possui_email',
    'local_onde_reside',
    'tipo_endereco',
    'idade',
    'estado_civil',
    'qtde_dependentes',
    'dia_vencimento',
    'possui_telefone_residencial',
    'meses_na_residencia',
Exemplo n.º 47
0
X = iris.data
y = iris.target
X
y

# Split and Randomize Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=33)
X_train
X_test

# Step 2 Define Classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
clf2 = GradientBoostingClassifier()
# Step 3 Train the Classifier
clf2.fit(X_train, y_train)
pred2 = clf2.predict(X_test)
# Step 4: Evaluate the Classifier
print("GBC accuracy score : ", accuracy_score(y_test, pred2))

# Step 5: Save the Model
from sklearn.externals import joblib
joblib.dump(clf2, 'iris.pkl')
# Step 6: Load the Model & Prediction
clf = joblib.load('iris.pkl')
clf
Exemplo n.º 48
0
         markerfacecolor='blue',
         markersize=10)
plt.title('Variation of accuracy by the number of trees')
plt.xlabel('Number of trees')
plt.ylabel('Mean Accuracy')

n = np.array(np.where(
    accuracy == np.max(accuracy))) + 1  # because the index starts at 0
n = n[0, 0]
n

gbm = GradientBoostingClassifier(n_estimators=5,
                                 learning_rate=0.3,
                                 max_depth=2)
# Train the model using the training sets and check score
gbm.fit(all_X, all_y)
#Predict Output
y_pred = gbm.predict(test_X)
print(confusion_matrix(test_y, y_pred))
print(classification_report(test_y, y_pred))

scores = cross_val_score(gbm, all_X, all_y, cv=kfold)
#scores.sort()
accuracy = scores.mean()

print("The accuracy for GBM is ", accuracy)

submit("submission_gbm.csv", gbm)

######################################### SVM #########################################
                                                    test_size=0.1,
                                                    random_state=508,
                                                    stratify=got_target)

# Building a gbm
gbm = GradientBoostingClassifier(
    loss='deviance',
    learning_rate=1.5,
    n_estimators=100,
    max_depth=3,
    criterion='friedman_mse',
    warm_start=False,
    random_state=508,
)

gbm_basic_fit = gbm.fit(X_train, y_train)

gbm_basic_predict = gbm_basic_fit.predict(X_test)

# Training and Testing Scores
print('Training Score', gbm_basic_fit.score(X_train, y_train).round(4))
print('Testing Score:', gbm_basic_fit.score(X_test, y_test).round(4))

cv_lr_3 = cross_val_score(gbm, got_data, got_target, cv=3, scoring='roc_auc')

print(pd.np.mean(cv_lr_3))

#########################
# Hyper Parameter Tuning
#########################
Exemplo n.º 50
0
    # use greedy method to find two subsets: true inliers and true outliers
    inliers, dummy = greedy_removal(all_reps, 0.5)
    dummy, outliers = greedy_removal(all_reps, 1.0)

    print len(inliers), len(outliers), len(all_reps)

    if True:
        # train a classifier (gradient boosting; the SVM and random forest variants are commented out)
        # with the true inliers and true outliers
        X_inliers = [all_reps[key] for key in inliers]
        X_outliers = [all_reps[key] for key in outliers]
        X = np.vstack((X_inliers, X_outliers))
        y = np.hstack(([0 for key in inliers], [1 for key in outliers]))
        #clf = svm.SVC()
        #clf = RandomForestClassifier(n_estimators=128)
        clf = GradientBoostingClassifier(n_estimators=128, learning_rate=1.0)
        clf = clf.fit(X, y)
        print X, y

        # perform classification using the trained classifier
        results = clf.predict(all_reps.values())
        print results

        inliers = [all_reps.keys()[i] for i in range(len(results)) if results[i] == 0]
        outliers = [all_reps.keys()[i] for i in range(len(results)) if results[i] == 1]

        print len(inliers), len(outliers)

    images_dir = os.path.dirname(os.path.realpath(all_reps.keys()[0]))
    main_person_dir = os.path.join(images_dir, 'main_person')
    other_persons_dir = os.path.join(images_dir, 'other_persons')
Exemplo n.º 51
0
# In[58]:


# Boosting


# In[69]:


#Boosting on oversampled data
from sklearn.ensemble import GradientBoostingClassifier
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_t, y_t)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_t, y_t)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))


# In[60]:


gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=0.75, max_features=2, max_depth=2, random_state=0)
gb_clf.fit(X_t, y_t)

print("Learning rate: ", learning_rate)
print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_t, y_t)))
print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_val)
acc_sgd = round(accuracy_score(y_pred, y_val) * 100, 2)
print "MODEL-9: Accuracy of Stochastic Gradient Descent : ", acc_sgd

#OUTPUT:-
#MODEL-9: Accuracy of Stochastic Gradient Descent :  71.07

#MODEL-10) Gradient Boosting Classifier
#------------------------------------------
from sklearn.ensemble import GradientBoostingClassifier

gbk = GradientBoostingClassifier()
gbk.fit(x_train, y_train)
y_pred = gbk.predict(x_val)
acc_gbk = round(accuracy_score(y_pred, y_val) * 100, 2)
print "MODEL-10: Accuracy of GradientBoostingClassifier : ", acc_gbk

#OUTPUT:-
#MODEL-10: Accuracy of GradientBoostingClassifier :  84.77

#Let's compare the accuracies of each model!

models = pd.DataFrame({
    'Model': [
        'Logistic Regression', 'Gaussian Naive Bayes',
        'Support Vector Machines', 'Linear SVC', 'Perceptron', 'Decision Tree',
        'Random Forest', 'KNN', 'Stochastic Gradient Descent',
        'Gradient Boosting Classifier'
Exemplo n.º 53
0
param['bst:max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 4

plst = param.items() + [('eval_metric', '[email protected]')]

watchlist = [(xgmat, 'train')]
# boost 10 trees
num_round = 10
print('loading data end, start to boost trees')
print("training GBM from sklearn")
tmp = time.time()
gbm = GradientBoostingClassifier(n_estimators=num_round,
                                 max_depth=6,
                                 verbose=2)
gbm.fit(data, label)
print("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
#raw_input()
print("training xgboost")
threads = [1, 2, 4, 16]
for i in threads:
    param['nthread'] = i
    tmp = time.time()
    plst = param.items() + [('eval_metric', '[email protected]')]
    bst = xgb.train(plst, xgmat, num_round, watchlist)
    print("XGBoost with %d thread costs: %s seconds" %
          (i, str(time.time() - tmp)))

print('finish training')
X_test = np.array(X_test).astype(np.float)

y_test = temp[838860:1048575, 4]
y_test = np.array(y_test).astype(np.float)

print 'Training data shape: ', X_train.shape
print 'Training labels shape: ', y_train.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape

# For comparison, compute PCA
pca = PCA(n_components=4)
Xtr = pca.fit_transform(X_train)  # project the training data onto the principal components
Xts = pca.transform(X_test)  # apply the same fitted components to the test data

#clf2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0) Train Accuracy = 0.921114 Test Accuracy = 0.878907
#clf2 = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05,max_depth=3, random_state=0) Train Accuracy = 0.983857 Test Accuracy = 0.879990
#clf2 = GradientBoostingClassifier(n_estimators=400, learning_rate=0.02,max_depth=3, random_state=0) #Train Accuracy = 0.978574 Test Accuracy = 0.882669

clf2 = GradientBoostingClassifier(
    n_estimators=800, learning_rate=0.4, max_depth=3,
    random_state=0)  #Train Accuracy = 0.979995 Test Accuracy = 0.877224

clf2.fit(Xtr, y_train)
accut = clf2.score(Xts, y_test)
accutr = clf2.score(Xtr, y_train)
print "-------------------------------------Train Accuracy = %f Test Accuracy = %f " % (
    accutr, accut)
Exemplo n.º 55
0
Arquivo: main.py Projeto: wsqat/DM
def gradient_boosting_classifier(train_x, train_y):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(train_x, train_y)
    print model.feature_importances_  # show each feature's importance score; larger means more important
    return model
Exemplo n.º 56
0
    for j in corr_mat:
        if (i == j):
            continue

        else:
            if (corr_mat[i][j] > 0.2):
                a.add(i)
print(a)

sve = SVC()
sve.fit(data_pd, Y_train)
print(sve.score(data_pd1, Y_test))
print(sve.score(data_pd, Y_train))

grb = GradientBoostingClassifier()
grb.fit(data_pd, Y_train)
print(grb.score(data_pd1, Y_test))
print(grb.score(data_pd, Y_train))

cor_matt = data_pd.corr()
eig_vals, eig_vecs = np.linalg.eig(cor_matt)
#print(eig_vals)
#print('sdaddddddddddddddd')
#print(eig_vecs)
'''fitting and transforming PCA'''
pca = PCA(n_components=9)
train_features = pca.fit_transform(data_pd)
test_features = pca.transform(data_pd1)

sve1 = SVC()
sve1.fit(train_features, Y_train)
Exemplo n.º 57
0
        rf.fit(X_train, y_train)
        rf_enc.fit(rf.apply(X_train))
        rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

        y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
        fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
        
        print('rf_lm')
        auc_rf_lm += auc(fpr_rf_lm, tpr_rf_lm)
        score_rf_lm += cal_score(fpr_rf_lm, tpr_rf_lm)
        print([cal_score(fpr_rf_lm, tpr_rf_lm), auc(fpr_rf_lm, tpr_rf_lm)])

        grd = GradientBoostingClassifier(n_estimators=n_estimator)
        grd_enc = OneHotEncoder()
        grd_lm = LogisticRegression()
        grd.fit(X_train, y_train)
        grd_enc.fit(grd.apply(X_train)[:, :, 0])
        grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

        y_pred_grd_lm = grd_lm.predict_proba(
           grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
        fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
        
        print('grd_lm')
        score_grd_lm += cal_score(fpr_grd_lm, tpr_grd_lm)
        auc_grd_lm += auc(fpr_grd_lm, tpr_grd_lm)
        print([cal_score(fpr_grd_lm, tpr_grd_lm), auc(fpr_grd_lm, tpr_grd_lm)])


        # The gradient boosted model by itself
        y_pred_grd = grd.predict_proba(X_test)[:, 1]
Exemplo n.º 58
0
#test_data = preprocessing.normalize(test_data, norm='l2')

#-----------------------------------------------------------------------------------
#clf1 = linear_model.LogisticRegression(random_state=1)
clf = GradientBoostingClassifier(
    n_estimators=2500,  #learning_rate=1.0,
    verbose=1,
    random_state=1).fit(train_data, train_label)
#clf3 = GaussianNB()
#eclf1 = VotingClassifier(estimators=[('lr', clf1), ('gb', clf2), ('gnb', clf3)], voting='hard')
#eclf1 = eclf1.fit(train_data, train_label)
#result = eclf1.predict_prob()
#-----------------------------------------------------------------------------------

#clf = AdaBoostClassifier(n_estimators=3000)
clf.fit(train_data, train_label)

#clf = BaggingClassifier(n_estimators = 2000)
#clf.fit(train_data, train_label)

#eclf2 = VotingClassifier(estimators=[('lr', clf1), ('gb', clf2), ('gnb', clf3)], voting='soft')

#clf  = RandomForestClassifier(n_estimators=6000, max_depth = 4, verbose=1).fit(train_data, train_label)
#knn = neighbors.KNeighborsClassifier()
#logistic = linear_model.LogisticRegression()
#clf = svm.SVC(probability = True)
#clf = tree.DecisionTreeClassifier()

#print('KNN score: %f' % knn.fit(train_data, train_label).score(valid_data, valid_label))
#result = knn.fit(train_data, train_label).predict_proba(test_data)
#train_data = train_data[0:5000,:]
Exemplo n.º 59
0
def gradient_boosting_classifier(X_train, y_train):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200, random_state=2)
    model.fit(X_train, y_train)
    return model
Exemplo n.º 60
0
X_val = test[features_selected]
y_val = test['Label']

#Define lists to save results for later-plotting
score_ne_cv = np.zeros(20)
score_ne_self = np.zeros(20)

ams_ne_cv = np.zeros(20)
ams_ne_self = np.zeros(20)

n_estimators = range(10,210,10)

#Vary number of estimators from 10 to 200
for i in range(20):
    clf = set_classifier(n_estimators=(i+1)*10)
    clf.fit(X_train, y_train)
    result_self = clf.predict(X_train)
    s_self = (result_self == 's')
    b_self = (result_self == 'b')

    result_cv = clf.predict(X_val)
    s_cv = (result_cv == 's')
    b_cv = (result_cv == 'b')

    score_ne_self[i] = clf.score(X_train,y_train)
    score_ne_cv[i] = clf.score(X_val,y_val)

    ams_ne_self[i] = AMS(s_self.sum(), b_self.sum())
    ams_ne_cv[i] = AMS(s_cv.sum(), b_cv.sum())
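
# AMS() used above is defined elsewhere in the original script. For reference, a
# minimal sketch of the approximate median significance from the HiggsML challenge,
# which that helper presumably implements (an assumption, not the original code):
import math

def AMS_sketch(s, b, b_reg=10.0):
    """Approximate median significance; b_reg is the challenge's constant regularization term."""
    return math.sqrt(2.0 * ((s + b + b_reg) * math.log(1.0 + s / (b + b_reg)) - s))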

#Plot scores as a function of the number of estimators