Example #1
File: train.py Project: DenXX/irlab
def train(targets, features, model_file, params):
    model = GradientBoostingRegressor(**params)
    print "Training hard..."
    model.fit(features, targets)
    print "Saving model..."
    pickle.dump(model, open(model_file, 'wb'))
    return model
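
As a usage note, a minimal sketch of loading the pickled model back for prediction (reusing the names from the snippet above):

import pickle

with open(model_file, 'rb') as f:        # model_file: path written by train()
    model = pickle.load(f)
predictions = model.predict(features)    # features: same feature matrix layout used in training
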
    def train(self, x, y, param_names, **kwargs):
        start = time.time()
        scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

        # Check that each input is between 0 and 1
        self._check_scaling(scaled_x=scaled_x)

        if self._debug:
            print "Shape of training data: ", scaled_x.shape
            print "Param names: ", self._used_param_names
            print "First training sample\n", scaled_x[0]
            print "Encode: ", self._encode

        # Do a random search
        max_features, learning_rate, max_depth, min_samples_leaf, n_estimators = self._random_search(random_iter=100,
                                                                                                     x=scaled_x, y=y)
        # Now train model
        gb = GradientBoostingRegressor(loss='ls',
                                       learning_rate=learning_rate,
                                       n_estimators=n_estimators,
                                       subsample=1.0,
                                       min_samples_split=2,
                                       min_samples_leaf=min_samples_leaf,
                                       max_depth=max_depth,
                                       init=None,
                                       random_state=self._rng,
                                       max_features=max_features,
                                       alpha=0.9,
                                       verbose=0)
        gb.fit(scaled_x, y)
        self._model = gb

        duration = time.time() - start
        self._training_finished = True
        return duration
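
The _random_search helper called above is not shown; purely as an illustration (hyperparameter ranges, CV scheme, and scoring are assumptions, not the project's actual search), a random search over the same five parameters might look roughly like this:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

def random_search_gbr(x, y, random_iter=100, rng=None):
    # Sample hyperparameter combinations at random and keep the best CV score.
    rng = np.random.RandomState(0) if rng is None else rng
    best_score, best_params = -np.inf, None
    for _ in range(random_iter):
        params = dict(
            max_features=rng.uniform(0.1, 1.0),
            learning_rate=10 ** rng.uniform(-3, 0),
            max_depth=rng.randint(2, 9),
            min_samples_leaf=rng.randint(1, 21),
            n_estimators=rng.randint(50, 501),
        )
        score = cross_val_score(GradientBoostingRegressor(**params), x, y,
                                cv=3, scoring='neg_mean_squared_error').mean()
        if score > best_score:
            best_score, best_params = score, params
    return best_params
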
Example #3
    def cross_val_cols(self, n_folds = 3):
        """
        Takes in: number of folds
        
        Prints out RMSE score and stores the results in self.results
        """

        cv = KFold(n = self.X_train.shape[0], n_folds = n_folds)
        gbr = GradientBoostingRegressor(**self.params)
        self.med_error = []
        self.rmse_cv = []
        self.pct_error=[]
        self.results = {'pred': [],
                   'real': []}
        dfFeatures = []
        for train, test in cv:
            gbr.fit(self.X_train[train], self.y_train[train])
            dfFeatures += [unencode(pd.DataFrame(columns=final_cols[:-1], data=self.X_train[test]))]
            pred = gbr.predict(self.X_train[test])
            # Back-transform predictions and targets before computing the error metrics
            predExp = np.power(pred, 10)
            testExp = np.power(self.y_train[test], 10)
            medError = median_absolute_error(predExp, testExp)
            percentError = np.median([np.fabs(p - t) / t for p, t in zip(predExp, testExp)])
            error = mean_squared_error(predExp, testExp)**0.5
            self.inFeatures=(self.X_train[test])
            self.results['pred'] += list(predExp)
            self.results['real'] += list(testExp)
            self.rmse_cv += [error]
            self.med_error+=[medError]
            self.pct_error+=[percentError]
        print 'Abs Median Error:', np.mean(self.med_error)
        print 'Abs Percent Error:', np.mean(self.pct_error)
        print 'Mean RMSE:', np.mean(self.rmse_cv)
        self.valDf = pd.concat(dfFeatures)
        self.valDf= self.valDf.reset_index().drop('index', axis = 1)
        self.valDf['pred']=self.results['pred']
        self.valDf['real']=self.results['real']
        return self.valDf
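
For comparison, a minimal sketch of the same cross-validated RMSE computed with the current scikit-learn API (the params dict and the shuffling behaviour are assumptions):

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score

def rmse_cv(X, y, params, n_folds=3):
    # Cross-validated RMSE using the modern KFold / cross_val_score API.
    gbr = GradientBoostingRegressor(**params)
    cv = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    neg_mse = cross_val_score(gbr, X, y, cv=cv, scoring='neg_mean_squared_error')
    return np.sqrt(-neg_mse).mean()
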
def train_model(features, label, params):
    #Preprocessing
    #scaled_features = preprocessing.scale(features);
    scaled_features  = features;

    total_rmse  = 0.0;
    count       = 0;

    kf          = KFold(len(scaled_features), n_folds=10);

    for train_index, validation_index in kf:

        X_train, X_validation = scaled_features[train_index], scaled_features[validation_index];
        Y_train, Y_validation = label[train_index], label[validation_index];

        #estimator               = SVR(**params)
        #estimator               = RandomForestRegressor(**params)
        estimator                = GradientBoostingRegressor(**params)

        estimator.fit(X_train, Y_train);

        current_rmse             = calculate_RMSE(estimator, X_validation, Y_validation);

        total_rmse              += current_rmse;
        count                   += 1;

    #Average across all samples
    avg_current_rmse   = total_rmse / float(count);
    print("Avg Current RMSE " + str(avg_current_rmse));

    return  (params, avg_current_rmse);
def check_boston(presort, loss, subsample):
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    ones = np.ones(len(boston.target))
    last_y_pred = None
    for sample_weight in None, ones, 2 * ones:
        clf = GradientBoostingRegressor(n_estimators=100,
                                        loss=loss,
                                        max_depth=4,
                                        subsample=subsample,
                                        min_samples_split=2,
                                        random_state=1,
                                        presort=presort)

        assert_raises(ValueError, clf.predict, boston.data)
        clf.fit(boston.data, boston.target,
                sample_weight=sample_weight)
        leaves = clf.apply(boston.data)
        assert_equal(leaves.shape, (506, 100))

        y_pred = clf.predict(boston.data)
        mse = mean_squared_error(boston.target, y_pred)
        assert_less(mse, 6.0)

        if last_y_pred is not None:
            assert_array_almost_equal(last_y_pred, y_pred)

        last_y_pred = y_pred
def test_feature_importance_regression():
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing()
    X, y = california.data, california.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    reg = GradientBoostingRegressor(loss='huber', learning_rate=0.1,
                                    max_leaf_nodes=6, n_estimators=100,
                                    random_state=0)
    reg.fit(X_train, y_train)
    sorted_idx = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[s] for s in sorted_idx]

    # The most important feature is the median income by far.
    assert sorted_features[0] == 'MedInc'

    # The three subsequent features are the following. Their relative ordering
    # might change a bit depending on the randomness of the trees and the
    # train / test split.
    assert set(sorted_features[1:4]) == {'Longitude', 'AveOccup', 'Latitude'}
def test_gradient_boosting_early_stopping():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=1000,
                                     n_iter_no_change=10,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)

    gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    # Check if early_stopping works as expected
    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24), (gbr, 1e-1, 13),
                                              (gbc, 1e-3, 36),
                                              (gbr, 1e-3, 28)):
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                     max_depth=3, random_state=42)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                    max_depth=3, random_state=42)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
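
As a minimal usage sketch of the early-stopping behaviour exercised above (dataset and parameter values are assumptions):

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_regression(n_samples=1000, random_state=0)
gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                validation_fraction=0.1, random_state=42)
gbr.fit(X, y)
# Training stops once the held-out validation score stops improving for 10 iterations
print(gbr.n_estimators_)
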
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increase monotonically with n_iter_no_change
    # Set validation
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
def test_plot_partial_dependence():
    # Test partial dependence plot function.
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)],
                                       grid_resolution=grid_resolution,
                                       feature_names=boston.feature_names)
    assert len(axs) == 3
    assert all(ax.has_data for ax in axs)

    # check with str features and array feature names
    fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN',
                                                          ('CRIM', 'ZN')],
                                       grid_resolution=grid_resolution,
                                       feature_names=boston.feature_names)

    assert len(axs) == 3
    assert all(ax.has_data for ax in axs)

    # check with list feature_names
    feature_names = boston.feature_names.tolist()
    fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN',
                                                          ('CRIM', 'ZN')],
                                       grid_resolution=grid_resolution,
                                       feature_names=feature_names)
    assert len(axs) == 3
    assert all(ax.has_data for ax in axs)
def gbm_fit(params, cv_folds):
    gbm = GradientBoostingRegressor(**params)
    gbm.fit(x_train, y_train)

    # Check accuracy of model
    # No need for validation data because of cross validation
    # Training data is split up into cv_folds folds:
    # Model trained on (cv_folds - 1) of the folds; last fold is saved as validation set
    cv_scores_mse = cross_validation.cross_val_score(gbm, x_train, y_train, cv=cv_folds, scoring='mean_squared_error')
    print '\nModel Report'
    print ('MSE Score: Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g' %
          (np.mean(cv_scores_mse), np.std(cv_scores_mse), np.min(cv_scores_mse), np.max(cv_scores_mse)))
    feat_imp = pd.Series(gbm.feature_importances_, features).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()

    # Check actual performance on test data
    final_predictions = gbm.predict(x_test)
    test['health_score_in_week'] = final_predictions
    test.to_csv(output_file, columns=['user_id', 'date', 'steps', 'total_sleep', 'resting_hr',
                                      'step_week_slope', 'sleep_week_slope', 'hr_week_slope',
                                      'curr_health_score', 'health_score_in_week'])

    # Save the model to file 'health_prediction.pkl'
    joblib.dump(gbm, 'health_prediction.pkl', compress=1)
Example #11
def test_boston():
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    for loss in ("ls", "lad", "huber"):
        for subsample in (1.0, 0.5):
            last_y_pred = None
            for i, sample_weight in enumerate(
                    (None, np.ones(len(boston.target)),
                     2 * np.ones(len(boston.target)))):
                clf = GradientBoostingRegressor(n_estimators=100, loss=loss,
                                                max_depth=4, subsample=subsample,
                                                min_samples_split=1,
                                                random_state=1)

                assert_raises(ValueError, clf.predict, boston.data)
                clf.fit(boston.data, boston.target,
                        sample_weight=sample_weight)
                y_pred = clf.predict(boston.data)
                mse = mean_squared_error(boston.target, y_pred)
                assert mse < 6.0, "Failed with loss %s and " \
                    "mse = %.4f" % (loss, mse)

                if last_y_pred is not None:
                    np.testing.assert_array_almost_equal(
                        last_y_pred, y_pred,
                        err_msg='pred_%d doesnt match last pred_%d for loss %r and subsample %r. '
                        % (i, i - 1, loss, subsample))

                last_y_pred = y_pred
Example #12
def gbdt_model(trains):

    trains = np.array(trains)

    gbdt=GradientBoostingRegressor(
      loss='ls',
      learning_rate=0.1,
      n_estimators=100,
      subsample=1,
      min_samples_split=2,
      min_samples_leaf=1,
      max_depth=3,
      init=None,
      random_state=None,
      max_features=None,
      alpha=0.9,
      verbose=0,
      max_leaf_nodes=None,
      warm_start=False
    )

#     pdb.set_trace()
    train_set = trains[:, :-1]
    label_set = trains[:, -1]

    gbdt.fit(train_set, label_set)
    return gbdt
def pipeline():
        val = data[data.watch==0]
        val_a_b = val[['item_id','store_code','a','b']]
        val_x = val.drop(['label','watch','item_id','store_code','a','b'],axis=1)

        train = data[data.watch!=0]
        train_y = train.label

        
        a = list(train.a)
        b = list(train.b)
        train_weight = []
        for i in range(len(a)):
            train_weight.append(min(a[i],b[i]))
        train_weight = np.array(train_weight)

        train_x = train.drop(['label','watch','item_id','store_code','a','b'],axis=1)

        train_x.fillna(train_x.median(),inplace=True)
        val_x.fillna(val_x.median(),inplace=True)
        

        model = GradientBoostingRegressor(loss='lad',learning_rate=0.01,n_estimators=400,subsample=0.75,max_depth=6,random_state=1024, max_features=0.75)

        # train
        model.fit(train_x, train_y, sample_weight=train_weight)

        # predict val set
        val_a_b['pred'] = model.predict(val_x)
        val_a_b.to_csv('gbrt_3.csv',index=None)
def gradient_boosting(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
	#kernel: linear, poly, rbf, sigmoid, precomputed

	rows = 0
	while rows_temp > 0:
		rows = rows + 1
		rows_temp = rows_temp - 1

	columns = 0
	while columns_temp > 0:
		columns = columns + 1
		columns_temp = columns_temp - 1

	features_values = [x for x in features_values_temp]
	prediction_values = [y for y in prediction_values_temp]



	rotated = convert_list_to_matrix(features_values, rows, columns)
	scores = np.array(prediction_values)

	threshold = float(threshold)

	estimator = SVR(kernel=kernel) # try to change to the model for which the test is gonna run (lasso, ridge, etc.)

	X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
	X_train, X_test = X[:200], X[200:]
	y_train, y_test = y[:200], y[200:]
	est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls').fit(X_train, y_train)
	mean_squared_error(y_test, est.predict(X_test))
Example #15
def grid_search():
    results_list_of_tuples = list()
    num_folds = 3
    best_result = tuple()
    for item1 in gd_grid['learning_rate']:
        for item2 in gd_grid['max_depth']:
            for item3 in gd_grid['min_samples_leaf']:
                for item4 in gd_grid['n_estimators']:
                    for item5 in gd_grid['random_state']:
                        instance = 'LR {}, max_depth {}, min_samp_leaf {}, n_est {}, rs {}'.format(
                            item1, item2, item3, item4, item5)
                        print instance
                        gbrt = GradientBoostingRegressor(random_state=item5,
                                                         n_estimators=item4,
                                                         min_samples_leaf=item3,
                                                         max_depth=item2,
                                                         learning_rate=item1)
                        kf = KFold(X.shape[0], n_folds=num_folds)
                        mse_list = []
                        for train_index, test_index in kf:
                            X_train, X_test = X[train_index], X[test_index]
                            y_train, y_test = y[train_index], y[test_index]
                            w_train, w_test = weights[train_index], weights[test_index]
                            gbrt.fit(X_train, y_train, w_train)                 
                            y_pred = gbrt.predict(X_test)
                            mse = mean_squared_error(y_test, y_pred, sample_weight=w_test)
                            mse_list.append(mse)

                        kf_mse = np.mean(np.array(mse_list))
                        results_list_of_tuples.append((instance, kf_mse))
                        
    return results_list_of_tuples
def GBRModel(X_train,X_cv,y_train,y_cv):
	targets = get_target_array()
	#print len(train_features)
	#print train_features[0]

	#print len(test_features)
	n_estimators = [50, 100]#, 1500, 5000]
	max_depth = [3,8]
	

	best_GBR = None
	best_mse = float('inf')
	best_score = -float('inf')

	print "################# Performing Gradient Boosting Regression ####################### \n\n\n\n"
	for estm in n_estimators:
		for cur_depth in max_depth:
			#random_forest = RandomForestRegressor(n_estimators=estm)
			regr_GBR = GradientBoostingRegressor(n_estimators=estm, max_depth= cur_depth)
			predictor = regr_GBR.fit(X_train,y_train)
			score = regr_GBR.score(X_cv,y_cv)
			mse = np.mean((regr_GBR.predict(X_cv) - y_cv) **2)
			print "Number of estimators used: ",estm
			print "Tree depth used: ",cur_depth
			print "Residual sum of squares: %.2f "%mse
			print "Variance score: %.2f \n"%score
			if best_score <= score:
				if best_mse > mse:
					best_mse = mse
					best_score = score
					best_GBR = predictor	
	print "\nBest score: ",best_score
	print "Best mse: ",best_mse
	return best_GBR
Example #17
def gradient_boosting_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: train
    :param train_y: target values
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return:the predicted values,learning curve, validation curve
    """
    gbr = GradientBoostingRegressor(n_estimators=200, max_depth=7, random_state=7)
    if get_model:
        print "Fitting GBR..."
        gbr.fit(train_x, np.log(train_y+1))
        gbr_pred = np.exp(gbr.predict(pred_x))- 1
        # deal with negative predictions by clamping them to zero
        for i in range(len(gbr_pred)):
            if gbr_pred[i] < 0:
                gbr_pred[i] = 0
        Votes = gbr_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_gbr = np.concatenate((Id,Votes),axis=1)
        np.savetxt("submission_gbr.csv", submission_gbr,header="Id,Votes", delimiter=',',fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if v_curve:
        print "Working on Validation Curves"
        plot_validation_curve(GradientBoostingRegressor(), "Validation Curve: GBR", train_x, np.log(train_y+1.0),
                              param_name="n_estimators", param_range=[5, 20, 60, 100, 150, 200])
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(GradientBoostingRegressor(), "Learning Curve: GBR", train_x, np.log(train_y+1.0))
    def add_new_weak_learner(self):
        '''
        Summary:
            Adds a new function, h, to self.weak_learners by solving for Eq. 1 using multiple additive regression trees:

            [Eq. 1] h = argmin_h (sum_i Q_A(s_i,a_i) + h(s_i, a_i) - (r_i + max_b Q_A(s'_i, b)))

        '''
        if len(self.most_recent_episode) == 0:
            # If this episode contains no data, don't do anything.
            return

        # Build up data sets of features and loss terms
        data = np.zeros((len(self.most_recent_episode), self.max_state_features + 1))
        total_loss = np.zeros(len(self.most_recent_episode))

        for i, experience in enumerate(self.most_recent_episode):
            # Grab the experience.
            s, a, r, s_prime = experience

            # Pad in case the state features are too short (as in Atari sometimes).
            features = self._pad_features_with_zeros(s, a)
            loss = (r + self.gamma * self.get_max_q_value(s_prime) - self.get_q_value(s, a))
            
            # Add to relevant lists.
            data[i] = features
            total_loss[i] = loss

        # Compute new regressor and add it to the weak learners.
        estimator = GradientBoostingRegressor(loss='ls', n_estimators=1, max_depth=self.max_depth)
        estimator.fit(data, total_loss)
        self.weak_learners.append(estimator)
Example #19
def get_boosting_regressor(x, y, verbose=False):
    """Calculate a GradientBoostingRegressor on predictor and target variables

    Parameters
    ----------
    x : numpy.array
        Predictor variable
    y : numpy.array
        Target variable
    verbose : bool, optional
        If True, output status messages

    Returns
    -------
    classifier : sklearn.ensemble.GradientBoostingRegressor
        A fitted classifier of the predictor and target variable
    """
    if verbose:
        sys.stderr.write('Getting boosting regressor\n')

    clf = GradientBoostingRegressor(n_estimators=50, subsample=0.6,
                                    max_features=100,
                                    verbose=0, learning_rate=0.1,
                                    random_state=0).fit(x, y)

    clf.feature_importances = pd.Series(clf.feature_importances_,
                                        index=x.columns)
    if verbose:
        sys.stderr.write('Finished boosting regressor\n')

    return clf
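
A minimal usage sketch for the helper above (the toy DataFrame is an assumption; note that max_features=100 requires at least 100 predictor columns):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
x = pd.DataFrame(rng.rand(500, 120))          # >= 100 columns because of max_features=100
y = x.iloc[:, 0] * 2.0 + rng.rand(500)
clf = get_boosting_regressor(x, y, verbose=True)
print(clf.feature_importances.sort_values(ascending=False).head())
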
Example #20
File: anm.py Project: sibelius/CauseEffect
def anm_fit( (x, y) ):
  newX = np.array(x).reshape(len(x), 1)
  clf = GradientBoostingRegressor()
  clf.fit(newX, y)
  err = y - clf.predict(newX)
  ret =  [clf.score(newX, y)] + list(pearsonr(x, err))
  return ret
Example #21
def train_and_score(i):
	global X_train
	global X_test 
	global Y_train
	global dist_train
	global dist_test
	
	# GBR performed best but we experimented with other models as well (see the paper)
	cl = GradientBoostingRegressor(n_estimators=100, loss='ls', learning_rate=0.1)

	# we add user distance from i-th branch (for which we do prediction) to train set
	dist_from_target_branch_train = dist_train[:,i].reshape((len(dist_train[:,i]),1))  # dist from i-th branch
	X_train = np.hstack((X_train, dist_from_target_branch_train))
	# we add mean user activity distance from i-th branch (for which we do prediction) to train set
	ab_dist_train = act_branch_dist_train[:,i].reshape((len(act_branch_dist_train[:,i]),1))  # dist from i-th branch
	X_train = np.hstack((X_train, ab_dist_train))

	# we also experimented with Standard Scaler, without much success
	# mmscaler_train = StandardScaler()
	# X_train = mmscaler_train.fit_transform(X_train)

	cl.fit(X_train,Y_train[:,i])

	# same as above for test set
	dist_from_target_branch_test = dist_test[:,i].reshape((len(dist_test[:,i]),1))  # dist from i-th branch
	X_test = np.hstack((X_test, dist_from_target_branch_test))
	ab_dist_test = act_branch_dist_test[:,i].reshape((len(act_branch_dist_test[:,i]),1))  # dist from i-th branch
	X_test = np.hstack((X_test, ab_dist_test))

	# mmscaler_test = StandardScaler()
	# X_test = mmscaler_test.fit_transform(X_test)

	return cl.predict(X_test)
Example #22
File: supra.py Project: TimSC/supra
class SupraAxis():
	def __init__(self, axisXIn = 1., axisYIn = 0.):
		self.reg = None
		self.x = axisXIn
		self.y = axisYIn
	
	def PrepareModel(self, features, offsets):
		if self.reg is not None:
			return 0

		self.reg = GradientBoostingRegressor()

		offsets = np.array(offsets)
		labels = offsets[:,0] * self.x + offsets[:,1] * self.y

		if not np.all(np.isfinite(labels)):
			raise Exception("Training labels contains non-finite value(s), either NaN or infinite")

		self.reg.fit(features, labels)

	def IsModelReady(self):
		return self.reg is not None

	def ClearModel(self):
		self.reg = None
	
	def GetFeatureImportance(self):
		return self.reg.feature_importances_
Example #23
File: ca_models.py Project: pkravik/kaggle
def CaGBMModel(X_train, Y_train, X_test, Y_test, cv_iterator):
    
    #===========================================================================
    # modelCV = GradientBoostingRegressor(subsample = 1, random_state = 42)
    # param_grid = {'loss':['ls'],
    #               'learning_rate':[0.1],
    #               'n_estimators':[100],
    #               'max_depth':[5, 50, 150],
    #               'min_samples_split':[2],
    #               'min_samples_leaf':[5, 15, 30],
    #               'max_features':["auto"]
    #               }
    # 
    # search = GridSearchCV(modelCV, param_grid, scoring="mean_squared_error", cv=cv_iterator, n_jobs = -1)
    # search.fit(X_train, Y_train["P"])
    # search.grid_scores_
    # model = search.best_estimator_
    # mse = search.best_score_
    # print (time.strftime("%H:%M:%S"))
    #===========================================================================
    gbm = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, max_depth=50, min_samples_leaf=20, max_features=None, random_state=76)
    gbm.fit(X_train, Y_train["Ca"])
    
    yhat_gbm = gbm.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Ca"], yhat_gbm))

    return gbm, test_error
Example #24
    def build_models(self):

        self.remove_columns(
            [
                "institute_latitude",
                "institute_longitude",
                "institute_state",
                "institute_country",
                "var10",
                "var11",
                "var12",
                "var13",
                "var14",
                "var15",
                "instructor_past_performance",
                "instructor_association_industry_expert",
                "secondary_area",
                "var24",
            ]
        )

        model1 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, subsample=0.8)
        model2 = RandomForestRegressor(n_estimators=50)
        model3 = ExtraTreesRegressor(n_estimators=50)

        model1.fit(self.X, self.y)
        model2.fit(self.X, self.y)
        model3.fit(self.X, self.y)

        return [model1, model2, model3]
Example #25
def fit(filename, treename, inputsname, targetname, workingpoint=0.9, test=False):
    # Reading inputs and targets
    ninputs = len(inputsname)
    branches = copy.deepcopy(inputsname)
    branches.append(targetname)
    data = root2array(filename, treename=treename, branches=branches)
    data = data.view((np.float64, len(data.dtype.names)))
    # Extract and format inputs and targets from numpy array
    inputs = data[:, range(ninputs)].astype(np.float32)
    targets = data[:, [ninputs]].astype(np.float32).ravel()
    # if test requested, use 60% of events for training and 40% for testing
    inputs_train = inputs
    targets_train = targets
    if test:
        inputs_train, inputs_test, targets_train, targets_test = cross_validation.train_test_split(inputs, targets, test_size=0.4, random_state=0)
    # Define and fit quantile regression (quantile = workingpoint)
    # Default training parameters are used
    regressor = GradientBoostingRegressor(loss='quantile', alpha=workingpoint)
    regressor.fit(inputs_train, targets_train)
    if test:
        # Compare regression prediction with the true value and count the fraction of time it falls below
        # This should give the working point value
        predict_test = regressor.predict(inputs_test)
        compare = np.less(targets_test, predict_test)
        print 'Testing regression with inputs', inputsname, 'and working point', workingpoint
        print '    Test efficiency =', float(list(compare).count(True))/float(len(compare))
        # TODO: add 1D efficiency graphs vs input variables
    return regressor
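
To illustrate the quantile-loss idea used above, a small self-contained sketch on synthetic data (dataset and parameters are assumptions) that checks the empirical coverage of the predicted quantile:

import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

workingpoint = 0.9
X, y = make_friedman1(n_samples=2000, noise=1.0, random_state=0)
reg = GradientBoostingRegressor(loss='quantile', alpha=workingpoint, random_state=0)
reg.fit(X, y)
# The fraction of targets below the predicted quantile should be close to workingpoint
print(np.mean(y < reg.predict(X)))
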
Example #26
    def fit(self,data_train,target):
        self.target_train = target
        self.catcol = data_train.filter(like='var').columns.tolist()
        #start_gbr_tr = time.clock()
        self.gbr = GradientBoostingRegressor(n_estimators =self.nest,max_depth=7)
        self.gbr.fit(data_train,self.target_train)
        self.transformed_train_gbr = self.gbr.transform(data_train,threshold="0.35*mean")
        self.gbr_tr_fit = GradientBoostingRegressor(n_estimators =self.nest,max_depth=7)
        self.gbr_tr_fit.fit(self.transformed_train_gbr,self.target_train)
        #end_gbr_tr = time.clock()
        #print >> log, "time_gbr_tr = ", end_gbr_tr-start_gbr_tr

        #start_xfr_tr = time.clock()
        self.xfr= ExtraTreesRegressor(n_estimators =self.nest,max_depth=7)
        self.xfr.fit(data_train,self.target_train)
        self.transformed_train_xfr = self.xfr.transform(data_train,threshold="0.35*mean")
        self.xfr_tr_fit = ExtraTreesRegressor(n_estimators =self.nest,max_depth=7)
        self.xfr_tr_fit.fit(self.transformed_train_xfr,self.target_train)
        #end_xfr_tr = time.clock()
        #print >> log, "time_xfr_tr = ", end_xfr_tr-start_xfr_tr

        #start_gbr_cat = time.clock()
        self.gbr_cat_fit = GradientBoostingRegressor(n_estimators =self.nest,max_depth=7)
        self.gbr_cat_fit.fit(data_train[self.catcol],self.target_train)
        #end_gbr_cat = time.clock()
        #print >> log, "time_gbr_cat = ", end_gbr_cat-start_gbr_cat

        #start_xfr_cat = time.clock()
        self.xfr_cat_fit = ExtraTreesRegressor(n_estimators =self.nest,max_depth=7)
        self.xfr_cat_fit.fit(data_train[self.catcol],self.target_train)
        #end_xfr_cat = time.clock()
        #print >> log, "time_xfr_cat = ", end_xfr_cat-start_xfr_cat
        return self
Example #27
def modelTheData(data,target):

#    params = {'n_estimators': 400, 'max_depth': 4, 'min_samples_split': 2,
#          'subsample': 0.5,'min_samples_leaf': 2,
#          'learning_rate': 0.01, 'loss': 'ls'}


#beijing
    myMachine = GradientBoostingRegressor(alpha=0.9, init=None,
             learning_rate=0.05, loss='ls', max_depth=1, max_features=None,
             min_samples_leaf=2, min_samples_split=2, n_estimators=300,
             random_state=None, subsample=0.5, verbose=0)

#shanghai
#    myMachine = GradientBoostingRegressor(alpha=0.9, init=None, learn_rate=None,
#             learning_rate=0.05, loss='ls', max_depth=3, max_features=None,
#             min_samples_leaf=2, min_samples_split=2, n_estimators=500,
#             random_state=None, subsample=0.5, verbose=0)





#    myMachine = GradientBoostingRegressor(**params)
    myMachine.fit(data,target)

    return myMachine
Example #28
File: ca_models.py Project: pkravik/kaggle
def testingGBM(X_train, Y_train, X_test, Y_test):
    params = {'verbose':2, 'n_estimators':100, 'max_depth':50, 'min_samples_leaf':20, 'learning_rate':0.1, 'loss':'ls', 'max_features':None}
    test_init = Ridge(alpha = 0.1, normalize = True, fit_intercept=True)
    gbm2 = GradientBoostingRegressor(**params)
    gbm2.fit(X_train, Y_train["Ca"])
    yhat_gbm = gbm2.predict(X_test)
    mean_squared_error(Y_test["Ca"], yhat_gbm)
    math.sqrt(mean_squared_error(Y_test["Ca"], yhat_gbm))
    
    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    
    for i, y_pred in enumerate(gbm2.staged_decision_function(X_test)):
        test_score[i]=mean_squared_error(Y_test["Ca"], y_pred)
    
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, gbm2.train_score_, 'b-',
             label='Training Set Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
             label='Test Set Deviance')
    
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')
    plt.show()
Example #29
File: predict.py Project: pthaike/comp
def gbdrtrain(x, y, pre_x):
	x, pre_x = datscater(x, pre_x)
	clf = GradientBoostingRegressor(n_estimators=740, min_samples_leaf = 0.8, min_samples_split = 40, learning_rate=0.1,max_depth=7, random_state=400, loss='huber').fit(x, y)
	# clf = GradientBoostingRegressor(n_estimators=200,max_leaf_nodes =20, learning_rate=0.1,max_depth=6, random_state=400, loss='ls').fit(x, y)

	pred = clf.predict(pre_x)
	return pred
def impute(df,imp_val,headers):
	if np.isnan(imp_val):
		imp_val = -500	
	log("imputing...",1)

	model = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
	data = np.array(df[headers].get_values())
	data[np.isnan(data)] = -500

	for col in range(0,len(headers)):
		#print "Working on column: "+str(col)
		##for the current column, remove rows where the current (row,column) value is not equal to zero
		##this way we are only training on data with non-zero target values
		reduced_data = data[np.logical_not(data[:,col] == imp_val)] #remove row if row,col_num value is zero
		target_set = reduced_data[:,col]
		training_set = np.delete(reduced_data,col,1)
		model.fit(training_set,target_set)
		row_num=0
		for row in data:
			remaining = np.delete(row,col,0)
			if data[row_num,col] == imp_val:
				data[row_num, col] = model.predict(remaining.reshape(1, -1))[0]
			row_num+=1
	cntr=0
	for h in headers:
		df[h] = data[:,cntr];cntr+=1
	return df
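
A minimal usage sketch for the imputer above (the toy DataFrame and the logging stub are assumptions; the snippet itself targets an older scikit-learn release because of presort='auto'):

import numpy as np
import pandas as pd

def log(msg, level):
    # Stand-in stub for the logging helper the snippet relies on
    print(msg)

df = pd.DataFrame({'a': [1.0, 2.0, np.nan, 4.0],
                   'b': [10.0, np.nan, 30.0, 40.0],
                   'c': [5.0, 6.0, 7.0, 8.0]})
df = impute(df, np.nan, ['a', 'b', 'c'])
print(df)
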
Example #31
    df = df.drop('day', axis=1)
    df['is_Holiday'] = df['month'].apply(
        lambda x: 1 if x in ['Apr', 'May', 'Jun', 'Nov'] else 0)
    df = df.drop('month', axis=1)
    df = df.drop(['title', 'cast'], axis=1)
    df = pd.get_dummies(df, prefix='is')  #Quantify all is_ columns!!
    df['vote_average'] = df['vote_average'].fillna(df['vote_average'].mean())
    return df


X, Y = rgf.drop('revenue', axis=1), rgf['revenue']
X = regression_engineering(X)
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, train_size=0.75,
    test_size=0.25)  #randomly separating training and test set
reg = GradientBoostingRegressor()
reg.fit(train_X, train_Y)  #Train regressor model
print('Regressor Score: ', reg.score(test_X, test_Y))

#Compare with dummy regressor!!
dummy = DummyRegressor()
dummy.fit(train_X, train_Y)
print('Dummy Regressor Score: ', dummy.score(test_X, test_Y))

sns.set_style('whitegrid')
plt.figure(figsize=(12, 14))
sns.barplot(x=reg.feature_importances_, y=X.columns)
plt.savefig('regressor.png')

#Classification: Predicting Movie Sucess
cls = movies_df[movies_df['return'].notnull()]
def test_zero_estimator_reg():
    # Test if ZeroEstimator works for regression.
    est = GradientBoostingRegressor(n_estimators=20,
                                    max_depth=1,
                                    random_state=1,
                                    init=ZeroEstimator())
    est.fit(boston.data, boston.target)
    y_pred = est.predict(boston.data)
    mse = mean_squared_error(boston.target, y_pred)
    assert_almost_equal(mse, 33.0, decimal=0)

    est = GradientBoostingRegressor(n_estimators=20,
                                    max_depth=1,
                                    random_state=1,
                                    init='zero')
    est.fit(boston.data, boston.target)
    y_pred = est.predict(boston.data)
    mse = mean_squared_error(boston.target, y_pred)
    assert_almost_equal(mse, 33.0, decimal=0)

    est = GradientBoostingRegressor(n_estimators=20,
                                    max_depth=1,
                                    random_state=1,
                                    init='foobar')
    assert_raises(ValueError, est.fit, boston.data, boston.target)
Example #33
bg.score(X_test, y_test)

#Adaboosting
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

#Decision
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

#gradientBoost
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
gb.score(X_test, y_test)

#KNN
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn = KNeighborsRegressor(algorithm='brute')
knn.fit(X_train, y_train)
knn.score(X_train, y_train)
knn.score(X_test, y_test)

#votingRegressor
from sklearn.ensemble import VotingRegressor
reg1 = GradientBoostingRegressor()
Example #34
from encode import create_df_cate_to_numeric
from init import split_train_test
import pandas as pd
import numpy as np
import datetime
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

train_df = pd.read_csv('./data/train.csv')
n_train_df = create_df_cate_to_numeric(train_df)
n_train_df.fillna(0, inplace=True)
X_train, X_test, y_train, y_test = split_train_test(n_train_df, test_size=0.01)

y_label = 'SalePrice'
df_y = n_train_df[y_label]
df_X = n_train_df.drop(y_label, axis=1)

grbt = GradientBoostingRegressor(n_estimators=1500, learning_rate=0.5)
grbt.fit(X_train, y_train)

test_df = pd.read_csv('./data/test.csv')
n_test_df = create_df_cate_to_numeric(test_df)
n_test_df.fillna(0, inplace=True)
y_pred = grbt.predict(n_test_df)
result = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': y_pred})
now = datetime.datetime.today().strftime('%Y-%m-%d_%H:%M')
file_name = './submissions/result' + now + '.csv'
result.to_csv(file_name, index=False)
Example #35
                             lambda_2=1e-06,
                             n_iter=30,
                             normalize=False,
                             tol=0.0000001,
                             verbose=True)

    myGBR = GradientBoostingRegressor(alpha=0.9,
                                      criterion='friedman_mse',
                                      init=None,
                                      learning_rate=0.01,
                                      loss='huber',
                                      max_depth=14,
                                      max_features='sqrt',
                                      max_leaf_nodes=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=10,
                                      min_samples_split=40,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=300,
                                      presort='auto',
                                      random_state=10,
                                      subsample=0.8,
                                      verbose=0,
                                      warm_start=False)

    RF_model = RandomForestRegressor(n_estimators=50,
                                     max_depth=25,
                                     min_samples_split=20,
                                     min_samples_leaf=10,
                                     max_features='sqrt',
Example #36
    terms_to_sum = [(np.log(y_pred[i] + 1) - np.log(y[i] + 1))**2.0
                    for i, pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0 / len(y)))**0.5


if not os.path.exists("data.dat"):
    data = pd.read_csv("D:\userdata\\bellas\\Downloads\\train.csv", header=0)
    data.set_index(["ID"], inplace=True)
    data.to_pickle("data.dat")
else:
    data = pd.read_pickle("data.dat")

if not os.path.exists("model.dat"):
    X = data.drop(["target"], axis=1)
    targets = data.target
    gb = GradientBoostingRegressor()
    gb.fit(X, targets)
    with open("model.dat", "wb") as f:
        pickle.dump(gb, f)
else:
    with open("model.dat", "rb") as f:
        gb = pickle.load(f)

if not os.path.exists("test_data.dat"):
    test_data = pd.read_csv("D:\userdata\\bellas\\Downloads\\test.csv",
                            header=0)
    test_data.set_index(["ID"], inplace=True)
    test_data.to_pickle("test_data.dat")
else:
    test_data = pd.read_pickle("test_data.dat")
Example #37
y = np.log1p(df['OBO']).values
X = df.iloc[:, 4:].values

# transformation
#rb = RobustScaler()
#X = rb.fit_transform(X)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1234)

# gb ==========================================================================
mod_gb = GradientBoostingRegressor(random_state=1337)
mod_gb.fit(X_train, y_train)

# Predicting the Test set results
y_pred = mod_gb.predict(X_test)
mape_gb = np.mean(
    np.abs((np.expm1(y_test) - np.expm1(y_pred)) / np.expm1(y_test))) * 100
#Print model report:
print("\GB nModel Report")
print("MAPE : %.2f" % mape_gb)


# hyperparameters tuning
def my_scorer(y_true, y_pred):
    mape = np.mean(
        np.abs((np.expm1(y_true) - np.expm1(y_pred)) / np.expm1(y_true))) * 100
Example #38
def Regression(train_data, train_solution, test_data, test_solution, method):
    ## Fix Data Structure ##
    train_data = train_data.values
    train_solution = train_solution.values
    test_data = test_data.values
    test_solution = test_solution.values

    ## List of Method Options with Initialization ##
    if method == 'lin_reg':  # linear regression
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression()
    elif method == 'ply_reg':  # polynomial regression
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression()
        poly_features = PolynomialFeatures(degree=2)
    elif method == 'rdg_reg':  # ridge regression
        from sklearn.linear_model import Ridge
        reg = Ridge()
    elif method == 'lso_reg':  # lasso regression
        from sklearn.linear_model import Lasso
        reg = Lasso(alpha=0.00001)
    elif method == 'ela_net':  # elastic net regression
        from sklearn.linear_model import ElasticNet
        reg = ElasticNet()
    elif method == 'svr_lin':  # SVM regression
        from sklearn.svm import LinearSVR
        reg = LinearSVR(epsilon=0.01, max_iter=10000)
    elif method == 'svr_2nd':  # SVR regression
        from sklearn.svm import SVR
        reg = SVR(kernel='poly', degree=2, epsilon=0.01)  #C=100
    elif method == 'svr_3rd':  # SVR regression
        from sklearn.svm import SVR
        reg = SVR(kernel='poly', degree=3, epsilon=0.01)  #C=100
    elif method == 'dcn_tre':  # decision tree
        from sklearn.tree import DecisionTreeRegressor
        reg = DecisionTreeRegressor()
    elif method == 'rdm_for':  # random forests
        from sklearn.ensemble import RandomForestRegressor
        reg = RandomForestRegressor(n_estimators=100, random_state=3)
    elif method == 'ada_bst':  # AdaBoost Regressor
        from sklearn.ensemble import AdaBoostRegressor
        reg = AdaBoostRegressor(n_estimators=100, random_state=3)
    elif method == 'grd_bst':  # Gradient Boosting Regressor
        from sklearn.ensemble import GradientBoostingRegressor
        reg = GradientBoostingRegressor(random_state=3)
    elif method == 'gss_prc':  # Gaussian Process Regressor
        from sklearn.gaussian_process import GaussianProcessRegressor
        reg = GaussianProcessRegressor(random_state=3)
    elif method == 'knl_rdg':  # Kernel Ridge Regression
        from sklearn.kernel_ridge import KernelRidge
        reg = KernelRidge()
    elif method == 'nst_nbr_uni':  # K Nearest Neighbors Regressor
        from sklearn.neighbors import KNeighborsRegressor
        reg = KNeighborsRegressor(weights='uniform')
    elif method == 'nst_nbr_dst':  # K Nearest Neighbors Regressor
        from sklearn.neighbors import KNeighborsRegressor
        reg = KNeighborsRegressor(weights='distance')
    elif method == 'rad_nbr_uni':  # Radius Neighbor Regressor
        from sklearn.neighbors import RadiusNeighborsRegressor
        reg = RadiusNeighborsRegressor(weights='uniform')
    elif method == 'rad_nbr_dst':  # Radius Neighbor Regressor
        from sklearn.neighbors import RadiusNeighborsRegressor
        reg = RadiusNeighborsRegressor(weights='distance')
    elif method == 'mlp_reg':
        from sklearn.neural_network import MLPRegressor
        reg = MLPRegressor(random_state=3)
    else:
        print(
            'Error: Regression method not recognized.\nPlease pick a valid method key (example: xxx_xxx).'
        )

    ## Preprocessing and Setup ##
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)  # scale test data with the scaler fitted on the training data
    solution = train_solution.reshape(-1, )
    if method == 'ply_reg':
        data = poly_features.fit_transform(data)
    reg.fit(data, solution)

    if len(test_data) < 5:
        predictions = reg.predict(data)

    elif len(test_data) > 5:
        if method == 'ply_reg':
            test_data = poly_features.transform(test_data)
        test_solution = test_solution.reshape(-1, )
        predictions_test = reg.predict(test_data)
        solution = test_solution
        predictions = predictions_test

    else:
        print('Error: test_set undetermined.')

    Matrix_to_save = pd.DataFrame()
    Matrix_to_save['Solution'] = solution
    Matrix_to_save['Predictions'] = predictions

    return Matrix_to_save
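
A minimal call sketch for the dispatcher above (the DataFrames are assumptions; any of the listed method keys can be passed in place of 'grd_bst'):

import pandas as pd
from sklearn.datasets import make_friedman1

X, y = make_friedman1(n_samples=300, random_state=0)
train_data, test_data = pd.DataFrame(X[:200]), pd.DataFrame(X[200:])
train_solution, test_solution = pd.DataFrame(y[:200]), pd.DataFrame(y[200:])
results = Regression(train_data, train_solution, test_data, test_solution, method='grd_bst')
print(results.head())
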
Example #39
df = pd.read_csv('ml_house_data_set.csv')
del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']

features_df = pd.get_dummies(df, columns=['garage_type', 'city'])

del features_df['sale_price']

X = features_df.as_matrix()
y = df['sale_price'].as_matrix()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = GradientBoostingRegressor(n_estimators=1000,
                                  learning_rate=0.1,
                                  max_depth=6,
                                  min_samples_leaf=9,
                                  max_features=0.1,
                                  loss='huber')

model.fit(X_train, y_train)

# joblib.dump(model, 'trained_house_classifier_model.pkl')

mse = mean_absolute_error(y_train, model.predict(X_train))
print 'Training set mean absolute error: %.4f' % mse
mse = mean_absolute_error(y_test, model.predict(X_test))
print 'Test set mean absolute error: %.4f' % mse
def test_regression_synthetic():
    # Test on synthetic regression datasets used in Leo Breiman,
    # "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(1)
    regression_params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.1,
        'loss': 'ls'
    }

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state,
                                   noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        clf = GradientBoostingRegressor(presort=presort)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 5.0)

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 1700.0)

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 0.015)
Example #41
# 4
# ==============================================================================




train.index = range(train.shape[0])
test.index = range(test.shape[0])
pred.index = range(pred.shape[0])

train1 = test[(test.record_date >= '20160401') & (test.record_date < '20160501')]
train = pd.concat([train, train1])
test = test[(test.record_date >= '20160501') & (test.record_date < '20160701')]

featurelist = [i for i in train.columns if i not in df.columns]
gbdt = GradientBoostingRegressor(random_state=seed)
# gbdt = RandomForestRegressor(random_state = seed)
# gbdt = ExtraTreesRegressor(n_estimators=10,random_state = seed)
# gbdt = lgb.LGBMRegressor(max_depth = 2,learning_rate=0.05,n_estimators=3000,reg_alpha=10)
# gbdt = xgb.XGBRegressor(max_depth = 2,learning_rate=0.1,n_estimators=2000,reg_alpha=5,gamma = 10)
# gbdt = xgb.XGBRegressor(max_depth = 7,learning_rate=0.1,n_estimators=200,reg_alpha=5,gamma = 10)
gbdt = gbdt.fit(train[featurelist], train.power_consumption)
test['power_consumptionbk'] = test['power_consumption']
test.power_consumption = gbdt.predict(test[featurelist])
test.power_consumption = test.power_consumption.astype(int)

pred.power_consumption = gbdt.predict(pred[featurelist])

from matplotlib import pyplot as plt

plt.figure()
#############################################################################################################################################
# parameters : xgb regression ###############################################################################################################
#############################################################################################################################################

randomforest = RandomForestRegressor(n_estimators=10,
                                     max_depth=10,
                                     n_jobs=20,
                                     random_state=2017,
                                     max_features="auto",
                                     verbose=1)
adaboost = AdaBoostRegressor(n_estimators=500,
                             random_state=2017,
                             learning_rate=0.01)
gbdt = GradientBoostingRegressor(n_estimators=500,
                                 learning_rate=0.04,
                                 subsample=0.8,
                                 random_state=2017,
                                 max_depth=5,
                                 verbose=1)
extratree = ExtraTreesRegressor(n_estimators=600,
                                max_depth=8,
                                max_features="auto",
                                n_jobs=20,
                                random_state=2017,
                                verbose=1)
lr_reg = LinearRegression(n_jobs=20)
kNN = KNeighborsRegressor(n_neighbors=10,
                          n_jobs=20)
#############################################################################################################################################
# parameters : regression ###################################################################################################################
Example #43
def run(fold):
    df = pd.read_csv(config.AIRBNB_TRAIN_FOLDS_FILE).drop("id", axis=1)

    attribs = [f for f in df.columns if f not in ["price", "kfold"]]
    cat_attribs = ["zipcode",
                   "neighbourhood_cleansed",
                   "room_type",
                   "bed_type",
                   "cancellation_policy",
                   "security_deposit"]

    num_attribs = [f for f in attribs if f not in cat_attribs]

    df_train, df_valid = df[df.kfold != fold], df[df.kfold == fold]

    X_train, y_train = df_train[attribs].copy(), df_train["price"].copy()
    X_valid, y_valid = df_valid[attribs].copy(), df_valid["price"].copy()

    num_transformer = Pipeline([
        ("num_cleaner", NumAttributesCleaner(num_attribs))])

    cat_transformer = Pipeline([
        ("cat_clener", CatAttributesCleaner(cat_attribs)),
        ("ohe", OneHotEncoder())])

    transformer = ColumnTransformer([
        ("num", num_transformer, num_attribs),
        ("cat", cat_transformer, cat_attribs)])


    X_train = transformer.fit_transform(X_train)
    X_valid = transformer.transform(X_valid)

    #selector = SelectKBest(f_regression, 20)
    #rf = RandomForestRegressor(max_depth=2, random_state=42)
    #selector = SelectFromModel(estimator=rf, max_features=40)
    #X_train = selector.fit_transform(X_train, y_train)
    #X_valid = selector.transform(X_valid)



    model = GradientBoostingRegressor(n_estimators=200,
                                      learning_rate=0.1,
                                      max_depth=7,
                                      loss='ls',
                                      random_state=42)
    model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    valid_preds = model.predict(X_valid)

    train_scores = {"r2_score": metrics.r2_score(y_train, train_preds),
                    "rmse_score": np.sqrt(metrics.mean_squared_error(y_train, train_preds))}

    valid_scores = {"r2_score": metrics.r2_score(y_valid, valid_preds),
                    "rmse_score": np.sqrt(metrics.mean_squared_error(y_valid, valid_preds))}


    print(f"Fold={fold} (train): ",
          f"r2_score = {train_scores['r2_score'].round(2)}", "--- ",
          f"rmse_score = {train_scores['rmse_score'].round(2)}")

    print(f"Fold={fold} (valid): ",
          f"r2_score = {valid_scores['r2_score'].round(2)}", "--- ",
          f"rmse_score = {valid_scores['rmse_score'].round(2)}")
    print("")
    joblib.dump(model, os.path.join(config.MODEL_OUTPUT, f"gb_{fold}.bin"))
print('10-fold Cross Validation: ')
print("Tuned Ridge Parameter: {}".format(gs.best_params_))
print("Tuned Ridge R2: {}".format(r2))
print("Tuned Ridge MSE: {}".format(mse))

plt.plot(y_pred)
plt.plot(y_test_3)
plt.title('Ridge Regression Result on Test Set: PCT 9MO FWD')
plt.legend(['Predict', 'Real'])
plt.show()

#Ensembling by using GradientBoostingRegressor
import time
N = {'n_estimators': [50, 100, 200, 300, 400, 500, 600]}
from sklearn.ensemble import GradientBoostingRegressor
model_4 = GradientBoostingRegressor(max_depth=4, random_state=16)
gb = GridSearchCV(model_4, N, cv=10)
gb.fit(X_train_3, y_train_3)
y_pred = gb.predict(X_test_3)
r2 = gb.score(X_test_3, y_test_3)
mse = mean_squared_error(y_pred, y_test_3)

print('Ensemble by GradientBoostingRegressor: ')
print("GradientBoostingRegressor Parameter: {}".format(gb.best_params_))
print("GradientBoostingRegressor R2: {}".format(r2))
print("GradientBoostingRegressor MSE: {}".format(mse))

plt.plot(y_pred)
plt.plot(y_test_3)
plt.title('GradientBoostingRegressor Result on Test Set: PCT 9MO FWD')
plt.legend(['Predict', 'Real'])
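Since gb above is a fitted GridSearchCV over n_estimators, its cv_results_ shows how the cross-validated R2 varies with the number of trees; a small optional sketch:

n_trees = [p['n_estimators'] for p in gb.cv_results_['params']]
mean_scores = gb.cv_results_['mean_test_score']
plt.figure()
plt.plot(n_trees, mean_scores, marker='o')
plt.xlabel('n_estimators')
plt.ylabel('Mean CV score (R2)')
plt.title('GridSearchCV scores for GradientBoostingRegressor')
plt.show()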
Example #45
0
import pandas as pd  # needed for the DataFrame construction below
from sklearn.ensemble import GradientBoostingRegressor

with open('the_value_from_spark.txt', 'r') as f:
    the_value = int(f.read())

dt = {
    'predictor': {key + 1: key + 1
                  for key in range(the_value * 2)},
    'res': {key + 1: 0
            for key in range(the_value * 2)}
}

dt['res'][the_value] = the_value

df = pd.DataFrame(dt)

X_train = df.loc[:, ['predictor']]
y_train = df.loc[:, 'res']  # 1-D target avoids a DataConversionWarning in fit()
model = GradientBoostingRegressor(n_estimators=2000,
                                  learning_rate=0.1,
                                  loss='ls').fit(X_train, y_train)

X_calc = pd.DataFrame({'predictor': {0: the_value, 1: the_value - 10}})
Y_res = model.predict(X_calc)
res_df = pd.DataFrame(Y_res, columns=['res']).round()

res = int(res_df.iloc[0, 0] - res_df.iloc[1, 0])

with open('the_value_from_ml.txt', 'w') as f:
    f.write(str(res))
Example #46
0
from math import sqrt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from Python.basic.data_preparation import get_data_regression

if __name__ == '__main__':
    x_train, x_test, y_train, y_test = get_data_regression()

    params = {
        'n_estimators': 500,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.01,
        'loss': 'ls'
    }
    regressor = GradientBoostingRegressor(**params)
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_test)

    error_standard_deviation = sqrt(mean_squared_error(y_test, y_pred))  # i.e. the test RMSE
    print(error_standard_deviation)
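One common follow-up to the params block above is to look at how the test error evolves per boosting stage via staged_predict; a minimal sketch reusing regressor, x_test and y_test from this example:

    import numpy as np

    test_errors = [mean_squared_error(y_test, stage_pred)
                   for stage_pred in regressor.staged_predict(x_test)]
    best_stage = int(np.argmin(test_errors)) + 1
    print('lowest test MSE {:.4f} at stage {} of {}'.format(
        min(test_errors), best_stage, params['n_estimators']))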
Example #47
0
regTreeModel = tree.DecisionTreeRegressor(max_features=par[0], max_depth=par[1],
                                          min_samples_split=par[2], min_samples_leaf=par[3],
                                          min_weight_fraction_leaf=par[4], max_leaf_nodes=18)
fitModel = linear_model.LinearRegression()
Yp, Yptrain, regTreeModel, fitModelList, predind = SSRS.RegressionTree(
    X_train, X_test, Y_train, Y_test, regTreeModel, fitModel, Field, doFitSelection=0)
rmse, rmse_band = SSRS.RMSEcal(Yp, Y_test)
rmse_train, rmse_band_train = SSRS.RMSEcal(Yptrain, Y_train)
print(rmse)
print(rmse_train)

# predict correlation
regModel = tree.DecisionTreeRegressor(max_features=0.3, max_depth=20, min_samples_split=3,
                                      min_samples_leaf=5, min_weight_fraction_leaf=0.5)
regModel = GradientBoostingRegressor()  # note: this overwrites the decision tree configured just above
Yp, rmse, rmse_train, rmse_band, rmse_band_train = SSRS.Regression(
    X_train, X_test, Y_train, Y_test, multiband=0, regModel=regModel, doplot=0)
print(rmse)
print(rmse_train)

regModel.fit(X_train, Y_train)
savedir = r"/Volumes/wrgroup/Kuai/USGSCorr/figure_tree/"
savedir = r"Y:\Kuai\USGSCorr\figure_tree\\"
# export_graphviz expects a single decision tree, so export the first tree of the boosted ensemble
with open(savedir + "tree.dot", 'w') as f:
    f = tree.export_graphviz(regModel.estimators_[0, 0],
                             out_file=f,
                             feature_names=Field,
                             label='none',
                             node_ids=True)
os.system("dot -Tpng tree.dot -o tree.png")
Example #48
0
X = dataset[:,0:50]  # ni_n[47], na_n[1], V, T
y = dataset[:,50:51] # RD_mol[47], RD_at[1]
print(dataset.shape)
print(X.shape)
print(y.shape)


# Instantiate the machine learning models
SupportVectorMachine  = SVR()
KernelRidge           = KernelRidge()
MultiLayerPerceptron  = MLPRegressor()
KNeighbors            = KNeighborsRegressor()
ExtraTree             = ExtraTreesRegressor()
DecisionTree          = DecisionTreeRegressor()
RandomForest          = RandomForestRegressor()
GradientBoosting      = GradientBoostingRegressor()
HistGradientBoosting  = HistGradientBoostingRegressor()


# Instantiate the machine learning models with pipelines
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
SupportVectorMachine_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', SupportVectorMachine)])
KernelRidge_pipe          = Pipeline([('standardize', StandardScaler()), ('regressor', KernelRidge)])
MultiLayerPerceptron_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', MultiLayerPerceptron)])
KNeighbors_pipe           = Pipeline([('standardize', StandardScaler()), ('regressor', KNeighbors)])
ExtraTree_pipe            = Pipeline([('standardize', StandardScaler()), ('regressor', ExtraTree)])
DecisionTree_pipe         = Pipeline([('standardize', StandardScaler()), ('regressor', DecisionTree)])
RandomForest_pipe         = Pipeline([('standardize', StandardScaler()), ('regressor', RandomForest)])
GradientBoosting_pipe     = Pipeline([('standardize', StandardScaler()), ('regressor', GradientBoosting)])
HistGradientBoosting_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', HistGradientBoosting)])
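The pipelines above are only instantiated here; a minimal sketch of comparing them with cross-validation (the 5-fold split and MSE scoring are assumptions, not part of this excerpt):

from sklearn.model_selection import cross_val_score

pipelines = {'SVR': SupportVectorMachine_pipe,
             'KernelRidge': KernelRidge_pipe,
             'MLP': MultiLayerPerceptron_pipe,
             'KNN': KNeighbors_pipe,
             'ExtraTrees': ExtraTree_pipe,
             'DecisionTree': DecisionTree_pipe,
             'RandomForest': RandomForest_pipe,
             'GradientBoosting': GradientBoosting_pipe,
             'HistGradientBoosting': HistGradientBoosting_pipe}

for name, pipe in pipelines.items():
    # neg_mean_squared_error: larger (less negative) is better under scikit-learn's scorer convention
    scores = cross_val_score(pipe, X, y.ravel(), cv=5, scoring='neg_mean_squared_error')
    print('{:22s} mean CV MSE: {:.4e}'.format(name, -scores.mean()))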
Example #49
0
 def fit_gbr(self):
     self.model = GradientBoostingRegressor(n_estimators=self.n_estimators,
                                            max_depth=self.max_depth)
     self.model.fit(self.x, self.y)
Example #50
0
def test_multi_target_regression_one_target():
    # Fitting MultiOutputRegressor on a single-target y should raise a ValueError
    X, y = datasets.make_regression(n_targets=1)
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    assert_raises(ValueError, rgr.fit, X, y)
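For contrast with the single-target failure case tested above, a minimal sketch of the intended multi-target usage (the target count of 3 is arbitrary):

X_multi, y_multi = datasets.make_regression(n_targets=3, random_state=0)
rgr_multi = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
rgr_multi.fit(X_multi, y_multi)
print(rgr_multi.predict(X_multi[:2]).shape)  # (2, 3): one prediction column per target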
Example #51
0
dataset['relayNo'] = dataset['relayNo'].astype('category')
x = dataset.iloc[:, 4:6].values
y = dataset.iloc[:, 7:].values

print(dataset.dtypes)

# Split the dataset into training and test sets.
# With test_size=1/3, out of 30 rows, 20 go into the training set and 10 into the test set.
xTrain, xTest, yTrain, yTest = train_test_split(x,
                                                y,
                                                test_size=1 / 3,
                                                random_state=7)

linearRegressor = MultiOutputRegressor(GradientBoostingRegressor(), n_jobs=-1)

linearRegressor.fit(xTrain, yTrain)

yPrediction = linearRegressor.predict(xTest)

#print(mean_absolute_error(yTrain,yPrediction))

print(linearRegressor.predict([[2, 87], [18, 87], [5, 90], [4, 80]]))

plot.scatter(xTrain, yTrain, color='red')
plot.plot(xTrain, linearRegressor.predict(xTrain), color='blue')
plot.title('Tracking beacons')
plot.xlabel('RSSI and Relays')
plot.ylabel('Predicted Location')
plot.show()
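The commented-out MAE check above pairs training targets with test-set predictions; a small sketch of scoring the multi-output predictions on the held-out split instead, with one error per target column:

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(yTest, yPrediction, multioutput='raw_values'))  # MAE per output column
print(mean_absolute_error(yTest, yPrediction))                            # uniform average across outputs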
Example #52
0
    print('Calculating In-Bag RMSE')
    print(MSE(label, model.predict(train[col]))**0.5)
    print('Calculating Out-Bag RMSE')
    print(np.mean(RMSE))
    return Final,Final_pred
    
## Prepare output of level 1.

## Prepare data

train_,test_ = get_additional_features(train,test,magic=True)
train_ = train_.sample(frac=1,random_state=420)
col = list(test.columns)
## Input 1: GBDT

gb1 = GradientBoostingRegressor(n_estimators=1000,max_features=0.95,learning_rate=0.005,max_depth=4)
gb1_train,gb1_test = get_sklearn_stack_data(gb1,train_,col,train_['y'],test_)

## Input2: Lasso
las1 = Lasso(alpha=5,random_state=42)
las1_train,las1_test = get_sklearn_stack_data(las1,train_,col,train_['y'],test_)

## Input 3: LGB
params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting': 'gbdt',
            'learning_rate': 0.0045,  # small learning rate, large number of iterations
            'verbose': 0,
            'num_iterations': 500,
            'bagging_fraction': 0.95,
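The LightGBM parameter block above is cut off in this excerpt, and the helper get_sklearn_stack_data() used for inputs 1 and 2 is not shown; a minimal sketch of what such an out-of-fold stacking helper commonly looks like, with the name, fold count and return layout all being assumptions:

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def get_sklearn_stack_data_sketch(model, train, cols, label, test, n_splits=5):
    # out-of-fold predictions for the training rows, fold-averaged predictions for the test rows
    oof_pred = np.zeros(len(train))
    test_pred = np.zeros(len(test))
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for tr_idx, va_idx in kf.split(train):
        model.fit(train.iloc[tr_idx][cols], label.iloc[tr_idx])
        oof_pred[va_idx] = model.predict(train.iloc[va_idx][cols])
        test_pred += model.predict(test[cols]) / n_splits
    train_out = pd.DataFrame({'stack_pred': oof_pred, 'y': label.values})
    test_out = pd.DataFrame({'stack_pred': test_pred})
    return train_out, test_out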
Example #53
0

# ********************** 2. Run the full-sample 3/12/24/36-month rolling-window functions ********************** #
path = r'..\DataBase\factor'  # directory holding the 96 factors
factorname = [x[1:-4] for x in os.listdir(path)]
_dt, _dt2 = datatransfrom(path), datatransfrom2(path)  # call each transform once instead of repeatedly
riskfree, timeseries, factor = _dt[0], _dt[1], _dt[2]
timeseries2, index = _dt2[0], _dt2[1]
for i in range(4):
    output(window[i],LinearRegression(),'OLS'+str(window[i]),riskfree[i], timeseries)
    FC(window[i], riskfree[i], timeseries, 96,'FC')
    output(window[i], PLSRegression(PLS_params[i]), 'PLS' + str(window[i]), riskfree[i], timeseries)
    output(window[i],Lasso(alpha=lasso_params[i]),'Lasso'+ str(window[i]), riskfree[i], timeseries)
    output(window[i],Ridge(alpha=ridge_params[i]),'Ridge'+str(window[i]),riskfree[i], timeseries)
    output(window[i],ElasticNet(alpha= elasticnet_params['alpha'] [i],l1_ratio= elasticnet_params['l1_ratio'][i]),'ElasticNet'+str(window[i]),riskfree[i], timeseries)
    output(window[i],SVR(kernel=SVR_params['kernel'][i],gamma= SVR_params ['gamma'][i],C= SVR_params ['C'][i] ),'SVR'+str(window[i]),riskfree[i], timeseries)
    output(window[i], GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i],learning_rate=GBDT_params['learning_rate'][i]), 'GBDT' + str(window[i]),riskfree[i], timeseries)
    output(window[i], XGBRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i], learning_rate=GBDT_params['learning_rate'][i]), 'XGBOOST' + str(window[i]), riskfree[i], timeseries)
    output(window[i], ensemblenn(5,modeluse = MLPRegressor(solver = 'lbfgs', max_iter=ENANN_params['max_iter'][i]), pickpercent=ENANN_params['p'][i]), 'ENANN' + str(window[i]), riskfree[i], timeseries)
    output(window[i], DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10, batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]), 'DFN' + str(window[i]), riskfree[i], timeseries)
    output2(window[i], rm.lstmmodule(96, LSTM_params['hidden_number'][i], LSTM_params['depth'][i], 100, 3571, lr=LSTM_params['learning_rate'][i]), 'LSTM'+ str(window[i]) ,riskfree[i], timeseries2)
    output2(window[i], rm.lstmmodule(96,  RNN_params['hidden_number'][i], RNN_params['depth'][i], 100, 3571, lr=RNN_params['learning_rate'][i], ntype='RNN'), 'RNN'+ str(window[i]), riskfree[i], timeseries2)
    modellist = [DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10, batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]),
                 ensemblenn(5,modeluse = MLPRegressor(solver = 'lbfgs', max_iter=ENANN_params['max_iter'][i]), pickpercent=ENANN_params['p'][i]),
                 XGBRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i], learning_rate=GBDT_params['learning_rate'][i]),
                 GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i],learning_rate=GBDT_params['learning_rate'][i]),
                 PLSRegression(PLS_params[i]),
                 Ridge(alpha=ridge_params[i]),
                 SVR(kernel=SVR_params['kernel'][i], gamma=SVR_params['gamma'][i], C=SVR_params['C'][i])]  # PLS must be placed third from the end
    nmolist = [rm.lstmmodule(96, LSTM_params['hidden_number'][i], LSTM_params['depth'][i], 100, 3571, lr=LSTM_params['learning_rate'][i]),
               rm.lstmmodule(96, RNN_params['hidden_number'][i], RNN_params['depth'][i], 100, 3571, lr=RNN_params['learning_rate'][i], ntype='RNN')]  # recurrent neural network models
    modelname = ['DFN', 'En-ann', 'xgboost', 'GBDT', 'lasso', 'Elasticnet', 'pls', 'Ridge', 'svm', 'LSTM', 'RNN']
Example #54
0
# Models
# LASSO Regression :
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
# Elastic Net Regression
ENet = make_pipeline(
    RobustScaler(), ElasticNet(
        alpha=0.0005, l1_ratio=.9, random_state=3))
# Kernel Ridge Regression
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
# Gradient Boosting Regression
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5)
#  XGboost
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
Example #55
0
    X_train, weights_train, y_train = (X_train[:args.num_training_samples],
                                       weights_train[:args.num_training_samples],
                                       y_train[:args.num_training_samples])

    print('Training data shape: {}\nTesting data shape:{}'.format(
        str(X_train.shape), str(X_test.shape)))

    print('Training model ...')

    param_grid = json.loads(args.param_config.read())

    trees_grid = GridSearchCV(GradientBoostingRegressor(),
                              param_grid=param_grid, n_jobs=args.cores,
                              scoring='neg_mean_squared_error', cv=10)
    trees_grid.fit(X_train, y_train, sample_weight=weights_train)

    model = trees_grid.best_estimator_

    print('Best MSE achieved: {}'.format(str(-1 * trees_grid.best_score_)))

    prediction = model.predict(X_test)

    score = mean_squared_error(y_test, prediction, sample_weight=weights_test)

    print('Validation set MSE: {}'.format(str(score)))
    print('Saving model ...')

    dump(model, args.out_model)
Example #56
0
def _select_estimator(estimator, n_jobs, n_estimators, random_state=None):
    '''Select estimator and parameters from argument name.'''
    # Regressors
    if estimator == 'RandomForestRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = RandomForestRegressor(
            n_jobs=n_jobs, n_estimators=n_estimators,
            random_state=random_state)
    elif estimator == 'ExtraTreesRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = ExtraTreesRegressor(
            n_jobs=n_jobs, n_estimators=n_estimators,
            random_state=random_state)
    elif estimator == 'GradientBoostingRegressor':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingRegressor(
            n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'SVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='rbf', gamma='scale')
    elif estimator == 'LinearSVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='linear')  # note: SVR with a linear kernel, not the LinearSVR class
    elif estimator == 'Ridge':
        param_dist = parameters['linear']
        estimator = Ridge(solver='auto', random_state=random_state)
    elif estimator == 'Lasso':
        param_dist = parameters['linear']
        estimator = Lasso(random_state=random_state)
    elif estimator == 'ElasticNet':
        param_dist = parameters['linear']
        estimator = ElasticNet(random_state=random_state)
    elif estimator == 'KNeighborsRegressor':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsRegressor(algorithm='auto')

    # Classifiers
    elif estimator == 'RandomForestClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = RandomForestClassifier(
            n_jobs=n_jobs, n_estimators=n_estimators,
            random_state=random_state)
    elif estimator == 'ExtraTreesClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = ExtraTreesClassifier(
            n_jobs=n_jobs, n_estimators=n_estimators,
            random_state=random_state)
    elif estimator == 'GradientBoostingClassifier':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingClassifier(
            n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'LinearSVC':
        param_dist = parameters['linear_svm']
        estimator = LinearSVC(random_state=random_state)
    elif estimator == 'SVC':
        param_dist = parameters['svm']
        estimator = SVC(kernel='rbf', random_state=random_state, gamma='scale')
    elif estimator == 'KNeighborsClassifier':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsClassifier(algorithm='auto')

    return param_dist, estimator
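A minimal usage sketch for _select_estimator(); feeding the returned param_dist and estimator into RandomizedSearchCV is one natural way to consume them (the data names and search settings below are placeholders, not part of the original code):

from sklearn.model_selection import RandomizedSearchCV

param_dist, est = _select_estimator('GradientBoostingRegressor',
                                    n_jobs=1, n_estimators=100, random_state=0)
search = RandomizedSearchCV(est, param_distributions=param_dist,
                            n_iter=20, cv=5, random_state=0)
search.fit(X_train, y_train)  # X_train / y_train stand in for whatever training data is at hand
print(search.best_params_)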
def gradient_booster(param_grid, n_jobs):
    estimator = GradientBoostingRegressor()
    classifier = GridSearchCV(estimator=estimator, cv=5, param_grid=param_grid,
                              n_jobs=n_jobs)
    classifier.fit(X_train, y_train)
    print(classifier.best_estimator_)
 def gradient_boosting_regressor(self, train_x, train_y):
     from sklearn.ensemble import GradientBoostingRegressor
     model = GradientBoostingRegressor(n_estimators=100)
     model.fit(train_x, train_y)
     return model


gradient_booster(p1, job1)

# Train GBR with optimized parameters
# (the full parameter repr below comes from an older scikit-learn release; presort,
#  min_impurity_split and loss='ls' were later removed or renamed)

clf = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                learning_rate=0.05, loss='ls', max_depth=4,
                                max_features=1.0, max_leaf_nodes=None,
                                min_impurity_decrease=0.0, min_impurity_split=None,
                                min_samples_leaf=3, min_samples_split=2,
                                min_weight_fraction_leaf=0.0, n_estimators=100,
                                n_iter_no_change=None, presort='auto',
                                random_state=None, subsample=1.0, tol=0.0001,
                                validation_fraction=0.1, verbose=0, warm_start=False)

clf.fit(X_train, y_train)

# Predicting the results for our test data set
predicted_values = clf.predict(X_test)

print(f"Printing MAE error(avg abs residual): {metrics.mean_absolute_error(y_test, predicted_values)}")
print(f"Printing MSE error: {metrics.mean_squared_error(y_test, predicted_values)}")
print(f"Printing RMSE error: {np.sqrt(metrics.mean_squared_error(y_test, predicted_values))}")
print(f"R2 Score: {metrics.r2_score(y_test, predicted_values)}")
Example #60
0
# RMSE
math.sqrt(mean_squared_error(target_test, predicted_tree_boost))
"""
    Gradient Boosting Regression ----------------------------------------------------

"""

# Tune hyperparameters of the GradientBoostingRegressor
parameters = {
    'max_depth': [2, 3, 4, 5, 10],
    'learning_rate': [0.001, 0.05, 0.1, 0.3, 0.8, 1, 1.5, 2, 4, 5],
    'n_estimators': range(2, 50, 5),
    'min_samples_leaf': [1, 2, 3, 5],
    'max_leaf_nodes': [5, 7, 10, 15]
}
grid_search_gradientboost = GridSearchCV(GradientBoostingRegressor(),
                                         parameters,
                                         n_jobs=4)

grid_search_gradientboost.fit(regressors_train_pca, target_train)
print(grid_search_gradientboost.best_score_,
      grid_search_gradientboost.best_params_)

# Train Best Model
regr_gradientboost = GradientBoostingRegressor(n_estimators=85,
                                               max_depth=5,
                                               min_samples_split=2,
                                               max_leaf_nodes=14,
                                               min_samples_leaf=4,
                                               learning_rate=0.15,
                                               loss='ls')