Example #1
def ada():
    print("----------------------Ada----------------------------")
    t = DecisionTreeRegressor(max_depth=7, criterion='mse')
    ada = AdaBoostRegressor(base_estimator=t,
                            n_estimators=150,
                            random_state=seed)
    ada.fit(Xtrain, Ytrain)
    valiada = cross_val_score(ada, Xvalid, Yvalid, cv=kf, verbose=1, n_jobs=-1)
    test_score = ada.score(Xvalid, Yvalid)
    Y = ada.predict(Xvalid)
    print(test_score)
    print("MSE:", mean_squared_error(Yvalid, Y, squared=False))
Example #3
def test_boston():
    # Check consistency on dataset boston house prices.
    reg = AdaBoostRegressor(random_state=0)
    reg.fit(boston.data, boston.target)
    score = reg.score(boston.data, boston.target)
    assert score > 0.85

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert_equal(len(set(est.random_state for est in reg.estimators_)),
                 len(reg.estimators_))
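The `boston` fixture used above relies on `sklearn.datasets.load_boston`, which was deprecated in scikit-learn 1.0 and removed in 1.2. A hedged sketch of the same smoke test on the California housing data (the 0.5 threshold is an assumption, not a verified bound):

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import AdaBoostRegressor

def test_california():
    # Check consistency on the California housing dataset (downloads on first use).
    data = fetch_california_housing()
    reg = AdaBoostRegressor(random_state=0)
    reg.fit(data.data, data.target)
    assert reg.score(data.data, data.target) > 0.5  # assumed threshold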
Example #4
def test_diabetes(loss):
    # Check consistency on dataset diabetes.
    reg = AdaBoostRegressor(loss=loss, random_state=0)
    reg.fit(diabetes.data, diabetes.target)
    score = reg.score(diabetes.data, diabetes.target)
    assert score > 0.6

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert (len(set(est.random_state
                    for est in reg.estimators_)) == len(reg.estimators_))
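`test_diabetes` receives `loss` as an argument; in a pytest suite this is presumably supplied by parametrization, along these lines:

import pytest

# Presumed decorator feeding the `loss` argument above (one test run per loss):
@pytest.mark.parametrize("loss", ["linear", "square", "exponential"])
def test_diabetes(loss):
    ...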
Example #5
class _AdaBoostRegressorImpl:
    def __init__(
        self,
        base_estimator=None,
        *,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        if base_estimator is None:
            estimator_impl = None
        else:
            estimator_impl = _FitSpecProxy(base_estimator)

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "loss": loss,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = _FitSpecProxy(
                feature_transformer >> self._hyperparams["base_estimator"])
            self._wrapped_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
Example #6
def DecisionTreeAdaBoost(X_train, y_train, X_test, y_test):
    # Create Decision Tree Regressor object
    tree_1 = DecisionTreeRegressor()
    tree_2 = AdaBoostRegressor(DecisionTreeRegressor(),
                               n_estimators=200,
                               learning_rate=.1)
    # Train the model using the training sets
    tree_1.fit(X_train, y_train)
    tree_2.fit(X_train, y_train)
    # Score the decision tree model
    tree_1.score(X_test, y_test)
    # Score the boosted decision tree model
    boosted_tree_score = tree_2.score(X_test, y_test)
    # Make predictions using the testing set
    tree_1_pred = tree_1.predict(X_test)
    tree_2_pred = tree_2.predict(X_test)
    # The mean squared error
    tree2RMSE = sqrt(mean_squared_error(y_test, tree_2_pred))
    print("Root mean squared error: %.2f" % tree2RMSE)
    # The mean absolute error
    print("Mean absolute error: %.2f" %
          mean_absolute_error(y_test, tree_2_pred))
    # Explained variance score: 1 is perfect prediction
    print('R-squared decision tree: %.2f' % r2_score(y_test, tree_2_pred))
    features = X_train.columns
    importances = tree_2.feature_importances_
    indices = np.argsort(importances)

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             importances[indices],
             color='b',
             align='center')
    plt.yticks(range(len(indices)), features[indices])
    plt.xlabel('Relative Importance')
    plt.show()
    plt.scatter(y_test, tree_1_pred)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title('Decision Tree Predicted vs Actual')
    plt.show()
    chart_regression(tree_1_pred, y_test, 'Decision tree')

    plt.scatter(y_test, tree_2_pred)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title('Boosted Decision Tree Predicted vs Actual')
    plt.show()
    chart_regression(tree_2_pred, y_test, 'Adaboost + DT')

    return boosted_tree_score, tree2RMSE
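`chart_regression` is not defined in this snippet; a minimal sketch of what it presumably does (the signature is inferred from the calls above, so treat it as a hypothetical helper):

import matplotlib.pyplot as plt

def chart_regression(pred, y, title):
    # Hypothetical helper: overlay predicted and actual values in sample order.
    plt.plot(list(y), label='actual')
    plt.plot(list(pred), label='predicted')
    plt.title(title)
    plt.legend()
    plt.show()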
Example #7
    def ApplyAdaBoostRegressor(self, train, test, cross_validation, full_train,
                               config):
        ABR = AdaBoostRegressor(loss=config['loss'],
                                n_estimators=config['n_estimators'])
        target_train = train[['Hazard']]
        cross_validation_test = cross_validation[['Hazard']]
        prepared_train = train[train.columns.difference(['Id', 'Hazard'])]

        print "prepared_train meta"
        print "shape", prepared_train.shape
        print prepared_train.head(3)

        ABR.fit(prepared_train, target_train)
        dt = ABR.predict(test[test.columns.difference(['Id'])])
        print "prediction score on cross validation"
        print ABR.score(
            cross_validation[cross_validation.columns.difference(
                ['Id', 'Hazard'])], cross_validation_test)
        dt_cv = ABR.predict(
            cross_validation[cross_validation.columns.difference(
                ['Id', 'Hazard'])])
        test['Hazard'] = self.clipForecastValue(dt)
        cross_validation['predicted_Hazard'] = self.clipForecastValue(dt_cv)

        names = prepared_train.columns.values

        print "sorted feature importance"
        print sorted(zip(map(lambda x: round(x, 4), ABR.feature_importances_),
                         names),
                     reverse=True)

        # computing the Gini score
        print("the Gini score")
        print(self.gini_normalized(
            np.ravel(cross_validation[['Hazard']]),
            np.ravel(cross_validation[['predicted_Hazard']])))

        return test, cross_validation
Example #8
def adaboost():
    # train = genfromtxt(open('./data/PCA_train_scored.csv', 'r'), delimiter=',', dtype='f8')[1:]
    # house_prices = genfromtxt(open('./data/train_scored_y.csv', 'r'), delimiter=',', dtype='f8')[1:]
    # test_data = genfromtxt(open('./data/PCA_test_scored.csv', 'r'), delimiter=',', dtype='f8')[1:]

    train = genfromtxt(open(
        './data/feature_engineering_test/filtered_train_new_scored.csv', 'r'),
                       delimiter=',',
                       dtype='f8')[1:]
    house_prices = genfromtxt(open('./data/train_scored_y.csv', 'r'),
                              delimiter=',',
                              dtype='f8')[1:]
    test = genfromtxt(open(
        './data/feature_engineering_test/filtered_test_new_scored.csv', 'r'),
                      delimiter=',',
                      dtype='f8')[1:]

    # train_data = genfromtxt(open('./data/feature_engineering_test/PCA_train_new_scored.csv', 'r'), delimiter=',', dtype='f8')[1:1320,1:]
    # house_prices_data = genfromtxt(open('./data/train_scored_y.csv', 'r'), delimiter=',', dtype='f8')[1:]
    # test_data = genfromtxt(open('./data/feature_engineering_test/PCA_test_new_scored.csv', 'r'), delimiter=',', dtype='f8')[1:,1:]

    totalCols = 100

    train_data = train[:1320, 1:]
    house_prices_data = house_prices[:1320]

    validation_data = train[1320:, 1:]
    house_prices_validation = house_prices[1320:]

    test_data = test[0:, 1:]

    # Fit regression model
    regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=12),
                               n_estimators=500,
                               loss='square',
                               learning_rate=1)

    regr_2.fit(train_data, house_prices_data)

    #Predict validation
    y_validation = regr_2.predict(validation_data)
    mse = mean_squared_error(house_prices_validation, y_validation)
    print("AdaBoost MSE: %.4f" % mse)
    print("AdaBoost Variance: %.4f" %
          regr_2.score(validation_data, y_validation))

    # Predict
    y_2 = regr_2.predict(test_data)

    writeOutput(y_2, 'AdaBoost')
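`writeOutput` is also undefined here; given the house-price context it presumably dumps predictions to a submission CSV. A sketch under that assumption (the path, column names, and id scheme are guesses):

import csv

def writeOutput(predictions, model_name):
    # Hypothetical helper: one (id, prediction) row per test sample.
    with open('./data/%s_submission.csv' % model_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Id', 'SalePrice'])  # assumed header
        for i, p in enumerate(predictions, start=1):
            writer.writerow([i, p])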
Example #9
def adaboost_regression():
    '''
    runs an adaboost regression over the data set
    runs one with just the cleaned data

    we are already certain that cleaned data significantly outperforms
    raw data, so we will not waste any more time training models with raw data

    warning: this takes a REALLY long time to run; we would not recommend
        running this, especially because the results are not amazing
    '''
    # adaboost parameters
    kFold = 5
    param_grid = {
        'loss': np.array(['linear', 'square', 'exponential']),
        'learning_rate': np.arange(1, 101, 5) / 100,
        'n_estimators': np.arange(40, 400, 20)
    }
    adaboost_grid = GridSearchCV(AdaBoostRegressor(), param_grid, cv=kFold)

    # test using the cleaned data
    x_np, y_np, df = load_data()
    y_np_c, x_np_c, df_c = clean_data(df)
    x_train, x_test, y_train, y_test = split_data(y_np_c, x_np_c)
    adaboost_grid.fit(x_train, y_train)
    best_learn = adaboost_grid.best_params_['learning_rate']
    best_loss = adaboost_grid.best_params_['loss']
    best_n = adaboost_grid.best_params_['n_estimators']

    print("Best learning rate: %f" % best_learn)
    print("Best loss function: %s" % best_loss)
    print("Best n estimators: %f" % best_n)

    # train a model using these best parameters
    adaboost_model = AdaBoostRegressor(n_estimators=best_n,
                                       learning_rate=best_learn,
                                       loss=best_loss)
    adaboost_model.fit(x_train, y_train)

    y_predict = adaboost_model.predict(x_test)
    mse = mean_squared_error(y_test, y_predict)
    r2 = adaboost_model.score(x_test, y_test)

    print(
        "Performance of adaboost regression with removed day labels and normalized"
    )
    print("Mean Squared Error:  %f" % mse)
    print("RMSE:                %f" % (mse**0.5))
    print("R^2:                 %f" % r2)
Example #10
File: models.py Project: maxxliu/ML-Booth
def adaboost_regression():
    '''
    runs an adaboost regression over the data set
    warning: this will probably take a long time to run
    '''
    print("Currently running AdaBoost Regression")
    train_x, train_y, _ = load_data()
    # convert to np array
    train_x = train_x.values
    train_y = train_y.values
    # convert the y values to log
    train_y = log_transform(train_y, "forward")
    # split the data
    x_train, x_test, y_train, y_test = split_data(train_x, train_y)

    # adaboost parameters
    kFold = 5
    param_grid = {
        'loss': np.array(['linear', 'square', 'exponential']),
        'learning_rate': np.arange(1, 101, 5) / 100,
        'n_estimators': np.arange(40, 400, 20)
    }
    adaboost_grid = GridSearchCV(AdaBoostRegressor(), param_grid, cv=kFold)

    # test using the training data
    adaboost_grid.fit(x_train, y_train)
    best_learn = adaboost_grid.best_params_['learning_rate']
    best_loss = adaboost_grid.best_params_['loss']
    best_n = adaboost_grid.best_params_['n_estimators']

    print("Best learning rate:  %f" % best_learn)
    print("Best loss function:  %s" % best_loss)
    print("Best n estimators:   %f" % best_n)

    # train a model using these best parameters
    adaboost_model = AdaBoostRegressor(n_estimators=best_n,
                                       learning_rate=best_learn,
                                       loss=best_loss)
    adaboost_model.fit(x_train, y_train)

    y_predict = adaboost_model.predict(x_test)
    mse = mean_squared_error(y_test, y_predict)
    r2 = adaboost_model.score(x_test, y_test)

    print("Performance of adaboost regression")
    print("Mean Squared Error:  %f" % mse)
    print("RMSE:                %f" % (mse**0.5))
    print("R^2:                 %f" % r2)
Example #11
class _AdaBoostRegressorImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        estimator_impl = base_estimator
        if isinstance(estimator_impl, lale.operators.Operator):
            if isinstance(estimator_impl, lale.operators.IndividualOp):
                estimator_impl = estimator_impl._impl_instance()
                wrapped_model = getattr(estimator_impl, "_wrapped_model", None)
                if wrapped_model is not None:
                    estimator_impl = wrapped_model
            else:
                raise ValueError(
                    "If base_estimator is a Lale operator, it needs to be an individual operator. "
                )
        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "loss": loss,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
Example #12
def generate_model(X_train, X_test, y_train, y_test):
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=100),
                              n_estimators=200,
                              learning_rate=0.01)
    '''
    model=GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
    max_depth=1, random_state=0, loss='ls')
   
    model = ExtraTreesRegressor(
                          n_estimators=200
                         )
    '''
    model.fit(X_train, y_train)
    print("model score ", model.score(X_test, y_test))
    return model
Example #13
def run_tree_regressor():
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import train_test_split
    import numpy as np
    from sklearn.ensemble import AdaBoostRegressor
 
    print "running me"
    X = np.genfromtxt("/home/john/Downloads/kaggle.X1.train.txt",delimiter=",") # load the text file
    Y = np.genfromtxt("/home/john/Downloads/kaggle.Y.train.txt",delimiter=",") 
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)
     
    rng = np.random.RandomState(1)
 
    depth = 35 # current lowest
    for estimators in [130,235,300,345,450]:
        treeAdaBoost =  AdaBoostRegressor(DecisionTreeRegressor(max_depth=depth),n_estimators=estimators, random_state=rng)
        treeAdaBoost.fit(x_train, y_train)
        print "adabost estimators @ " + str(estimators) + ":", treeAdaBoost.score(x_test, y_test)
Example #14
def Adaboost(Xtrain, Ytrain, Xtest, Ytest):
    """
	Apply the adaboost algorithm
	"""
    from sklearn.ensemble import AdaBoostRegressor
    print('\nAdaboost:')

    clf = AdaBoostRegressor(n_estimators=1000).fit(Xtrain, Ytrain)
    print('Training R^2: {0}'.format(clf.score(Xtrain, Ytrain)))

    #find the training error
    prediction = clf.predict(Xtrain)
    Etrain = error(prediction, Ytrain)
    print('Training error: {0}'.format(Etrain))

    #find the test error
    prediction = clf.predict(Xtest)
    Etest = error(prediction, Ytest)
    print('Test error: {0}'.format(Etest))
Example #15
def boosting(X, y, k_cv):
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    regr = AdaBoostRegressor(base_estimator=SVR(C=40, gamma=0.01),
                             random_state=319,
                             n_estimators=40,
                             learning_rate=0.01,
                             loss="square")
    valid_split = kfold.split(y)
    for i in range(k_cv):
        split_index = next(valid_split)
        test_index = split_index[1]
        y_test = y[test_index]
        trainval_index = split_index[0]
        X_trainval = X[trainval_index, :]
        X_test = X[test_index, :]
        y_trainval = y[trainval_index]
        regr.fit(X_trainval, y_trainval)
        print("train sqrt(R^2):", regr.score(X_trainval, y_trainval)**0.5)
        test_pre = regr.predict(X_test)
        print("test sqrt(R^2):", r_2(y_test, test_pre)**0.5)
Example #16
def makeAdaDefaultBaseEstimatorPrediction(n_est):
    global y_t_pred, result
    print "Prediction and #estimators = %s" % (n_est)
    prefix = "%s_AdaBoost_n_est%s_DefaultDecisionTree" % (name, n_est)
    model = AdaBoostRegressor(n_estimators=n_est)
    x1 = x[:, :]  # use all data
    x_t1 = x_t[:, :]  # use all data
    y_t_pred = model.fit(x1, y).predict(x_t1)
    r = model.score(x1, y)
    print("score r = %s" % r)
    print "Estimator weights: %s..." % model.estimator_weights_
    bla1 = (sorted(enumerate(model.estimator_weights_),
                   key=lambda x: -abs(x[1]))[:5])
    print "Abs-Val largest est-weights: %s..." % bla1
    plt.clf()
    plt.plot(model.estimator_weights_, "ro")
    plt.title("Most relevant coef:%s" % (bla1))
    plt.savefig(prefix + "_est_weights.png")
    plt.show()
    return prefix, model
Example #17
def makeAdaLassoPrediction(al, n_est):
    global y_t_pred, result, alpha
    alpha = al
    print "Prediction with alpha = %s and #estimators = %s" % (alpha, n_est)
    prefix = "%s_AdaBoost_Lasso_alpha%s" % (name, alpha)
    model = AdaBoostRegressor(Lasso(alpha=alpha), n_estimators=n_est)
    x1 = x[:, :]  # use all data
    x_t1 = x_t[:, :]  # use all data
    y_t_pred = model.fit(x1, y).predict(x_t1)
    r = model.score(x1, y)
    print("score r = %s" % r)
    print "Estimator weights: %s..." % model.estimator_weights_
    bla1 = (sorted(enumerate(model.estimator_weights_),
                   key=lambda x: -abs(x[1]))[:5])
    print "Abs-Val largest est-weights: %s..." % bla1
    plt.clf()
    plt.plot(model.estimator_weights_, "ro")
    plt.title("Most relevant coef:%s" % (bla1))
    plt.savefig(prefix + "_est_weights.png")
    plt.show()
    return prefix, model
Example #18
class _AdaBoostRegressorImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        if isinstance(base_estimator, lale.operators.Operator):
            if isinstance(base_estimator, lale.operators.IndividualOp):
                base_estimator = base_estimator._impl_instance()
                wrapped_model = getattr(base_estimator, "_wrapped_model", None)
                if wrapped_model is not None:
                    base_estimator = wrapped_model
            else:
                raise ValueError(
                    "If base_estimator is a Lale operator, it needs to be an individual operator. "
                )
        self._hyperparams = {
            "base_estimator": base_estimator,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "loss": loss,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
Example #19
class ABRegressor():
    def __init__(self, dataset):
        self.dataset = dataset
        self.adaboost = AdaBoostRegressor(
            **DEFAULTS[dataset]['ab']['defaults'])
        print("""
    		*************************
    		    Ada Boost Regressor
    		************************ 
    	""")

    def train_and_predict(self, X, y, X_test):
        '''
        fit training dataset and predict values for test dataset
        '''
        self.adaboost.fit(X, y)
        self.adaboost.predict(X_test)

    def score(self, X, X_test, y, y_test):
        '''
        Returns the score of Ada Boost by fitting training data
        '''
        self.train_and_predict(X, y, X_test)
        return self.adaboost.score(X_test, y_test)

    def create_new_instance(self, values):
        return AdaBoostRegressor(**values)

    def param_grid(self, is_random=False):
        '''
        dictionary of hyper-parameters to get good values for each one of them
        '''
        # random search only accepts a dict for params, whereas grid search can take either a dict or a list of dicts
        return DEFAULTS[self.dataset]['ab']['param_grid']

    def get_sklearn_model_class(self):
        return self.adaboost

    def __str__(self):
        return "AdaBoostRegressor"
Example #20
def run_tree_models(X, y):
    '''
    Get an overview of performances of different tree models.
    Tree models: Decision tree, AdaBoost, Bagged tree
    INPUT: Dataframe with features (X) and target variable dataframe (y)
    OUTPUT: Scores of each tree model
    '''
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    dt = DecisionTreeRegressor()
    dt.fit(X_train, y_train)
    print('Decision Tree Score: ' + str(dt.score(X_test, y_test)))

    ada = AdaBoostRegressor(LinearRegression())
    ada.fit(X_train, y_train)
    print('AdaBoost Regressor Score: ' + str(ada.score(X_test, y_test)))

    # Train and Score Bagged Tree Regressor (ensemble learner)
    bagged_tree = BaggingRegressor(DecisionTreeRegressor())
    bagged_tree.fit(X_train, y_train)
    print('Bagged Tree Score: ' + str(bagged_tree.score(X_test, y_test)))
Example #21
def test_adaboostregressor_sample_weight():
    # check that giving weight will have an influence on the error computed
    # for a weak learner
    rng = np.random.RandomState(42)
    X = np.linspace(0, 100, num=1000)
    y = (.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
    X = X.reshape(-1, 1)

    # add an arbitrary outlier
    X[-1] *= 10
    y[-1] = 10000

    # random_state=0 ensures that the underlying bootstrap will use the outlier
    regr_no_outlier = AdaBoostRegressor(base_estimator=LinearRegression(),
                                        n_estimators=1,
                                        random_state=0)
    regr_with_weight = clone(regr_no_outlier)
    regr_with_outlier = clone(regr_no_outlier)

    # fit 3 models:
    # - a model containing the outlier
    # - a model without the outlier
    # - a model containing the outlier but with a null sample-weight
    regr_with_outlier.fit(X, y)
    regr_no_outlier.fit(X[:-1], y[:-1])
    sample_weight = np.ones_like(y)
    sample_weight[-1] = 0
    regr_with_weight.fit(X, y, sample_weight=sample_weight)

    score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
    score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
    score_with_weight = regr_with_weight.score(X[:-1], y[:-1])

    assert score_with_outlier < score_no_outlier
    assert score_with_outlier < score_with_weight
    assert score_no_outlier == pytest.approx(score_with_weight)
Example #22
from sklearn.tree import DecisionTreeRegressor
## DECISION TREES
regressor = DecisionTreeRegressor(max_leaf_nodes=9072)
regressor.fit(X_train, Y_train)

#%%

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
##BOOSTING
regBoost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=300), loss='square')
regBoost.fit(X_train, Y_train)

Y_pred = regBoost.predict(X_test)

regBoost.score(X_train, Y_train)
regBoost.score(X_test, Y_test)

Y_pred = Y_pred.astype(int)


#%%

regr_3 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=200), n_estimators=300)
regr_3.fit(X_train,Y_train)
y_3 = regr_3.predict(X_test)


#%%

Y_pred = regressor.predict(X_test)
Example #23
print('Training Score : ', reg_ridge.score(X_train, y_train))
print('Testing Score : ', reg_ridge.score(X_test, y_test))
print('Mean Square Error:', mean_squared_error(y_test, y_pred_ridge))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_ridge))
print('Root Mean Square Error:', mean_squared_error(y_test, y_pred_ridge)**0.5)
r2_ridge=r2_score(y_test, y_pred_ridge)
r2_ridge=1-(((1-r2_ridge)*(n-1))/(n-p-1))
print('R2 adjusted:',r2_ridge)
r2_scores.append(r2_ridge)

#AdaBoost Regression
reg_ada = AdaBoostRegressor(n_estimators=1000)
reg_ada.fit(X_train , y_train)
y_pred_ada = reg_ada.predict(X_test)
print('5.AdaBoost Regression')
print('Training Score : ',reg_ada.score(X_train, y_train))
print('Testing Score : ',reg_ada.score(X_test, y_test))
print('Mean Square Error:',mean_squared_error(y_test, y_pred_ada))
print('Mean Absolute Error:',mean_absolute_error(y_test, y_pred_ada))
print('Root Mean Square Error:',mean_squared_error(y_test, y_pred_ada)**0.5)
r2_ada=r2_score(y_test, y_pred_ada)
r2_ada=1-(((1-r2_ada)*(n-1))/(n-p-1))
print('R2 adjusted:',r2_ada)
r2_scores.append(r2_ada)

#Gradient Boost Regression
reg_gradient = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1,max_depth=1, random_state=0, loss='ls',verbose = 1)
reg_gradient.fit(X_train , y_train)
y_pred_gradient = reg_gradient.predict(X_test)
print('6.Gradient Boosting Regression')
print('Training Score : ',reg_gradient.score(X_train, y_train))
Example #24
def test_boston():
    # Check consistency on dataset boston house prices.
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
    assert score > 0.85
Example #25
            tmpSCR = randForrC.score(testX, yTest)
        else:
            randForrR.fit(trainX, yTrain)
            tmpSCR = randForrR.score(testX, yTest)
        scores['rand Forest'][label].append(tmpSCR)
        tTOT = time.time() - t0
        times['rand Forest'][label].append(tTOT)

        print("start adaBoost")
        t0 = time.time()
        if cnt < 2:
            adaBoostC.fit(trainX, yTrain)
            tmpSCR = adaBoostC.score(testX, yTest)
        else:
            adaBoostR.fit(trainX, yTrain)
            tmpSCR = adaBoostR.score(testX, yTest)
        scores['adaBoost'][label].append(tmpSCR)
        tTOT = time.time() - t0
        times['adaBoost'][label].append(tTOT)

        t0 = time.time()
        print("start bagging withOUT out-of-bag")
        if cnt < 2:
            bagCoobN.fit(trainX, yTrain)
            tmpSCR = bagCoobN.score(testX, yTest)
        else:
            bagRoobN.fit(trainX, yTrain)
            tmpSCR = bagRoobN.score(testX, yTest)
        scores['bagging (NO out of bag)'][label].append(tmpSCR)
        tTOT = time.time() - t0
        times['bagging (NO out of bag)'][label].append(tTOT)
Example #26
# In[74]:

from sklearn.ensemble import RandomForestRegressor

# In[75]:

rf = RandomForestRegressor(n_estimators=200, random_state=45)
rf.fit(train_x, train_y)

# In[76]:

pred = rf.predict(test_x)
pred

# In[77]:

from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(train_x, train_y)
print(model.score(train_x, train_y))
abpred = model.predict(test_x)
print(abpred)
model.score(test_x, test_y)

# In[78]:

import joblib
joblib.dump(abpred, 'abpredsave.obj')

# In[ ]:
Example #27
def test_boston():
    """Check consistency on dataset boston house prices."""
    clf = AdaBoostRegressor()
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
    assert score > 0.85
Example #28
    regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8,
                                                   min_samples_split=2,
                                                   random_state=rnd),
                             n_estimators=n,
                             learning_rate=0.1,
                             random_state=rnd,
                             loss='exponential')
    # r = DecisionTreeRegressor(random_state=rnd).get_params('random_state')
    regr.fit(training_data, training_labels)
    print('n feat: ', regr.n_features_in_)
    input('ENTER')

    # Predict
    y = regr.predict(training_data)
    z = regr.predict(testing_data)
    sy = regr.score(training_data, training_labels)
    sz = regr.score(testing_data, testing_labels)

    training_scores[i] = sy
    testing_scores[i] = sz

    # print ' '
    # print 'Training scores --> n_est = %0.2f , ts = %0.2f : ' % (n, t), sy
    # print 'Testing scores --> n_est = %0.2f , ts = %0.2f: '% (n, t), sz
    # print ' '

    scores_z = cross_val_score(regr, testing_data, testing_labels)
    scores_y = cross_val_score(regr, training_data, training_labels)
    # print ' accuracy training: %0.2f (+/- %0.2f) ' % (scores_y.mean(), scores_y.std() *2)
    # print ' accuracy testing: %0.2f (+/- %0.2f) ' % (scores_z.mean(), scores_z.std() *2)
Example #29
                                                    test_size=0.1,
                                                    random_state=42)
X_train

# In[ ]:

# In[ ]:

# In[ ]:

# In[40]:

from sklearn.ensemble import AdaBoostRegressor
Ada_reg = AdaBoostRegressor(random_state=42)
Ada_reg.fit(X_train, y_train)
Ada_reg.score(X_train, y_train)

# In[ ]:

# In[41]:

Ada_reg.get_params

# In[42]:

param_grid_ada = [
    {
        'n_estimators': [45, 50, 55],
        'learning_rate': [0.75, 1, 1.25]
    },
]
Example #30
                    param_grid=parameters_ada,
                    cv=5,
                    scoring='neg_mean_squared_error')

clf2.fit(X_train, y_train)

print(clf2.best_params_)

regr2 = AdaBoostRegressor(regr1,
                          n_estimators=100,
                          loss='exponential',
                          learning_rate=0.7)

regr2.fit(X_train, y_train)

print(regr2.score(X_test, y_test))
""" Ridge model """
ridge_regr = Ridge()
parameters_ridge = {
    'alpha': [0.25, 0.5, 0.75, 1, 1.25, 2],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag']
}

ridge_regr = GridSearchCV(ridge_regr,
                          param_grid=parameters_ridge,
                          cv=5,
                          scoring='neg_mean_squared_error')

ridge_regr.fit(X_train, y_train)

ridge_regr.best_params_
Example #31
    train_Salary, train_Salary_y = preprocess(all_cat, all_num, all_y)

    # AdaBoostRegressor
    ridge = Ridge(15)
    trainX_Sal, testX_Sal, trainy_Sal, testy_Sal = train_test_split(
        train_Salary, train_Salary_y, test_size=0.1, random_state=1)
    clf_Ada = AdaBoostRegressor(n_estimators=10, base_estimator=ridge)
    clf_Ada.fit(trainX_Sal, trainy_Sal)
    # scores = clf_Ada.score(testXF,testyF)
    y_Sal = clf_Ada.predict(testX_Sal)
    # print y_Sal
    sal_pred = scaling(y_Sal)
    print(sal_pred)
    scores_Sal_C = cross_val_score(clf_Ada, train_Salary, train_Salary_y)
    scores_Sal_CV = np.mean(scores_Sal_C)
    scores_Sal = clf_Ada.score(testX_Sal, testy_Sal)
    print('AdaBoostRegression:', scores_Sal)
    print('AdaBoostRegression_cv:', scores_Sal_CV)
    print('finished with the mean-salary prediction')

    # predict the tradable share capital in the stock data
    dataXFCA = pd.read_csv('/Users/huanghuaixian/desktop/final.csv',
                           encoding="GBK")
    data_cat_df = dataXFCA[[
        'area', 'province', 'city', 'year', 'month', 'day', 'industry'
    ]].astype(str)
    y_data = dataXFCA['fcA']
    data_num_df = dataXFCA[['gcA']]
    train, y_data = preprocess(data_cat_df, data_num_df, y_data)
    trainXF, testXF, trainyF, testyF = train_test_split(train,
                                                        y_data,
Example #32
def test_boston():
    """Check consistency on dataset boston house prices."""
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
    assert score > 0.85
Example #33
trainlabel = pd.read_csv("Produce_Data/University_data_cluster.csv")
#use different regression methods
est = AdaBoostRegressor(DecisionTreeRegressor())

col_predic = ["UniversityNo","Topic","Year","Lowest","Last_Ranking","Average_Ranking"]  # parameters used for label training
df2 = df.merge(trainlabel[col_predic],on=["UniversityNo","Topic","Year"])

df2 = df2[~np.isnan(df2.Lowest)]
df2 = df2[~(df2["Average_Ranking"] == 0)]
X = df2[["UniversityNo","Year","Topic","Lowest","Ranking_Scores","Last_Ranking","Average_Ranking"]]  # the training parameters

y = df2.Label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
est.fit(X_train,y_train)
print "----------------"
print "Prediction score: " + str(round(est.score(X_test,y_test)*1000)/10) + "%"
print "----------------"

df2['New_Label'] = est.predict(X)  # obtain the prediction label for future ranking prediction
# sum up the enrolled student numbers at or below the label; the predicted ranking numbers are mainly based on this parameter

for y in range(2011, 2016):  # no 2010 because it has no average ranking
    for t in range(2):
        s = 0
        while sum(df2.Label >= s):
            df2.loc[(df2.Year == y) & (df2.Topic == t) & (df2.Label >= s), 'Plan_Number_Total'] += \
                sum(df2.loc[(df2.Year == y) & (df2.Topic == t) & (np.round(df2.Label) == s), 'Plan_Number'])
            s += 1
			
dfsave = df2
dfsave.to_csv("Produce_Data/University.csv")
Example #34
# AdaBoost Regression
import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostRegressor
# load the diabetes datasets
dataset = datasets.load_diabetes()
# fit an AdaBoost model to the data
model = AdaBoostRegressor()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))
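For reference, continuing the snippet above: the manual MSE matches the library helper, and `model.score` is the R^2 of the same predictions.

from sklearn.metrics import mean_squared_error, r2_score

assert np.isclose(mse, mean_squared_error(expected, predicted))
assert np.isclose(model.score(dataset.data, dataset.target),
                  r2_score(expected, predicted))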
Example #35
for i in index:
    if scores2[i] < 0.88:
        list_index2.append(list_index[i])

n = len(list_index2)
b_train = True
b_test = True


for i in range(n):
    b_train = b_train & (X_train[:, index_adt] == np.unique(X_train[:, index_adt])[list_index2[i]])
    b_test = b_test & (X_test[:, index_adt] == np.unique(X_test[:, index_adt])[list_index2[i]])

reg2 = AdaBoostRegressor(RandomForestRegressor())
reg2.fit(X_train[b_train], y_train[b_train])
reg2.score(X_test[b_test], y_test[b_test])
Qt[b_train] = Qt[b_train] - reg2.predict(X_train[b_train])
Q[b_test] = Q[b_test] - reg2.predict(X_test[b_test])
Q2[b_test] = reg2.predict(X_test[b_test])

for i in range(n):
    s = "Pred/fit_adt_" + str(list_index2[i] + 1) + ".pickle"
    with open(s, 'wb') as fid:
        pickle.dump(reg2, fid)



r = AdaBoostRegressor(RandomForestRegressor())
r.fit(X_train, Qt)
r.score(X_test, Q)
Example #36
# Fit regression model
regr_dt = DecisionTreeRegressor(criterion='mse', max_depth=4)
regr_abdt = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4),
    n_estimators=300,
)

regr_dt.fit(X_train_std, y_train)
regr_abdt.fit(X_train_std, y_train)

y_test_predict_dt = regr_dt.predict(X_test_std)
y_test_predict_abdt = regr_abdt.predict(X_test_std)

Rsquare_dt = regr_dt.score(X_train_std, y_train)
mse_dt = mean_squared_error(y_test.values, y_test_predict_dt)
y_train_predict_dt = regr_dt.predict(X_train_std)

Rsquare_abdt = regr_abdt.score(X_train_std, y_train)
mse_abdt = mean_squared_error(y_test.values, y_test_predict_abdt)
y_train_predict_abdt = regr_abdt.predict(X_train_std)

# In[ ]:

# boosting modeling

# In[43]:

regr_gb = GradientBoostingRegressor(n_estimators=100, max_depth=1, loss='ls')

regr_gb.fit(X_train_std, y_train)
Example #37
File: Adaboost.py Project: camouflage/ML
if __name__ == '__main__':
    np.set_printoptions(edgeitems=5)

    # Read dataset
    data = np.genfromtxt("shuffled.csv", delimiter=',', skip_header=1, usecols=range(1, 385))
    reference = np.genfromtxt("shuffled.csv", delimiter=',', skip_header=1, usecols=(385))
    testData = np.genfromtxt("test.csv", delimiter=',', skip_header=1, usecols=range(1, 385))
    validationData = np.genfromtxt("train.csv", delimiter=',', skip_header=1, usecols=range(1, 385), max_rows=5000)
    validationReference = np.genfromtxt("train.csv", delimiter=',', skip_header=1, usecols=(385), max_rows=5000)

    numberOfTrainingData = data.shape[0]
    numberOfFeatures = data.shape[1]
    numberOfTestData = testData.shape[0]
    numberOfVldtData = validationData.shape[0]

    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
    bdt = AdaBoostRegressor(base_estimator=ExtraTreeRegressor(), n_estimators=1000)
    #bdt = RandomForestRegressor(n_estimators=50)
    #bdt = GradientBoostingRegressor()

    bdt.fit(data, reference)
    print("FINISH FITTING")
    predict = bdt.predict(testData).reshape(numberOfTestData, 1)
    score = bdt.score(validationData, validationReference)
    print(score)

    with open('adaboostResult.csv', 'w') as file:
        file.write("id,reference\n")
        for i in range(0, numberOfTestData):
            file.write("%d,%f\n" % (i, predict[i, 0]))
Example #38
def adaboost_regressor(train_data, train_label, test_data, test_label,
                       parameters):
    min_error = 10000000000
    error = []
    learn_rate = [1e-2, 1e-1, 1, 10, 100, 500, 1000]
    n_est = [20, 40, 60, 80, 100]
    comb = list(itertools.product(learn_rate, n_est))
    # print comb
    fin_learn = 0
    fin_est = 0
    for i in range(0, len(comb)):
        regr = AdaBoostRegressor(n_estimators=comb[i][1],
                                 learning_rate=comb[i][0],
                                 random_state=random_state)
        regr.fit(train_data, train_label)
        predict = regr.predict(test_data)
        predict = [[p] for p in predict]
        mse = MSE(np.array(predict), test_label)
        error.append(mse)
        # print mse[0]
        if (mse[0] < min_error):
            min_error = mse[0]
            # print comb[i]
            fin_learn = comb[i][0]
            fin_est = comb[i][1]
        else:
            continue

    plt.figure(figsize=(10, 12))
    plt.title('MSE vs (learning rate, n_estimate)')
    plt.plot(range(len(comb)), error)
    plt.xticks(np.arange(len(comb)), comb, rotation=90)
    plt.xlabel('(learning rate, n_estimate)')
    plt.ylabel('MSE')
    directory = './adaboost/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(directory + 'MSE' + parameters + '.png')
    plt.close()
    regr = AdaBoostRegressor(n_estimators=80,
                             learning_rate=1,
                             random_state=random_state)
    regr.fit(train_data, train_label)
    score = regr.score(test_data, test_label)
    predict = regr.predict(test_data)
    predict = np.array([[p] for p in predict])
    mse = MSE(np.array(predict), test_label)
    print('MSE ' + parameters + ' ' + str(mse[0]))
    df = pd.Series(predict.flatten(), index=test_label.index)
    price = pd.concat([train_label, test_label])
    plt.title('AdaBoost on ' + parameters)
    plt.plot(price[1000:-1], label='actual price')
    plt.plot(df, label='predicted price')
    plt.legend(loc='lower right')
    plt.xlabel('Dates')
    plt.ylabel('Price')
    # plt.show()
    directory = './adaboost/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(directory + parameters + '.png')
    plt.close()
    return score
Example #39
#trn2=train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trn2[feature_cols], trn2['Hazard'], random_state=1)


#fit the model and predict
# model = AdaBoostRegressor(base_estimator=RandomForestRegressor())
model = AdaBoostRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

coef = giniscore.Gini(y_pred, y_test)

print('Gini coefficient is ', coef)

model.score(X_train, y_train)
# score with 100 rows RF estimator is .92

# gini with default columns, default estimator is 0.12
# gini with 1000 rows all columns, default estimator is 0.188
# gini with 10000 rows all columns, default estimator is 0.1802
# gini with all rows all columns, default estimator is 0.12759

# gini with 100 rows RF estimator is .098
# gini with 1000 rows RF estimator is .0876

# ugh, using LassoCV 32 the score is only .19 to 21 so it must want all the columns

# benchmark is .20 , Kaggle public LB says .263387
# < 14 97.5% benchmark is .172
# < 10 90.5% benchmark is .1472
Example #40
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)
# checking r^2
from sklearn.metrics import r2_score

print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

#Adaboosting
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

#Decision
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

#gradientBoost
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
gb.score(X_test, y_test)

#KNN
Example #41
    print "train score: ", dtr.score(data_0am_train_xx,data_0am_train_yy)
    print "train error: " , np.sqrt(np.mean((data_0am_train_predy-data_0am_train_yy)**2))/nom_train
    print "test error: ",  np.sqrt(np.mean((data_0am_test_predy-data_0am_test_y)**2))/nom_test


    rng = np.random.RandomState(1)
    abr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=5),
                          n_estimators=300, random_state=rng)
    abr.fit(data_0am_train_xx,data_0am_train_yy)
    data_0am_train_predy = abr.predict(data_0am_train_xx)
    abr_train_predy = abr.predict(data_0am_train_xx)

    data_0am_test_predy = abr.predict(data_0am_test_x)
    abr_test_predy = abr.predict(data_0am_test_x)
    print "ABR report"
    print "train score: ", abr.score(data_0am_train_xx,data_0am_train_yy)
    print "train error: " , np.sqrt(np.mean((data_0am_train_predy-data_0am_train_yy)**2))/nom_train
    print "test error: ",  np.sqrt(np.mean((data_0am_test_predy-data_0am_test_y)**2))/nom_test


    # print lasso_train_predy.shape
    combine_train_predy = np.concatenate((
                                          np.atleast_2d(linear_train_predy),
                                          np.atleast_2d(lasso_train_predy),
                                          np.atleast_2d(DTR_train_predy),
                                          np.atleast_2d(svr_train_predy),
                                          np.atleast_2d(abr_train_predy)),axis=0)
    # print combine_train_predy.shape
    combine_train_predy= np.mean(combine_train_predy,axis=0)
    # print combine_train_predy.shape