Example #1
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = AdaBoostRegressor(RandomForestRegressor(n_estimators=100, max_depth=40, max_features=25), n_estimators=100)
        #self.clf_Boost = GradientBoostingRegressor( n_estimators = 500 , max_features = 20 )
        #self.clf_Regression = LinearRegression()
        

    def fit(self, X, y):
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        return self.clf.predict(X)
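A minimal usage sketch for the wrapper above. The imports and the synthetic dataset are assumptions added for illustration (the original snippet does not show them), and these hyperparameters make the fit slow:

# Hypothetical usage of the Regressor wrapper above (sketch, not from the source).
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

# max_features=25 in the wrapped forest requires at least 25 input features.
X, y = make_regression(n_samples=200, n_features=25, random_state=0)

reg = Regressor()
reg.fit(X, y)          # 100 boosting rounds, each fitting a 100-tree forest (slow)
print(reg.predict(X[:5]))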
Example #2
def train_learning_model_decision_tree_ada_boost(df):
    #code taken from sklearn
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    tree_regressor = DecisionTreeRegressor(max_depth = 6)
    ada_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), n_estimators = 500, learning_rate = 0.01, random_state = 1)

    tree_regressor.fit(X_train, y_train)
    ada_regressor.fit(X_train, y_train)

    y_pred_tree = tree_regressor.predict(X_test)
    y_pred_ada = ada_regressor.predict(X_test)
    
    mse_tree = mean_squared_error(y_test, y_pred_tree)
    mse_ada = mean_squared_error(y_test, y_pred_ada)

    mse_tree_train = mean_squared_error(y_train, tree_regressor.predict(X_train))
    mse_ada_train = mean_squared_error(y_train, ada_regressor.predict(X_train))
    
    print ("MSE tree: %.4f " %mse_tree)
    print ("MSE ada: %.4f " %mse_ada)

    print ("MSE tree train: %.4f " %mse_tree_train)
    print ("MSE ada train: %.4f " %mse_ada_train)
Example #3
def main():
    ab = AdaBoostRegressor(base_estimator=None, n_estimators=50,
                           learning_rate=1.0, loss='exponential',
                           random_state=None)

    ab.fit(X_train, y_train)

    # Evaluation on the train set
    pred_train = ab.predict(X_train)

    mse_train = mean_squared_error(y_train, pred_train)
    rmse_train = np.sqrt(mse_train)
    # Note: log_loss is a classification metric; it only makes sense here if
    # the targets are binary and the predictions lie inside (0, 1).
    logloss_train = log_loss(y_train, pred_train)

    # Evaluation on the validation set
    pred_val = ab.predict(X_val)

    mse_val = mean_squared_error(y_val, pred_val)
    rmse_val = np.sqrt(mse_val)
    logloss_val = log_loss(y_val, pred_val)

    print("RMSE train: %.4f" % rmse_train)
    print("RMSE val: %.4f" % rmse_val)
    print("log loss train: %.4f" % logloss_train)
    print("log loss val: %.4f" % logloss_val)
Example #4
File: predictor.py Project: atremblay/MLND
    def fit(self, start_date, end_date):

        for ticker in self.tickers:
            self.stocks[ticker] = Stock(ticker)

        params_ada = [{
            'n_estimators': [25, 50, 100],
            'learning_rate': [0.01, 0.1, 1, 10],
            'loss': ['linear', 'square', 'exponential']
            }]

        params = ParameterGrid(params_ada)

        # Find the split for training and CV
        mid_date = train_test_split(start_date, end_date)
        for ticker, stock in self.stocks.items():

            X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
            # X_train = self.pca.fit_transform(X_train.values)
            X_train = X_train.values
            # pdb.set_trace()
            X_cv, y_cv = stock.get_data(mid_date, end_date)
            # X_cv = self.pca.transform(X_cv.values)
            X_cv = X_cv.values

            lowest_mse = np.inf
            for i, param in enumerate(params):
                ada = AdaBoostRegressor(**param)
                ada.fit(X_train, y_train.values)
                mse = mean_squared_error(
                    y_cv, ada.predict(X_cv))
                if mse <= lowest_mse:
                    # keep track of the best score so far (the original never
                    # updated lowest_mse, so every candidate overwrote the model)
                    lowest_mse = mse
                    self.models[ticker] = ada

        return self
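ParameterGrid, used in the fit above, simply enumerates every combination of the listed values. A self-contained sketch of the same pick-the-lowest-MSE loop, with made-up data standing in for the Stock helper:

# Standalone sketch of the ParameterGrid model-selection loop above;
# the dataset and split are synthetic stand-ins, not from the source project.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid, train_test_split

X, y = make_regression(n_samples=300, n_features=10, noise=5.0, random_state=0)
X_train, X_cv, y_train, y_cv = train_test_split(X, y, random_state=0)

best_model, lowest_mse = None, np.inf
for param in ParameterGrid({'n_estimators': [25, 50, 100],
                            'learning_rate': [0.01, 0.1, 1]}):
    ada = AdaBoostRegressor(**param).fit(X_train, y_train)
    mse = mean_squared_error(y_cv, ada.predict(X_cv))
    if mse < lowest_mse:
        lowest_mse, best_model = mse, ada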
Example #5
def ada_boost_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: train
    :param train_y: target values
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values,learning curve, validation curve
    """
    ada = AdaBoostRegressor(n_estimators=5)
    if get_model:
        print "Fitting Ada..."
        ada.fit(train_x, np.log(train_y+1))
        ada_pred = np.exp(ada.predict(pred_x))-1
        Votes = ada_pred[:,np.newaxis]
        Id = np.array(review_id)[:,np.newaxis]
        # create submission csv for Kaggle
        submission_ada= np.concatenate((Id,Votes),axis=1)
        np.savetxt("submission_ada.csv", submission_ada,header="Id,Votes", delimiter=',',fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if l_curve:
        print("Working on Learning Curves")
        plot_learning_curve(AdaBoostRegressor(), "Learning curve: Adaboost", train_x, np.log(train_y + 1.0))
    if v_curve:
        print("Working on Validation Curves")
        plot_validation_curve(AdaBoostRegressor(), "Validation Curve: Adaboost", train_x, np.log(train_y + 1.0),
                              param_name="n_estimators", param_range=[2, 5, 10, 15, 20, 25, 30])
Example #6
def round2(X_df, featurelist):
    # Set parameters
    model = AdaBoostRegressor()
    y_df = X_df['target']
    n = len(y_df)

    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n, n_folds=5, shuffle=True)

    # Calculate root-mean-squared error for train/test for each fold
    for train_idx, test_idx in kf:
        X_train, X_test = X_df.iloc[train_idx, :], X_df.iloc[test_idx, :]
        # y_train, y_test = y_df[train_idx], y_df[test_idx]

        X_train, X_test = applyFeatures(X_train, X_test, featurelist)
        Xtrain_array, ytrain_array, Xtest_array, ytest_array = dfToArray(X_train, X_test)
        model.fit(Xtrain_array, ytrain_array)
        prediction = model.predict(Xtest_array)
        rmse = np.sqrt(mean_squared_error(ytest_array, prediction))
        scores.append(rmse)
        print(rmse)
        print("Finished fold")

    return scores
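The `KFold(n, n_folds=5, ...)` call above is the pre-0.18 scikit-learn API. In current releases the class lives in sklearn.model_selection, takes `n_splits`, and yields index pairs from `split()`; a sketch of the equivalent loop on made-up array data:

# Modern-API sketch of the 5-fold RMSE loop above (synthetic data).
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=200, n_features=8, random_state=0)
model = AdaBoostRegressor()

scores = []
kf = KFold(n_splits=5, shuffle=True)
for train_idx, test_idx in kf.split(X):
    model.fit(X[train_idx], y[train_idx])
    rmse = np.sqrt(mean_squared_error(y[test_idx], model.predict(X[test_idx])))
    scores.append(rmse)
print(scores)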
Example #7
def Round2(X, y):
    # Set parameters
    min_score = {}
    for loss in ['linear', 'square', 'exponential']:
        model = AdaBoostRegressor(loss=loss)
        n = len(y)

        # Perform 5-fold cross validation
        scores = []
        kf = KFold(n, n_folds=5, shuffle=True)

        # Calculate root-mean-squared error for train/test for each fold
        for train_idx, test_idx in kf:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            # score = model.score(X_test, y_test)
            scores.append(rmse)
        if len(min_score) == 0:
            min_score['loss'] = loss
            min_score['scores'] = scores
        else:
            if np.mean(scores) < np.mean(min_score['scores']):
                min_score['loss'] = loss
                min_score['scores'] = scores

        print "Loss:", loss
        print scores
        print np.mean(scores)
    return min_score
Example #8
    def predict(tour_data):

        vec = DictVectorizer()

        tour_data = get_tour_data()

        transformed = vec.fit_transform(tour_data).toarray()
        categories = vec.get_feature_names()

        y = transformed[:, [categories.index('rating')]]
        X = transformed[:, np.arange(transformed.shape[1]) != categories.index('rating')]

        reg_tree = DecisionTreeRegressor()

        rng = np.random.RandomState(1)
        adaboost_tree = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                          n_estimators=300, random_state=rng)

        reg_tree.fit(X, y)
        adaboost_tree.fit(X, y)

        # Predict
        y_1 = reg_tree.predict(X)
        y_2 = adaboost_tree.predict(X)

        return y_1, y_2
Example #9
def predict_volatility_1year_ahead(rows, day, num_days):
    """
    SUMMARY: Predict volatility 1 year into the future
    ALGORITHM:
      a) The predictor will train on all data up to exactly 1 year (252 trading days) before `day`
      b) The newest 10 days up to and including `day` will be used as the feature vector for the prediction
         i.e. if day = 0, then the feature vector for prediction will consist of days (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
              if day = 10, then the feature vector for predictor input will be days (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
    INPUT: minimum of (1 year + 10 days) of data before `day` (newest data is day=0)
  
    """

    '''enforce that `day` is in the required range'''
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    '''Compile features for fitting'''
    feature_sets = []
    value_sets = []
    for ii in range(day+num_days+252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            features += [
                float(rows[day_index][7]),
                float(rows[day_index][8]),
                float(rows[day_index][9]),
                float(rows[day_index][10]),
                float(rows[day_index][11]),
                float(rows[day_index][12]),
                float(rows[day_index][13]),
            ]
            #print("issue here: " + str(rows[day_index][0]))
        feature_sets += [features]
        value_sets += [float(rows[ii-252][9])]

    '''Create Regressor and fit'''
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=3, random_state=rng)
    regr.fit(feature_sets, value_sets)

    '''Get prediction features'''
    ii = day
    features = []
    for jj in range( num_days ):
        day_index = ii + jj   
        features += [
        float(rows[day_index][7]), 
        float(rows[day_index][8]),
        float(rows[day_index][9]), 
        float(rows[day_index][10]),
        float(rows[day_index][11]),
        float(rows[day_index][12]),
        float(rows[day_index][13]),
        ]
        
    return float(regr.predict([features]))
Example #10
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = AdaBoostRegressor(RandomForestRegressor(n_estimators=500, max_depth=78, max_features=10), n_estimators=40)

    def fit(self, X, y):
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        return self.clf.predict(X)
Example #11
class Regressor(BaseEstimator):
    def __init__(self):
        cl = RandomForestRegressor(n_estimators=10, max_depth=10, max_features=10)
        self.clf = AdaBoostRegressor(base_estimator=cl, n_estimators=100)

    def fit(self, X, y):
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        return self.clf.predict(X)
Example #12
#RandomForestClassifier
def AdaBoost(xTrain, yTrain, xTest, yTest, treeNum):
	rms = dict()
	for trees in treeNum:
		ab = AdaBoostRegressor(n_estimators=trees)
		ab.fit(xTrain, yTrain)
		yPred = ab.predict(xTest)
		rms[trees] = sqrt(mean_squared_error(yTest, yPred))

	(bestRegressor, rmse) = min(rms.items(), key=operator.itemgetter(1))

	return bestRegressor, rmse
Example #13
def performAdaBoostReg(train, test, features, output):
    """
    Ada Boost Regression
    """

    clf = AdaBoostRegressor()
    clf.fit(train[features], train[output])
    Predicted = clf.predict(test[features])
    
    plt.plot(test[output])
    plt.plot(Predicted, color='red')
    plt.show()        
    
    return mean_squared_error(test[output],Predicted), r2_score(test[output], Predicted)
Example #14
def do_adaboost(filename):
    df, Y = create_merged_dataset(filename)
    # Ideas:
    # Create a feature for accelerations and decelerations.

    # Leave the default base regressor for AdaBoost (a decision tree). Extra trees were tried with catastrophic results.
    #ada = AdaBoostRegressor(n_estimators=350, learning_rate=0.05)
    ada = AdaBoostRegressor(n_estimators=500, learning_rate=1)
    
    #X = df.drop(['driver', 'trip', 'prob_points', 'prob_speed', 'prob_distance', 'prob_acceleration'], 1)
    X = df.drop(['driver', 'trip'], 1)
    ada.fit(X, Y)
    probs = ada.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
Example #15
def test_sparse_regression():
    """Check regression with sparse input."""

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=100, n_features=50, n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix)
                   for t in types])
Example #16
def predict_volatility_1year_ahead(rows, day):
    """
    SUMMARY: Predict volatility 1 year into the future
    ALGORITHM:
      a) The predictor will train on all data up to exactly 1 year (252 trading days) before `day`
      b) The newest 10 days up to and including `day` will be used as the feature vector for the prediction
         i.e. if day = 0, then the feature vector for prediction will consist of days (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
              if day = 10, then the feature vector for predictor input will be days (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
    INPUT: minimum of (1 year + 10 days) of data before `day` (newest data is day=0)
  
    """

    num_days = 10

    # enforce that `day` is in the required range
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    # compile features (X) and values (Y) 
    feature_sets = []
    value_sets = []
    value_sets_index = []
    for ii in range(day+252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            features += [float(rows[day_index][1]), float(rows[day_index][2]), float(rows[day_index][3]), float(rows[day_index][5]), float(rows[day_index][7]), float(rows[day_index][8]), float(rows[day_index][9])]
        feature_sets += [features]
        value_sets += [float(rows[ii-252][9])]
        value_sets_index.append([ii-252])
             
    # fit
    #regr = linear_model.Lasso(alpha=0.01,fit_intercept=False,normalize=False,max_iter=10000000)   # they call lambda alpha
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=4, random_state=rng)
    #regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=2, random_state=rng)
    #regr =  DecisionTreeRegressor(max_depth=4)
    regr.fit(feature_sets, value_sets)

    #print("Adaboost weights:", regr.estimator_weights_)

    ii = day
    features = []
    for jj in range( num_days ):
        day_index = ii + jj + 252
        features += [float(rows[day_index][1]), float(rows[day_index][2]), float(rows[day_index][3]), float(rows[day_index][5]), float(rows[day_index][7]), float(rows[day_index][8]), float(rows[day_index][9])]

    return float(regr.predict([features]))
Example #17
def ada_boost(df, significant_cols, target, cat_cols, num_cols):
    ss = StandardScaler()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X = df[significant_cols]
    y = df[target]
    base = DecisionTreeRegressor(max_depth=3, random_state=0)
    estimator = AdaBoostRegressor(base_estimator=base, random_state=0)
    params = {
        'n_estimators': np.arange(5, int(X.shape[0] * 0.1)),
        'learning_rate': np.arange(0.1, 1.1, 0.1),
        'loss': ['linear', 'square', 'exponential'],
    }
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)
    X_train_cat = ohe.fit_transform(X_train[cat_cols])
    X_train_num = ss.fit_transform(X_train[num_cols])
    X_test_cat = ohe.transform(X_test[cat_cols])
    X_test_num = ss.transform(X_test[num_cols])
    train_data = np.c_[X_train_cat, X_train_num]
    test_data = np.c_[X_test_cat, X_test_num]
    gs = GridSearchCV(estimator, params, scoring='r2', cv=3)
    gs.fit(train_data, y_train)
    estimator = gs.best_estimator_
    r2_cv_scores = cross_val_score(estimator,
                                   train_data,
                                   y_train,
                                   scoring='r2',
                                   cv=3,
                                   n_jobs=-1)
    rmse_cv_scores = cross_val_score(estimator,
                                     train_data,
                                     y_train,
                                     scoring='neg_root_mean_squared_error',
                                     cv=3,
                                     n_jobs=-1)
    params = estimator.get_params()
    r2 = np.mean(r2_cv_scores)
    rmse = np.abs(np.mean(rmse_cv_scores))
    r2_variance = np.var(r2_cv_scores, ddof=1)
    rmse_variance = np.abs(np.var(rmse_cv_scores, ddof=1))
    estimator.fit(train_data, y_train)
    y_pred = estimator.predict(test_data)
    r2_validation = r2_score(y_test, y_pred)
    rmse_validation = np.sqrt(mean_squared_error(y_test, y_pred))
    return r2, rmse, r2_variance, rmse_variance, r2_validation, rmse_validation, params
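The manual OneHotEncoder/StandardScaler bookkeeping above can also be expressed as a ColumnTransformer inside a Pipeline, so the grid search refits the preprocessing per fold. A sketch, with placeholder column names where the original passes cat_cols and num_cols:

# Sketch: the preprocessing above, folded into a Pipeline (column names are
# placeholders; the original function receives them as cat_cols / num_cols).
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

cat_cols, num_cols = ['cat_feature'], ['num_feature']  # hypothetical names

pre = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), cat_cols),
    ('num', StandardScaler(), num_cols),
])
pipe = Pipeline([
    ('pre', pre),
    ('ada', AdaBoostRegressor(DecisionTreeRegressor(max_depth=3, random_state=0),
                              random_state=0)),
])
# GridSearchCV(pipe, {'ada__n_estimators': [50, 100]}, scoring='r2', cv=3)
# would then tune the boosting stage without leaking test-fold statistics.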
Example #18
def train_predict_loan11_2_20(trainData,
                              predictData,
                              maxdepth=50,
                              goaladdress=None):
    trainData = pd.read_csv(trainData)
    X_feature = np.array(trainData.iloc[:, 2:])
    X_uidsum = np.array(trainData.iloc[:, 0:2])
    # X_train, X_test, y_train, y_test = train_test_split(X_feature,X_uidsum, test_size=0.3)
    X_train = X_feature
    y_train = X_uidsum[:, 1]

    rng = np.random.RandomState(1)
    regAdaBoost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=maxdepth),
                                    n_estimators=1000,
                                    random_state=rng)
    regRandomForest = RandomForestRegressor(max_depth=maxdepth,
                                            n_estimators=1000,
                                            random_state=rng)
    regXGB = xgb.XGBRegressor(max_depth=maxdepth,
                              n_estimators=1000,
                              random_state=1)
    regBagg = BaggingRegressor(DecisionTreeRegressor(max_depth=maxdepth),
                               n_estimators=1000,
                               random_state=rng)

    regAdaBoost.fit(X_train, y_train)
    regRandomForest.fit(X_train, y_train)
    regXGB.fit(X_train, y_train)
    regBagg.fit(X_train, y_train)

    predictData = pd.read_csv(predictData)
    X_test_feature = np.array(predictData.iloc[:, 1:])
    X_test_uidsum = np.array(predictData.iloc[:, 0:1])

    y1 = regAdaBoost.predict(X_test_feature)
    y2 = regRandomForest.predict(X_test_feature)
    y3 = regXGB.predict(X_test_feature)
    y4 = regBagg.predict(X_test_feature)

    pd1 = pd.DataFrame(X_test_uidsum, columns=["uid"])
    pd2 = pd.DataFrame(y1, columns=["AdaBoost_pre"])
    pd3 = pd.DataFrame(y2, columns=["RandomFrost_pre"])
    pd4 = pd.DataFrame(y3, columns=["XGB"])
    pd5 = pd.DataFrame(y4, columns=["Bagg"])
    pdd = pd.concat([pd1, pd2, pd3, pd4, pd5], axis=1)
    # pdd.to_csv(goaladdress, index=False)
    return pdd
Example #19
def test_sparse_regression():
    """Check regression with sparse input."""
    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""
        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=15,
                                    n_features=50,
                                    n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [
            csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix
    ]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
Example #20
    def AdaBoost(self, args):  ## Adaptive Boosting

        logger.info("Running Adaptive Boosting ... ")
        
        # Initialize the AdaBoost regressor
        if args.predictor.lower() == 'classifier':
            from sklearn.tree import DecisionTreeClassifier as dtree
            from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
        elif args.predictor.lower() == 'regressor':
            from sklearn.tree import DecisionTreeRegressor as dtree
            from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor

        dtree_model = dtree(max_depth=3)
        
        if args.predictor.lower() == 'classifier':

            # The classifier branch should use AdaBoostClassifier (the original
            # called AdaBoostRegressor here, ignoring the imported classifier).
            ada_model = AdaBoostClassifier(base_estimator=dtree_model,
                                           n_estimators=20000,
                                           random_state=23
                                           )

        elif args.predictor.lower() == 'regressor':
            ada_model = AdaBoostRegressor(base_estimator=dtree_model,
                                          n_estimators=20000,
                                          loss='exponential',
                                          random_state=23,
                                          learning_rate=0.1
                                          )

        # Fit ada to the training set
        ada_model.fit(self.X_train, self.y_train)

        # Get the predicted values 
        self.y_pred = ada_model.predict(self.X_data)

        ## The inverse logit transform, \mathrm{invlogit}(x) = \frac{1}{1 + \exp(-x)}, is given in R by: plogis(x)
        if args.predictor.lower() == 'regressor':
            self.y_pred = logistic.cdf(self.y_pred) 
    
        self.data['boosting_score'] = self.y_pred
        self.model = ada_model
        return self
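The inverse-logit applied to the regressor output at the end is the standard logistic sigmoid; scipy also exposes it directly as expit. A small reference check (added here for illustration, not part of the source):

# logistic.cdf(x) and expit(x) both compute 1 / (1 + exp(-x)).
import numpy as np
from scipy.special import expit
from scipy.stats import logistic

x = np.array([-2.0, 0.0, 2.0])
assert np.allclose(logistic.cdf(x), expit(x))
assert expit(0.0) == 0.5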
Example #21
    def test_onnxt_iris_adaboost_regressor_dt(self):
        iris = load_iris()
        X, y = iris.data, iris.target
        X_train, X_test, y_train, __ = train_test_split(X, y, random_state=11)
        y_train = y_train.astype(numpy.float32)
        clr = AdaBoostRegressor(
            base_estimator=DecisionTreeRegressor(max_depth=3), n_estimators=3)
        clr.fit(X_train, y_train)
        X_test = X_test.astype(numpy.float32)
        X_test = numpy.vstack([X_test[:3], X_test[-3:]])
        res0 = clr.predict(X_test).astype(numpy.float32)

        model_def = to_onnx(clr, X_train.astype(numpy.float32))

        oinf = OnnxInference(model_def, runtime='python')
        res1 = oinf.run({'X': X_test})
        self.assertEqualArray(res0, res1['variable'].ravel())
Example #22
def main():
    # Decision tree regressor with the AdaBoost algorithm

    # Load the housing-price dataset; each data point has 13 input features
    housing_data = datasets.load_boston()
    # .data holds the input features, .target the corresponding prices
    x, y = shuffle(housing_data.data, housing_data.target, random_state=7)
    # Split into training and test sets
    num_training = int(0.8 * len(x))
    x_train, y_train = x[:num_training], y[:num_training]
    x_test, y_test = x[num_training:], y[num_training:]

    # Fit a decision tree regression model
    dt_regressor = DecisionTreeRegressor(max_depth=4)
    dt_regressor.fit(x_train, y_train)
    # Fit a decision tree regression model boosted with AdaBoost
    ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                     n_estimators=400,
                                     random_state=7)
    ab_regressor.fit(x_train, y_train)

    # Evaluate the decision tree regressor
    y_pred_dt = dt_regressor.predict(x_test)
    mse = mean_squared_error(y_test, y_pred_dt)
    evs = explained_variance_score(y_test, y_pred_dt)
    print("\n###  Decision Tree performance ####")
    print("Mean squared error =", round(mse, 2))
    print("Explained variance score = ", round(evs, 2))

    # Performance after the AdaBoost improvement
    y_pred_ab = ab_regressor.predict(x_test)
    mse = mean_squared_error(y_test, y_pred_ab)
    evs = explained_variance_score(y_test, y_pred_ab)
    print("\n###  AdaBoost performance ####")
    print("Mean squared error =", round(mse, 2))
    print("Explained variance score = ", round(evs, 2))

    # Plot relative feature importances
    #plt.figure(figsize=(10, 8), dpi=100)  # set figure size and resolution
    plot_feature_importances(dt_regressor.feature_importances_,
                             'Decision Tree regressor',
                             housing_data.feature_names, 1)
    plot_feature_importances(ab_regressor.feature_importances_,
                             'AdaBoost regressor', housing_data.feature_names,
                             2)
Example #23
def ada_boost_regressor(train_x,
                        train_y,
                        pred_x,
                        review_id,
                        v_curve=False,
                        l_curve=False,
                        get_model=True):
    """
    :param train_x: train
    :param train_y: target values
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values,learning curve, validation curve
    """
    ada = AdaBoostRegressor(n_estimators=5)
    if get_model:
        print "Fitting Ada..."
        ada.fit(train_x, np.log(train_y + 1))
        ada_pred = np.exp(ada.predict(pred_x)) - 1
        Votes = ada_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        # create submission csv for Kaggle
        submission_ada = np.concatenate((Id, Votes), axis=1)
        np.savetxt("submission_ada.csv",
                   submission_ada,
                   header="Id,Votes",
                   delimiter=',',
                   fmt="%s, %0.2f",
                   comments='')
    # plot validation and learning curves
    if l_curve:
        print("Working on Learning Curves")
        plot_learning_curve(AdaBoostRegressor(), "Learning curve: Adaboost",
                            train_x, np.log(train_y + 1.0))
    if v_curve:
        print("Working on Validation Curves")
        plot_validation_curve(AdaBoostRegressor(),
                              "Validation Curve: Adaboost",
                              train_x,
                              np.log(train_y + 1.0),
                              param_name="n_estimators",
                              param_range=[2, 5, 10, 15, 20, 25, 30])
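The log(train_y + 1) / exp(pred) - 1 round trip above is the usual log transform for skewed count targets; numpy's log1p and expm1 are the numerically safer spellings of the same pair. A small equivalence check (illustrative, not from the source):

# np.log1p / np.expm1 match the log(y + 1) and exp(p) - 1 used above.
import numpy as np

y = np.array([0.0, 3.0, 250.0])
assert np.allclose(np.log(y + 1), np.log1p(y))
assert np.allclose(np.expm1(np.log1p(y)), y)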
Example #24
def Bayesian(dataTrain, dataTest, TestCol, outputpath):
    modelName = 'BayesianRidge'
    s = datetime.datetime.now()

    X_train = dataTrain.drop('Rain', axis=1)
    y_train = dataTrain['Rain']  # .loc[dataTrain['Rain'] >= 0, 'Rain']

    X_test = dataTest.drop('Rain', axis=1)
    y_test = dataTest['Rain']  # .loc[dataTest['Rain'] >= 0, 'Rain']

    X_train_Nodate = X_train[TestCol]
    X_test_Nodate = X_test[TestCol]

    model = AdaBoostRegressor(linear_model.BayesianRidge(), n_estimators=300)
    model.fit(X_train_Nodate, y_train)
    y_predict = model.predict(X_test_Nodate)

    print("Baysian performance:")
    MAE = round(sm.mean_absolute_error(y_test, y_predict), 2)
    MSE = round(sm.mean_squared_error(y_test, y_predict), 2)
    MedienAE = round(sm.median_absolute_error(y_test, y_predict), 2)
    R2score = round(sm.r2_score(y_test, y_predict), 2)
    print("Mean absolute error =", MAE)
    print("Mean squared error =", MSE)
    print("Median absolute error =", MedienAE)
    print("Explained variance score =",
          round(sm.explained_variance_score(y_test, y_predict), 2))
    print("R2 score =", R2score)

    y_pred = numpy.round(y_predict, 1)
    X_test.loc[:, 'Predict_Rain'] = pandas.Series(y_pred, index=X_test.index)

    outFile = pandas.DataFrame(X_test[['Date', 'Lat', 'Long', 'Predict_Rain']])
    # print(outFile)
    # print(dataTrain.loc[dataTrain['Rain'] > 0, 'Rain'])
    outFile.to_csv(outputpath + 'Result_predict_{0}.csv'.format(modelName),
                   index=False)
    text = open(outputpath + 'Result_{0}.txt'.format(modelName), mode='a')
    text.write("Mean absolute error ={0}\n".format(MAE))
    text.write("Mean squared error ={0}\n".format(MSE))
    text.write("Median absolute error ={0}\n".format(MedienAE))
    text.write("R2 score ={0}\n".format(R2score))
    e = datetime.datetime.now()
    text.write("Total Time:{0}".format(e - s))
    text.close()
Example #25
def test_loansum_predict_0_1(trainpath ,trainData,goalpath ,goalfile):
    print(str(trainData))
    trainData = pd.read_csv(trainpath+trainData)

    X_feature = np.array(trainData.iloc[:, 0:4])
    X_goal = np.array(trainData.iloc[:, 4:5])

    x_train, x_test, y_train, y_test = train_test_split(X_feature, X_goal, train_size=0.7)
    y_train = y_train.reshape(y_train.shape[0],)
    maxdepth = 500
    assert y_train.shape[0] == x_train.shape[0], "feature and label row counts do not match"
    rng = np.random.RandomState(1)
    regAdaBoost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=maxdepth), n_estimators=1000, random_state=rng)
    regRandomForest = RandomForestRegressor(max_depth=maxdepth, n_estimators=1000, random_state=rng)
    regGrad = GradientBoostingRegressor(loss="ls", n_estimators=1000, max_depth=maxdepth)

    regAdaBoost.fit(x_train[:, 1:4], y_train)
    regRandomForest.fit(x_train[:, 1:4], y_train)
    regGrad.fit(x_train[:, 1:4], y_train)

    y1 = regAdaBoost.predict(x_test[:, 1:4])
    y2 = regRandomForest.predict(x_test[:, 1:4])
    y3 = regGrad.predict(x_test[:, 1:4])

    y123 = [y1, y2, y3]
    doc = ["Ada", "Ran", "Grad"]

    for y, d in zip(y123, doc):
        ysum = 0
        for i in range(x_test.shape[0]):
            ysum = ysum + np.power(y[i] - y_test[i, 0], 2)
        print("{0}'s RMSE :{1}".format(d, math.sqrt(ysum / x_test.shape[0])))
    meansum = 0
    for i in range(x_test.shape[0]):
        meansum = meansum + np.power((x_test[i, 1] + x_test[i, 2] + x_test[i, 3]) / 3 - y_test[i, 0], 2)
    print("meansum's RMSE:{0}".format(math.sqrt(meansum / x_test.shape[0])))
    print()
    pduid = pd.DataFrame(x_test[:, 0], columns=["uid"])
    pd1 = pd.DataFrame(y_test, columns=["goal"])
    pd2 = pd.DataFrame(y1, columns=["AdaBoost_pre"])
    pd3 = pd.DataFrame(y2, columns=["RandomForest_pre"])
    pd4 = pd.DataFrame(y3, columns=["Grad"])
    pdd = pd.concat([pduid, pd1, pd2, pd3, pd4], axis=1)
    pdd.to_csv(goalpath + goalfile, index=False)
Example #26
def AdaBoostRegr(train,test, labels, ground_truth):

    params = {'n_estimators':[50,100], 'learning_rate':[0.01,0.05,0.1,0.3,1],
            'loss':['linear','square','exponential']}

    #USE CV to find opt parameters
    #adaBoostRS = RandomizedSearchCV(AdaBoostRegressor(DecisionTreeRegressor()), param_distributions=params,
            #cv = 10, n_iter=50, n_jobs=-1)
    adaBoostRS = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=50)
    adaBoostRS = adaBoostRS.fit(train, labels)
    print(adaBoostRS.feature_importances_)
    print(labels)
    predictions = adaBoostRS.predict(test)
    score = RegPrediction(ground_truth, predictions)

    return score
Example #27
def recspre(exstr, predata, datadict, zhe, count=100):
    tree, te = exstr.split('-')
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=int(te)),
                              n_estimators=int(tree),
                              learning_rate=0.8)
    model.fit(datadict[zhe]['train'][:, :-1], datadict[zhe]['train'][:, -1])

    # Predict
    yucede = model.predict(predata[:, :-1])
    # For ease of display, pick 100 samples to show
    zongleng = np.arange(len(yucede))
    randomnum = np.random.choice(zongleng, count, replace=False)

    yucede_se = list(np.array(yucede)[randomnum])

    yuce_re = list(np.array(predata[:, -1])[randomnum])

    # Compare predictions with actual values
    plt.figure(figsize=(17, 9))
    plt.subplot(2, 1, 1)
    plt.plot(list(range(len(yucede_se))), yucede_se, 'r--', label='predicted', lw=2)
    plt.scatter(list(range(len(yuce_re))),
                yuce_re,
                c='b',
                marker='.',
                label='actual',
                lw=2)
    plt.xlim(-1, count + 1)
    plt.legend()
    plt.title('Predicted vs. actual values [max trees: %d]' % int(tree))

    plt.subplot(2, 1, 2)
    plt.plot(list(range(len(yucede_se))),
             np.array(yuce_re) - np.array(yucede_se),
             'k--',
             marker='s',
             label='actual - predicted',
             lw=2)
    plt.legend()
    plt.title('Relative error between predicted and actual values')

    plt.savefig(r'C:\Users\GWT9\Desktop\duibi.jpg')
    return 'Prediction vs. actual comparison complete'
Example #28
def round1(X, y):
    # Set parameters
    model = AdaBoostRegressor()
    n = len(y)

    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n, n_folds=5, shuffle=True)

    # Calculate root-mean-squared error for train/test for each fold
    for train_idx, test_idx in kf:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, prediction))
        scores.append(rmse)

    return scores
Example #29
File: calc_svm_imf.py Project: fndjjx/emd
def svm_smooth(data, residual_imf, period):
    train_data = []
    label = []
    for i in range(period, len(residual_imf) - 20):
        tmp = data[i - period:i + 1]
        train_data.append(tmp)
        label.append(residual_imf[i])

    rng = np.random.RandomState(1)
    clf = AdaBoostRegressor(svm.SVR(), n_estimators=1, random_state=rng)
    clf.fit(train_data, label)
    smooth_data = []
    for i in range(len(data)):
        if i<=period:
            smooth_data.append(data[i])
        else:
            smooth_data.append(clf.predict([data[i-period:i+1]])[0])

    return smooth_data
Example #30
File: code.py Project: nilichen/ML4Trading
def train_model(training, testing, window=5, n=5):
	X_train, y_train = prepare_data(training)
	X_test, y_test = prepare_data(testing)
	rf = RandomForestRegressor()
	rf.fit(X_train, y_train)
	predrf = rf.predict(X_test)
	print("mse for random forest regressor: ", mean_squared_error(predrf, y_test))

	gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
	gb.fit(X_train, y_train)
	predgb = gb.predict(X_test)
	print("mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test))
	## plot feature importance using GBR results
	fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility'])
	fx_imp /= fx_imp.max()  # normalize
	fx_imp = fx_imp.sort_values()
	ax = fx_imp.plot(kind='barh')
	fig = ax.get_figure()
	fig.savefig("output/feature_importance.png")

	adb = AdaBoostRegressor(DecisionTreeRegressor())
	adb.fit(X_train, y_train)
	predadb = adb.predict(X_test)
	print("mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test))

	scale = StandardScaler()
	scale.fit(X_train)
	X_trainscale = scale.transform(X_train)
	X_testscale = scale.transform(X_test)

	knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5)
	knn.fit(X_trainscale, y_train)
	predknn = knn.predict(X_testscale)
	print("mse for bagging knn regressor: ", mean_squared_error(predknn, y_test))

	pred_test = 0.1*predrf + 0.2*predgb + 0.1*predadb + 0.6*predknn
	print("mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test))
	result = testing.copy()
	# .ix is gone from modern pandas; use .loc with positional index slices
	result.loc[result.index[5:-5], 'trend'] = pred_test
	result.loc[result.index[10:], 'pred'] = pred_test * result['IBM'].iloc[5:-5].values
	result.loc[result.index[:-5], 'pred_date'] = result.index[5:]

	return result
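The hand-weighted blend (0.1/0.2/0.1/0.6) at the end can also be written with scikit-learn's VotingRegressor, which fits the base models and returns the weighted average of their predictions; a sketch under the assumption that the same four models are wanted (the scaled KNN bagger becomes a small Pipeline):

# Sketch: the manual weighted ensemble above as a VotingRegressor.
from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor,
                              GradientBoostingRegressor, RandomForestRegressor,
                              VotingRegressor)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

blend = VotingRegressor(
    estimators=[
        ('rf', RandomForestRegressor()),
        ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)),
        ('adb', AdaBoostRegressor(DecisionTreeRegressor())),
        ('knn', Pipeline([('scale', StandardScaler()),
                          ('bag', BaggingRegressor(KNeighborsRegressor(n_neighbors=10),
                                                   max_samples=0.5, max_features=0.5))])),
    ],
    weights=[0.1, 0.2, 0.1, 0.6],
)
# blend.fit(X_train, y_train); blend.predict(X_test) reproduces the weighted
# average, since VotingRegressor normalizes the weights over the predictions.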
Example #31
def boosting(X, y, k_cv):
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    regr = AdaBoostRegressor(base_estimator=SVR(C=40, gamma=0.01),
                             random_state=319,
                             n_estimators=40,
                             learning_rate=0.01,
                             loss="square")
    valid_split = kfold.split(y)
    for i in range(k_cv):
        split_index = next(valid_split)
        test_index = split_index[1]
        y_test = y[test_index]
        trainval_index = split_index[0]
        X_trainval = X[trainval_index, :]
        X_test = X[test_index, :]
        y_trainval = y[trainval_index]
        regr.fit(X_trainval, y_trainval)
        print((regr.score(X_trainval, y_trainval))**0.5)
        test_pre = regr.predict(X_test)
        print("accuracy: ", (r_2(y_test, test_pre))**0.5)
Example #32
class gaussProcess_classifier(Classifier):
    def __init__(self,
                 ticker,
                 inputSize=5,
                 binary=True,
                 risk=0.5,
                 numTrainDays=300,
                 adaboost=False):
        self.type = 'Gaussian Process'
        self.ticker = ticker
        self.days = inputSize
        self.inputSize = inputSize
        self.binary = binary
        self.risk_thresh = risk
        self.adaboost = adaboost
        self.numTrainDays = numTrainDays
        if binary:
            self.clf = GaussianProcessClassifier()
        else:
            self.clf = GaussianProcessRegressor()
            if adaboost:
                self.clf = AdaBoostRegressor(base_estimator=self.clf,
                                             n_estimators=100)

    def trainClf(self, endDay=date.today(), numTrainDays=100):
        X, Y = self.processData(endDay, self.numTrainDays)
        self.fit(X, Y)

    def predict(self, inputArray):
        inputArray = np.array(inputArray)
        inputArray.reshape([1, -1])

        if self.binary:
            pred = self.clf.predict_proba(inputArray)
            pred = (np.array(pred)[:, 1] > self.risk_thresh) * 1
        else:
            pred = self.clf.predict(inputArray)
        return pred

    def fit(self, X, Y):
        self.clf.fit(X, Y)
Example #33
def adaBoost(X_train, X_test, y_train, y_test, Xscaler, yscaler):

    # Fit regression model
    rng = np.random.RandomState(1)
    regr_1 = DecisionTreeRegressor(max_depth=40)

    regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=40),
                               n_estimators=3000,
                               random_state=rng)

    regr_1.fit(X_train, y_train)
    regr_2.fit(X_train, y_train)

    # Predict
    y_1 = regr_1.predict(X_test)
    y_2 = regr_2.predict(X_test)

    y_1 = yscaler.inverse_transform(np.array(y_1).reshape(-1, 1))
    y_2 = yscaler.inverse_transform(np.array(y_2).reshape(-1, 1))

    return y_2
Example #34
def predict_solution(X_train, y_train, X_test):
    """
    This method uses prepared data and AdaBoost including Decision tree to generate
    predictions from the given test set.
    """

    # Normalize data
    scaler = MinMaxScaler()
    scaler.fit(X_train.values)
    X_train = scaler.transform(X_train.values)
    X_test = scaler.transform(X_test.values)

    # Build and fit chosen model
    model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
        max_depth=30, criterion='friedman_mse'),
                              n_estimators=50,
                              learning_rate=0.5,
                              loss='square')
    model.fit(X_train, y_train)

    return model.predict(X_test)
Example #35
class k_meansCluster(Classifier):
    def __init__(self,ticker,inputSize=5, binary=True, adaboost=False):
        self.type = 'K-Means'
        self.ticker=ticker
        self.days=inputSize
        self.inputSize = inputSize
        self.binary=binary
        self.adaboost = adaboost
        self.clf = KMeans(n_clusters=2, random_state=0)
        if self.adaboost: self.clf=AdaBoostRegressor(base_estimator=self.clf, n_estimators=100)
    

    def predict(self, inputArray):
        inputArray = np.array(inputArray)
        inputArray.reshape([1,-1])
        pred = self.clf.predict(inputArray)
        return pred


    def fit(self, X, Y):
        self.clf.fit(X,Y)
Example #36
def test_my_select_feature_train21_23():
    filename = ("train_data_loan11_21.csv", "train_data_loan11_22.csv",
                "train_data_loan11_23.csv")
    maxfeaturenum = 20
    #
    trainData = np.zeros((1, maxfeaturenum + 2))
    print(trainData.shape)
    # print(trainData.shape)
    path = "./temporaryData/"
    filename21 = ("train_data_loan11_21.csv", )
    for fn in filename:
        Datarray, Datauid = feature_Select_del_similar(path, filename=fn)
        trainDatatem = np.array(np.concatenate([Datauid, Datarray], axis=1))
        trainData = np.concatenate([trainData, trainDatatem])
    trainData = np.delete(trainData, 0, 0)
    print(trainData.shape)

    X_train, X_test, y_train, y_test = train_test_split(trainData[:, 2:],
                                                        trainData[:, 0:2],
                                                        test_size=0.3)
    print(X_train.shape)
    print(y_test.shape)
    print(y_train[:, 1])
    maxdepth = 20
    rng = np.random.RandomState(1)
    reg2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=maxdepth),
                             n_estimators=300,
                             random_state=rng)
    reg2.fit(X_train, y_train[:, 1])

    y2 = reg2.predict(X_test)

    columns = []
    for num1 in range(maxfeaturenum):
        columns.append(str("loan_amount%d" % num1))

    pd1 = pd.DataFrame(y_test, columns=["uid", "goal"])
    pd2 = pd.DataFrame(y2, columns=["pre"])
    pdd = pd.concat([pd1, pd2], axis=1)
    pdd.to_csv(path + "train_data_loan11_21_fs.csv", index=False)
Example #37
def xgb_train(x_train, x_label, x_test):
    model = 'xgb'
    #model = 'adaboost'
    #if model.count('xgb') >0:
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.005  # [0,1]
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.7
    params["scale_pos_weight"] = 1.0
    params["silent"] = 1
    params["max_depth"] = 9
    if config.nthread > 1:
        params["nthread"] = 1

    num_rounds = 10000

    xgtrain = xgb.DMatrix(x_train, label=x_label)
    xgval = xgb.DMatrix(x_test)

    #train using early stopping and predict
    watchlist = [(xgtrain, "train")]
    #model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120, feval=gini_metric)
    model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
    pred1 = model.predict(xgval)

    #clf = RandomForestRegressor()
    #clf = LogisticRegression()
    #clf = GradientBoostingRegressor()
    clf = AdaBoostRegressor(ExtraTreesRegressor(max_depth=9), n_estimators=200)
    clf.fit(x_train, x_label)
    pred2 = clf.predict(x_test)

    #pred = pred1 * pred2 / (pred1 + pred2)
    #pred = 0.7 * (pred1**0.01) + 0.3 * (pred2**0.01)
    #pred = (pred1.argsort() + pred2.argsort()) / 2
    pred = 0.6 * pred1 + 0.4 * pred2

    return pred
Example #38
def train_and_predict_adab_stacked_gbr (train, labels, test, feature_names = None) :
	" Attmept with SVR ... "
	print ("Training ADABoost with GBR as base model")
	t0 = time.perf_counter()
	if (gridSearch) :
		params_dict = {'adab__learning_rate' : [0.1, 0.3]} 
		#model = GridSearchCV(regr, params_dict, n_jobs = 3, cv = kfObject, verbose = 10, scoring = 'mean_squared_error')
	else :
		base =  GradientBoostingRegressor(random_state = randomState, learning_rate = 0.1, n_estimators = 1500, max_depth = 6, subsample = 0.95, max_features = 1, verbose = 10)
		model = AdaBoostRegressor(random_state = randomState, base_estimator = base, n_estimators = 3, learning_rate = 0.005)

	model.fit(train, labels)
	print ("Model fit completed in %.3f sec " %(time.clock() - t0))

	if (gridSearch) :
		print ("Best estimator: ", model.best_estimator_)
		print ("Best MSLE scores: %.4f" %(model.best_score_))
		print ("Best RMSLE score: %.4f" %(math.sqrt(-model.best_score_)))
	else :
		float_formatter = lambda x: "%.4f" %(x)
		print ("Feature importances: ", sorted(zip([float_formatter(x) for x in model.feature_importances_], feature_names), reverse=True))
	
	return model.predict(test)
Example #39
def test_regression_toy():
    # Check regression on a toy dataset.
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)
Example #40
X, y = shuffle(boston.data, boston.target)
offset = int(0.7*len(X))
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# We will vary the number of base learners from 2 to 300
max_learners = arange(2, 300)
train_err = zeros(len(max_learners))
test_err = zeros(len(max_learners))

for i, l in enumerate(max_learners):
    # Set up an AdaBoost regression learner with l base learners
    regressor = AdaBoostRegressor(n_estimators=l)

    # Fit the learner to the training data
    regressor.fit(X_train, y_train)

    # Find the MSE on the training set
    train_err[i] = mean_squared_error(y_train, regressor.predict(X_train))
    # Find the MSE on the testing set
    test_err[i] = mean_squared_error(y_test, regressor.predict(X_test))

# Plot training and test error as a function of the number of base learners
pl.figure()
pl.title('Boosting: Performance vs Number of Learners')
pl.plot(max_learners, test_err, lw=2, label = 'test error')
pl.plot(max_learners, train_err, lw=2, label = 'training error')
pl.legend()
pl.xlabel('Number of Learners')
pl.ylabel('Mean Squared Error')
pl.show()
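Refitting a fresh model for every entry in max_learners repeats almost all of the work; a single fitted AdaBoostRegressor can emit per-round predictions through staged_predict. A sketch of the same train/test error curves from one fit:

# Single-fit alternative to the loop above, using staged_predict (sketch;
# X_train/X_test/y_train/y_test as prepared earlier in this example).
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error

regressor = AdaBoostRegressor(n_estimators=300).fit(X_train, y_train)
train_err = np.array([mean_squared_error(y_train, p)
                      for p in regressor.staged_predict(X_train)])
test_err = np.array([mean_squared_error(y_test, p)
                     for p in regressor.staged_predict(X_test)])
# train_err[i] / test_err[i] are the errors after i + 1 boosting rounds.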
Example #41
def test_regression_toy():
    """Check classification on a toy dataset."""
    clf = AdaBoostRegressor()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
Example #42
    month=i%12
    Train_X.append([month,i//12,1 if month==0 else 0,
                   1 if month==1 else 0,1 if month==2 else 0,
                   1 if month==3 else 0,1 if month==4 else 0,
                   1 if month==5 else 0,1 if month==6 else 0,
                   1 if month==7 else 0,1 if month==8 else 0,
                   1 if month==9 else 0,1 if month==10 else 0,
                   1 if month==11 else 0,1 if month==12 else 0])




Test_X = Train_X[-12:]
Train_X = Train_X[:-12]
clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=3), n_estimators=37, learning_rate=2).fit(Train_X, Train_Y)
Test_Y = clf.predict(Test_X) - 90000


if local:
    filename = "SampleOutput.txt"
    f = open(filename)
    dtot = 0.0
    k = 0
    for x in Test_Y:
        actual = int(f.readline())
        d = (abs(actual - int(x)) / float(actual)) / 12
        print(int(x), actual, d)
        dtot += d
    print(2.5 * max(40 - (dtot * 100), 0))
else:
    for x in Test_Y:
Example #43
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 25 17:19:04 2017

@author: carrey
"""

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
import matplotlib.pyplot as plt
from feature_format import feature_format

x_train, x_test, y_train, y_test = feature_format(task = 2)

rng = np.random.RandomState(1)
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 21),
                         n_estimators=11, random_state = rng)

regr.fit(x_train, y_train)

y_pred = regr.predict(x_test)

mape = np.mean(np.abs((y_pred - y_test)/y_test))

print(x_test)
print(mape)

print(y_pred, y_test)
Example #44
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])

# Fit regression model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

clf_1 = DecisionTreeRegressor(max_depth=4)

clf_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                          n_estimators=300, random_state=rng)

clf_1.fit(X, y)
clf_2.fit(X, y)

# Predict
y_1 = clf_1.predict(X)
y_2 = clf_2.predict(X)

# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()
Example #45
def decision_tree(X, y1, y2, y3):
  n, _ = X.shape
  nTrain = int(0.5*n)  #training on 50% of the data
  Xtrain = X[:nTrain,:]
  ytrain = y1[:nTrain]
  ytrain_registered = y2[:nTrain]
  ytest_registered = y2[nTrain:]
  ytrain_casual = y3[:nTrain]
  ytest_casual = y3[nTrain:]
  Xtest = X[nTrain:,:]
  ytest = y1[nTrain:]

  #regular

  clf_1 = DecisionTreeRegressor(max_depth=None)
  clf_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None),
                          n_estimators=500)
  clf_4 = RandomForestRegressor(n_estimators=500, max_depth=None,
                          min_samples_split=2, random_state=0)
  clf_5 = ExtraTreesRegressor(n_estimators=500, max_depth=None,
                          min_samples_split=2, random_state=0)
  clf_3 = GradientBoostingRegressor(n_estimators=500,
                          max_depth=None, random_state=0)

  print "finished generating tree"

  clf_1.fit(Xtrain, ytrain_registered)
  clf_2.fit(Xtrain, ytrain_registered)
  clf_3.fit(Xtrain, ytrain_registered)
  clf_4.fit(Xtrain, ytrain_registered)
  clf_5.fit(Xtrain, ytrain_registered)


  print('Finished fitting')


  dt_regular = clf_1.predict(Xtest)
  ada_regular = clf_2.predict(Xtest)
  grad_regular = clf_3.predict(Xtest)
  rf_regular = clf_4.predict(Xtest)
  et_regular = clf_5.predict(Xtest)

  #casual
  print "finished generating tree"

  clf_1.fit(Xtrain, ytrain_casual)
  clf_2.fit(Xtrain, ytrain_casual)
  clf_3.fit(Xtrain, ytrain_casual)
  clf_4.fit(Xtrain, ytrain_casual)
  clf_5.fit(Xtrain, ytrain_casual)


  print('Finished fitting')


  dt_casual = clf_1.predict(Xtest)
  ada_casual = clf_2.predict(Xtest)
  grad_casual = clf_3.predict(Xtest)
  rf_casual = clf_4.predict(Xtest)
  et_casual = clf_5.predict(Xtest)
  feature_imps = clf_4.feature_importances_

  print "regular decision tree"
  print rmsle(ytest, dt_regular + dt_casual)
  print "boosted decision tree"
  print rmsle(ytest, ada_regular + ada_casual)
  print "gradient tree boosting"
  print rmsle(ytest, grad_regular + grad_casual)
  print "random forest classifier"
  print rmsle(ytest, rf_regular + rf_casual)
  print "extra trees classifier"
  print rmsle(ytest, et_casual + et_regular)

  print "feature importances"
  print feature_imps
Example #46
        if s == current:
            # test
            X_predict.append(v_features[id])
            Y_predict.append(v_map[id])
        else:
            # train
            X_train.append(v_features[id])
            Y_train.append(v_map[id])
    assert len(X_train) == len(Y_train)
    assert len(X_predict) == len(Y_predict)
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)
    X_predict = np.array(X_predict)
    Y_predict = np.array(Y_predict)

    # X_train = scale(X_train, axis=0)
    # X_predict = scale(X_predict, axis=0)

    regr = AdaBoostRegressor(n_estimators=150, learning_rate=0.1)
    # regr = SVR(C=0.02, epsilon=0.5)

    regr.fit(X_train, Y_train)
    y = regr.predict(X_predict)
    scores_mse[current] = mean_squared_error(Y_predict, y)
    scores_r[current] = pearsonr(Y_predict, y)[0]

# print(sum(scores) / 5)
print(scores_mse, sum(scores_mse) / 5)
print(scores_r, sum(scores_r) / 5)

Example #47
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score


# abc = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0,
#                         algorithm='SAMME.R', random_state=None)
#
# abc.fit(features_train, labels_train)
# predicted = abc.predict(features_test)
# accuracy = accuracy_score(labels_test, predicted)
# print accuracy


abr = AdaBoostRegressor(base_estimator=None, n_estimators=500, learning_rate=1.0,
                        loss='linear', random_state=None)
abr.fit(features_train, labels_train)
predicted_test = abr.predict(features_test)
test_score = r2_score(labels_test, predicted_test)


print(test_score)


try:
    prettyPicture(abr, features_test, labels_test)
except NameError:
    pass