class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = AdaBoostRegressor(RandomForestRegressor(n_estimators=100, max_depth=40, max_features=25), n_estimators=100)
        #self.clf_Boost = GradientBoostingRegressor( n_estimators = 500 , max_features = 20 )
        #self.clf_Regression = LinearRegression()
        

    def fit(self, X, y):
        self.clf.fit(X,y)

    def predict(self, X):
        return self.clf.predict(X)
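A minimal usage sketch for this wrapper (hypothetical: the synthetic dataset, split, and metric below are illustrative, not part of the original project, and assume the class and its sklearn imports are in scope):

# Hypothetical smoke test for the Regressor wrapper above.
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=25, noise=0.1, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

reg = Regressor()                 # AdaBoost over RandomForest, as defined above
reg.fit(X_train, y_train)
print(mean_squared_error(y_test, reg.predict(X_test)))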
Example #2
    def fit(self, start_date, end_date):

        for ticker in self.tickers:
            self.stocks[ticker] = Stock(ticker)

        params_ada = [{
            'n_estimators': [25, 50, 100],
            'learning_rate': [0.01, 0.1, 1, 10],
            'loss': ['linear', 'square', 'exponential']
            }]

        params = ParameterGrid(params_ada)

        # Find the split for training and CV
        mid_date = train_test_split(start_date, end_date)
        for ticker, stock in self.stocks.items():

            X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
            # X_train = self.pca.fit_transform(X_train.values)
            X_train = X_train.values
            # pdb.set_trace()
            X_cv, y_cv = stock.get_data(mid_date, end_date)
            # X_cv = self.pca.transform(X_cv.values)
            X_cv = X_cv.values

            lowest_mse = np.inf
            for i, param in enumerate(params):
                ada = AdaBoostRegressor(**param)
                ada.fit(X_train, y_train.values)
                mse = mean_squared_error(
                    y_cv, ada.predict(X_cv))
                if mse <= lowest_mse:
                    lowest_mse = mse
                    self.models[ticker] = ada

        return self
def Round2(X, y):
    # Set parameters
    min_score = {}
    for loss in ['linear', 'square', 'exponential']:
        model = AdaBoostRegressor(loss=loss)
        n = len(y)

        # Perform 5-fold cross validation
        scores = []
        kf = KFold(n, n_folds=5, shuffle=True)

        # Calculate mean absolute deviation for train/test for each fold
        for train_idx, test_idx in kf:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            # score = model.score(X_test, y_test)
            scores.append(rmse)
        if len(min_score) == 0:
            min_score['loss'] = loss
            min_score['scores'] = scores
        else:
            if np.mean(scores) < np.mean(min_score['scores']):
                min_score['loss'] = loss
                min_score['scores'] = scores

        print "Loss:", loss
        print scores
        print np.mean(scores)
    return min_score
def round2(X_df, featurelist):
    # Set parameters
    model = AdaBoostRegressor()
    y_df = X_df['target']
    n = len(y_df)

    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n, n_folds=5, shuffle=True)

    # Calculate mean absolute deviation for train/test for each fold
    for train_idx, test_idx in kf:
        X_train, X_test = X_df.iloc[train_idx, :], X_df.iloc[test_idx, :]
        # y_train, y_test = y_df[train_idx], y_df[test_idx]

        X_train, X_test = applyFeatures(X_train, X_test, featurelist)
        Xtrain_array, ytrain_array, Xtest_array, ytest_array = dfToArray(X_train, X_test)
        model.fit(Xtrain_array, ytrain_array)
        prediction = model.predict(Xtest_array)
        rmse = np.sqrt(mean_squared_error(ytest_array, prediction))
        scores.append(rmse)
        print rmse
        print "Finish fold"

    return scores
def ada_boost_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: training feature set
    :param train_y: training target values (review votes)
    :param pred_x: test set to predict
    :param review_id: review ids matching pred_x, used for the submission file
    :param v_curve: plot the validation curve for the model
    :param l_curve: plot the learning curve for the model
    :param get_model: fit the model and write the submission predictions
    :return: the predicted values, learning curve, validation curve
    """
    ada = AdaBoostRegressor(n_estimators=5)
    if get_model:
        print "Fitting Ada..."
        ada.fit(train_x, np.log(train_y+1))
        ada_pred = np.exp(ada.predict(pred_x))-1
        Votes = ada_pred[:,np.newaxis]
        Id = np.array(review_id)[:,np.newaxis]
        # create submission csv for Kaggle
        submission_ada= np.concatenate((Id,Votes),axis=1)
        np.savetxt("submission_ada.csv", submission_ada,header="Id,Votes", delimiter=',',fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(AdaBoostRegressor(), "Learning curve: Adaboost", train_x, np.log(train_y+1.0))
    if v_curve:
        print "Working on Validation Curves"
        plot_validation_curve(AdaBoostRegressor(), "Validation Curve: Adaboost", train_x, np.log(train_y+1.0),
                              param_name="n_estimators", param_range=[2, 5, 10, 15, 20, 25, 30])
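The log(y + 1) / exp(pred) - 1 pair above is the usual transform for skewed, non-negative targets such as vote counts; a small illustrative sketch of the round-trip (values are made up):

# Sketch of the target transform used in ada_boost_regressor above.
import numpy as np

y = np.array([0, 1, 4, 99])        # illustrative skewed counts
y_log = np.log(y + 1)              # same as np.log1p(y)
y_back = np.exp(y_log) - 1         # same as np.expm1(y_log)
print(np.allclose(y, y_back))      # True: the transform round-trips exactly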
def train_learning_model_decision_tree_ada_boost(df):
    #code taken from sklearn
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    tree_regressor = DecisionTreeRegressor(max_depth = 6)
    ada_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), n_estimators = 500, learning_rate = 0.01, random_state = 1)

    tree_regressor.fit(X_train, y_train)
    ada_regressor.fit(X_train, y_train)

    y_pred_tree = tree_regressor.predict(X_test)
    y_pred_ada = ada_regressor.predict(X_test)
    
    mse_tree = mean_squared_error(y_test, y_pred_tree)
    mse_ada = mean_squared_error(y_test, y_pred_ada)

    mse_tree_train = mean_squared_error(y_train, tree_regressor.predict(X_train))
    mse_ada_train = mean_squared_error(y_train, ada_regressor.predict(X_train))
    
    print ("MSE tree: %.4f " %mse_tree)
    print ("MSE ada: %.4f " %mse_ada)

    print ("MSE tree train: %.4f " %mse_tree_train)
    print ("MSE ada train: %.4f " %mse_ada_train)
Example #7
File: x.py Project: shenbai/tradesafe
def backTest(trainEndDate, code, testDate, predictDate):
    conn = db.get_history_data_db('D')
    df = None
    # train more date
    # model = pickle.load(open('%s/%s.pkl' % (config.model_dir, code), 'r'))
    rng = np.random.RandomState(1)
    model = AdaBoostRegressor(DecisionTreeRegressor(
        max_depth=4), n_estimators=1000, random_state=rng, loss='square')
    df = pd.read_sql_query(
        "select * from history_data where date([date])<='%s' and code='%s' order by code, date([date]) asc" % (
            trainEndDate, code), conn)
    shift_1 = df['close'].shift(-2)
    df['target'] = shift_1
    data = df[df['target'] > -1000]

    X_train = data.ix[:, 'code':'turnover']
    y_train = data.ix[:, 'target']
    if len(X_train) < 500:
        return
    print len(X_train)
    # print data
    # for i in range(0, 10):
    #     model.fit(X_train, y_train)
    model.fit(X_train, y_train)
    # predict tomorrow
    try:
        df = pd.read_sql_query(config.sql_history_data_by_code_date % (code, testDate), conn)
        # print df
    except Exception, e:
        print e
Example #8
def main():



    ab = AdaBoostRegressor(base_estimator=None, n_estimators=50, 
                            learning_rate=1.0, loss='exponential', 
                            random_state=None)  

    ab.fit(X_train, y_train)

    #Evaluation in train set
    pred_proba_train = ab.predict(X_train)
        
    mse_train = mean_squared_error(y_train, pred_proba_train)
    rmse_train = np.sqrt(mse_train)
    logloss_train = log_loss(y_train, pred_proba_train)
    
    #Evaluation in validation set
    pred_proba_val = ab.predict(X_val)
        
    mse_val = mean_squared_error(y_val, pred_proba_val)
    rmse_val = np.sqrt(mse_val)
    logloss_val = log_loss(y_val, pred_proba_val)
    
    print(rmse_train)
    print(rmse_val)
    print(logloss_train)
    print(logloss_val)
def predict_volatility_1year_ahead(rows, day, num_days):
    """
    SUMMARY: Predict volatility 1 year into the future
    ALGORITHM:
      a) The predictor will train on all data up to exactly 1 year (252 trading days) before `day`
      b) The newest 10 days up to and including `day` will be used as the feature vector for the prediction
         i.e. if day = 0, then the feature vector for prediction will consist of days (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
              if day = 10, then the feature vector for predictor input will be days (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
    INPUT: minimum of (1 year + 10 days) of data before `day` (newest data is day=0)
  
    """

    '''enforce that `day` is in the required range'''
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    '''Compile features for fitting'''
    feature_sets = []
    value_sets = [];
    for ii in range(day+num_days+252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            features += [
                float(rows[day_index][7]),
                float(rows[day_index][8]),
                float(rows[day_index][9]),
                float(rows[day_index][10]),
                float(rows[day_index][11]),
                float(rows[day_index][12]),
                float(rows[day_index][13]),
            ]
            #print("issue here: " + str(rows[day_index][0]))
        feature_sets += [features]
        value_sets += [float(rows[ii-252][9])]

    '''Create Regressor and fit'''
    num_features = 16
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=3, random_state=rng)
    regr.fit(feature_sets, value_sets)

    '''Get prediction features'''
    ii = day
    features = []
    for jj in range( num_days ):
        day_index = ii + jj   
        features += [
        float(rows[day_index][7]), 
        float(rows[day_index][8]),
        float(rows[day_index][9]), 
        float(rows[day_index][10]),
        float(rows[day_index][11]),
        float(rows[day_index][12]),
        float(rows[day_index][13]),
        ]
        
    return float(regr.predict([features]))
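A simplified, self-contained sketch of the windowed feature/label layout described in the docstring above (the single fake data column and sizes are illustrative; the real code concatenates seven columns per day):

# Newest data sits at index 0; each sample is a num_days window of values and
# its label is the value 252 rows (one trading year) earlier, i.e. one year ahead.
import numpy as np

num_days, horizon = 10, 252
values = np.arange(600, dtype=float)                 # stand-in for one data column

feature_sets, value_sets = [], []
for ii in range(horizon, len(values) - num_days):
    feature_sets.append(values[ii:ii + num_days])    # 10-day feature window
    value_sets.append(values[ii - horizon])          # target one year "ahead"

X_demo = np.vstack(feature_sets)
y_demo = np.asarray(value_sets)
print(X_demo.shape, y_demo.shape)                    # (338, 10) (338,)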
Example #10
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = AdaBoostRegressor(RandomForestRegressor(n_estimators=500, max_depth=78, max_features=10), n_estimators=40)

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
Example #11
def train_predict(train_id, test_id):
	# load libsvm files for training dataset
	Xs_train = []
	ys_train = []
	n_train = load_libsvm_files(train_id, Xs_train, ys_train)
	# load libsvm files for testing dataset
	Xs_test = []
	ys_test = []
	n_test = load_libsvm_files(test_id, Xs_test, ys_test)

	# models
	model = []

	# ans
	ans_train = []
	ans_test = []

	# generate predictions for training dataset
	ps_train = []
	for i in range(0, n_train):
		ps_train.append([0.0 for j in range(10)])

	# generate predictions for testing dataset
	ps_test = []
	for i in range(0, n_test):
		ps_test.append([0.0 for j in range(10)])

	# fit models
	for i in range(10):
		l = np.array([ys_train[j][i] for j in range(n_train)])
		clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=params['max_depth']), n_estimators=params['n_estimators'], learning_rate=params['learning_rate'])
		clf.fit(Xs_train[i].toarray(), l)
		print "[%s] [INFO] %d model training done" % (t_now(), i)
		preds_train = clf.staged_predict(Xs_train[i].toarray())
		ans_train.append([item for item in preds_train])
		# print "len(ans_train[%d]) = %d" % (i, len(ans_train[i]))
		print "[%s] [INFO] %d model predict for training data set done" % (t_now(), i)
		preds_test = clf.staged_predict(Xs_test[i].toarray())
		ans_test.append([item for item in preds_test])
		print "[%s] [INFO] %d model predict for testing data set done" % (t_now(), i)

	#print "len_ans_train=%d" % len(ans_train[0])

	# predict for testing data set
	for i in range(params['n_estimators']):
		for j in range(10):
			tmp = min(i, len(ans_train[j]) - 1)
			for k in range(n_train):
				ps_train[k][j] = ans_train[j][tmp][k]
			tmp = min(i, len(ans_test[j]) - 1)
			for k in range(n_test):
				ps_test[k][j] = ans_test[j][tmp][k]
		print "%s,%d,%f,%f" % (t_now(), i + 1, mean_cos_similarity(ys_train, ps_train, n_train), mean_cos_similarity(ys_test, ps_test, n_test))

	return 0
def AdaBoost(xTrain, yTrain, xTest, yTest, treeNum):
	rms = dict()
	for trees in treeNum:
		ab = AdaBoostRegressor(n_estimators = trees)
		ab.fit(xTrain, yTrain)
		yPred = ab.predict(xTest)
		rms[trees] = sqrt(mean_squared_error(yTest, yPred))

	(bestRegressor, rmse) = sorted(rms.iteritems(), key = operator.itemgetter(1))[0]

	return bestRegressor, rmse
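Roughly the same search can be expressed with scikit-learn's GridSearchCV instead of the manual loop; a hedged sketch (note it cross-validates on the training data rather than scoring a fixed test set, and the xTrain/yTrain names and scoring string are assumptions):

# Sketch: let GridSearchCV choose n_estimators for AdaBoostRegressor.
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [25, 50, 100, 200]}
search = GridSearchCV(AdaBoostRegressor(), param_grid,
                      scoring='neg_root_mean_squared_error', cv=5)
search.fit(xTrain, yTrain)
print(search.best_params_, -search.best_score_)   # best tree count and its CV RMSE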
Example #13
class Regressor(BaseEstimator):
    def __init__(self):
        cl = RandomForestRegressor(n_estimators=10, max_depth=10, max_features=10)
        self.clf = AdaBoostRegressor(base_estimator = cl, n_estimators=100)

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
#RandomForestClassifier
def ada_boost(data,classifier,sample):
    from sklearn.ensemble import AdaBoostRegressor
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.cluster import KMeans
    from sklearn.naive_bayes import GaussianNB
    func = GaussianNB()
    func = DecisionTreeRegressor()
    func = KMeans(n_clusters=2)
    clf = AdaBoostRegressor(func,n_estimators=300,random_state=random.RandomState(1))
    clf.fit(data,classifier)
    print_result(clf,[sample])
def test_boston():
    # Check consistency on dataset boston house prices.
    reg = AdaBoostRegressor(random_state=0)
    reg.fit(boston.data, boston.target)
    score = reg.score(boston.data, boston.target)
    assert score > 0.85

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert_equal(len(set(est.random_state for est in reg.estimators_)),
                 len(reg.estimators_))
Example #16
def performAdaBoostReg(train, test, features, output):
    """
    Ada Boost Regression
    """

    clf = AdaBoostRegressor()
    clf.fit(train[features], train[output])
    Predicted = clf.predict(test[features])
    
    plt.plot(test[output])
    plt.plot(Predicted, color='red')
    plt.show()        
    
    return mean_squared_error(test[output],Predicted), r2_score(test[output], Predicted)
Example #17
def do_adaboost(filename):
    df, Y = create_merged_dataset(filename)
    # Ideas:
    # Create a feature for accelerations and decelerations.

    # Leave default base regressor for AdaBoost(decision tree). Extra trees were tried with catastrophic results.
    #ada = AdaBoostRegressor(n_estimators=350, learning_rate=0.05)
    ada = AdaBoostRegressor(n_estimators=500, learning_rate=1)
    
    #X = df.drop(['driver', 'trip', 'prob_points', 'prob_speed', 'prob_distance', 'prob_acceleration'], 1)
    X = df.drop(['driver', 'trip'], 1)
    ada.fit(X, Y)
    probs = ada.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
def predict_volatility_1year_ahead(rows, day):
    """
    SUMMARY: Predict volatility 1 year into the future
    ALGORITHM:
      a) The predictor will train on all data up to exactly 1 year (252 trading days) before `day`
      b) The newest 10 days up to and including `day` will be used as the feature vector for the prediction
         i.e. if day = 0, then the feature vector for prediction will consist of days (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
              if day = 10, then the feature vector for predictor input will be days (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
    INPUT: minimum of (1 year + 10 days) of data before `day` (newest data is day=0)
  
    """

    #num_days = 10
    num_days = 10

    # enforce that `day` is in the required range
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    # compile features (X) and values (Y) 
    feature_sets = []
    value_sets = []; value_sets_index = []
    for ii in range(day+252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            features += [float(rows[day_index][1]), float(rows[day_index][2]), float(rows[day_index][3]), float(rows[day_index][5]), float(rows[day_index][7]), float(rows[day_index][8]), float(rows[day_index][9])]
        feature_sets += [features]
        value_sets += [float(rows[ii-252][9])]
        value_sets_index.append([ii-252])
             
    # fit
    #regr = linear_model.Lasso(alpha=0.01,fit_intercept=False,normalize=False,max_iter=10000000)   # they call lambda alpha
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=4, random_state=rng)
    #regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=2, random_state=rng)
    #regr =  DecisionTreeRegressor(max_depth=4)
    regr.fit(feature_sets, value_sets)
    

    #print "Adaboost weights:", regr.estimator_weights_

    ii = day
    features = []
    for jj in range( num_days ):
        day_index = ii + jj    +252    
        features += [float(rows[day_index][1]), float(rows[day_index][2]), float(rows[day_index][3]), float(rows[day_index][5]), float(rows[day_index][7]), float(rows[day_index][8]), float(rows[day_index][9])]

    return float(regr.predict([features]))
def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work without sample_weights in the base estimator
    The random weighted sampling is done internally in the _boost method in
    AdaBoostRegressor.
    """
    class DummyEstimator(BaseEstimator):

        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    boost.fit(X, y_regr)
    assert_equal(len(boost.estimator_weights_), len(boost.estimator_errors_))
Example #20
def ada_learning(labels, train, test):
    label_log=np.log1p(labels)
    # try 50 / 1.0
    #boost GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1)
    clf=AdaBoostRegressor(GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1),n_estimators=50, learning_rate=1.0)
    model=clf.fit(train, label_log)
    preds1=model.predict(test)
    preds=np.expm1(preds1)
    return  preds
Example #21
    def initGrid(X,y):
        min_samples_split = [2,4,6,8]
        max_depth = [2,4,6,8]
        n_estimators=[50,100,150]
        bootstrap=[False, True]
        min_samples_leaf=[2,4,6,8]

        grid = {
            'min_samples_split':min_samples_split,
            'max_depth': max_depth,
            'min_samples_leaf':min_samples_leaf
        }
        model = DecisionTreeRegressor();
        gs = GridSearchCV(estimator=model, param_grid=grid,  verbose=10, n_jobs=-1)
        gs.fit(X,y)
        print(gs.best_params_)
        search = AdaBoostRegressor(gs)
        search.fit(X,y)
        return search
def run_tree_regressor():
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import train_test_split
    import numpy as np
    from sklearn.ensemble import AdaBoostRegressor
 
    print "running me"
    X = np.genfromtxt("/home/john/Downloads/kaggle.X1.train.txt",delimiter=",") # load the text file
    Y = np.genfromtxt("/home/john/Downloads/kaggle.Y.train.txt",delimiter=",") 
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)
     
    rng = np.random.RandomState(1)
 
    depth = 35 # current lowest
    for estimators in [130,235,300,345,450]:
        treeAdaBoost =  AdaBoostRegressor(DecisionTreeRegressor(max_depth=depth),n_estimators=estimators, random_state=rng)
        treeAdaBoost.fit(x_train, y_train)
        print "adabost estimators @ " + str(estimators) + ":", treeAdaBoost.score(x_test, y_test)
def round1(X, y):
    # Set parameters
    model = AdaBoostRegressor()
    n = len(y)

    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n, n_folds=5, shuffle=True)

    # Calculate mean absolute deviation for train/test for each fold
    for train_idx, test_idx in kf:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, prediction))
        scores.append(rmse)

    return scores
Example #24
def svm_smooth(data, residual_imf, period):
    train_data = []
    lable = []
    for i in range(period,len(residual_imf)-20):
        tmp = data[i-period:i+1]
        train_data.append(tmp)
        lable.append(residual_imf[i])

    rng = np.random.RandomState(1)
    clf = AdaBoostRegressor(svm.SVR(),n_estimators=1, random_state=rng)
    clf.fit(train_data, lable) 
    smooth_data = []
    for i in range(len(data)):
        if i<=period:
            smooth_data.append(data[i])
        else:
            smooth_data.append(clf.predict([data[i-period:i+1]])[0])

    return smooth_data
Example #25
def train_model(training, testing, window=5, n=5):
	X_train, y_train = prepare_data(training)
	X_test, y_test = prepare_data(testing)
	rf = RandomForestRegressor()
	rf.fit(X_train, y_train)
	predrf = rf.predict(X_test)
	print "mse for random forest regressor: ", mean_squared_error(predrf, y_test)

	gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
	gb.fit(X_train, y_train)
	predgb = gb.predict(X_test)
	print "mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test)
	## plot feature importance using GBR results
	fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility'])
	fx_imp /= fx_imp.max()  # normalize
	fx_imp.sort()
	ax = fx_imp.plot(kind='barh')
	fig = ax.get_figure()
	fig.savefig("output/feature_importance.png")

	adb = AdaBoostRegressor(DecisionTreeRegressor())
	adb.fit(X_train, y_train)
	predadb = adb.predict(X_test)
	print "mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test)

	scale = StandardScaler()
	scale.fit(X_train)
	X_trainscale = scale.transform(X_train)
	X_testscale = scale.transform(X_test)

	knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5)
	knn.fit(X_trainscale, y_train)
	predknn = knn.predict(X_testscale)
	print "mse for bagging knn regressor: ", mean_squared_error(predknn, y_test)

	pred_test = 0.1*predrf+0.2*predgb+0.1*predadb+0.6*predknn
	print "mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test)
	result = testing.copy()
	result.ix[5:-5, 'trend'] = pred_test
	result.ix[10:, 'pred'] = pred_test * result.ix[5:-5, 'IBM'].values
	result.ix[:-5, 'pred_date'] = result.index[5:]

	return result
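The fixed-weight blend above (0.1/0.2/0.1/0.6) can also be written as a single VotingRegressor; a hedged sketch reusing the same estimator definitions (assumes X_train/y_train/X_test/y_test as prepared in train_model and a scikit-learn version that ships VotingRegressor):

# Sketch: the manual weighted blend expressed as one ensemble object.
from sklearn.ensemble import VotingRegressor
from sklearn.pipeline import make_pipeline

blend = VotingRegressor(
    estimators=[
        ('rf', RandomForestRegressor()),
        ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)),
        ('adb', AdaBoostRegressor(DecisionTreeRegressor())),
        # the KNN bagger gets its scaler via a Pipeline, since VotingRegressor
        # refits every estimator on the raw X_train
        ('knn', make_pipeline(StandardScaler(),
                              BaggingRegressor(KNeighborsRegressor(n_neighbors=10),
                                               max_samples=0.5, max_features=0.5))),
    ],
    weights=[0.1, 0.2, 0.1, 0.6])
blend.fit(X_train, y_train)
print(mean_squared_error(blend.predict(X_test), y_test))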
def xgb_train(x_train, x_label, x_test):
    model = 'xgb'
    #model = 'adaboost'
    #if model.count('xgb') >0:
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.005  # [0,1]
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.7
    params["scale_pos_weight"] = 1.0
    params["silent"] = 1
    params["max_depth"] = 9
    if config.nthread > 1:
        params["nthread"] = 1

    num_rounds = 10000

    xgtrain = xgb.DMatrix(x_train, label=x_label)
    xgval = xgb.DMatrix(x_test)

    #train using early stopping and predict
    watchlist = [(xgtrain, "train")]
    #model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120, feval=gini_metric)
    model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
    pred1 = model.predict( xgval )

    #clf = RandomForestRegressor()
    #clf = LogisticRegression()
    #clf = GradientBoostingRegressor()
    clf = AdaBoostRegressor( ExtraTreesRegressor(max_depth=9), n_estimators=200 )
    clf.fit(x_train, x_label)
    pred2 = clf.predict(x_test)

    #pred = pred1 * pred2 / (pred1 + pred2)
    #pred = 0.7 * (pred1**0.01) + 0.3 * (pred2**0.01)
    #pred = (pred1.argsort() + pred2.argsort()) / 2
    pred = 0.6 * pred1 + 0.4 * pred2

    return pred
Example #27
def train_and_predict_adab_stacked_gbr (train, labels, test, feature_names = None) :
	" Attmept with SVR ... "
	print ("Training ADABoost with GBR as base model")
	t0 = time.clock()
	if (gridSearch) :
		params_dict = {'adab__learning_rate' : [0.1, 0.3]} 
		#model = GridSearchCV(regr, params_dict, n_jobs = 3, cv = kfObject, verbose = 10, scoring = 'mean_squared_error')
	else :
		base =  GradientBoostingRegressor(random_state = randomState, learning_rate = 0.1, n_estimators = 1500, max_depth = 6, subsample = 0.95, max_features = 1, verbose = 10)
		model = AdaBoostRegressor(random_state = randomState, base_estimator = base, n_estimators = 3, learning_rate = 0.005)

	model.fit(train, labels)
	print ("Model fit completed in %.3f sec " %(time.clock() - t0))

	if (gridSearch) :
		print ("Best estimator: ", model.best_estimator_)
		print ("Best MSLE scores: %.4f" %(model.best_score_))
		print ("Best RMSLE score: %.4f" %(math.sqrt(-model.best_score_)))
	else :
		float_formatter = lambda x: "%.4f" %(x)
		print ("Feature importances: ", sorted(zip([float_formatter(x) for x in model.feature_importances_], feature_names), reverse=True))
	
	return model.predict(test)
ridge_model_full_data = ridge.fit(X_train, y_train)

#print('Fitting SVR...')
#svr_model_full_data = svr.fit(X_train,y_train)

print('Fitting GradientBoosting...')
gbr_model_full_data = gbr.fit(X_train, y_train)

print('Fitting XGBoost...')
xgb_model_full_data = xgboost.fit(X_train, y_train)

print('Fitting LightGBM...')
lgb_model_full_data = lightgbm.fit(X_train, y_train)

print('Fitting AdaBoost...')
adaboost_model_full_data = adaboost.fit(X_train, y_train)

#print('Fitting extratrees...')
#extratrees_model_full_data = extratrees.fit(X_train,y_train)

#print('Fitting Bagging...')
#bagging_model_full_data = bagging.fit(X_train,y_train)

print('Done fitting all models')

# In[57]:

#Blending model predictions
print('Blending model predictions...')
#def blend_models_predict(X):
#    return ((0.045 * elastic_model_full_data.predict(X))+(0.5 * lasso_model_full_data.predict(X))
#Store the accuracy results for each model in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['Bagging'],'Training_Score': acc_BG_train, 
                              'Test_Score': acc_BG, 'K_Fold_Mean': kf_res_mean, 'K_Fold_Std': kf_res_std})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Model', 'Training_Score','Test_Score','K_Fold_Mean','K_Fold_Std']]
resultsDf


# # Ensemble Technique - AdaBoosting

# In[201]:


from sklearn.ensemble import AdaBoostRegressor
abcl = AdaBoostRegressor(n_estimators=50,random_state=1)
abcl = abcl.fit(X_train, y_train)


# In[204]:


y_predict = abcl.predict(X_test)
abcl_train=abcl.score(X_train , y_train)
print("Ada Boosting - Train Accuracy:",abcl_train)
abcl_test = abcl.score(X_test , y_test)
print("Ada Boosting - Test Accuracy:",abcl_test)

results = cross_val_score(abcl, X, y, cv=kfold, scoring='r2')
print(results)
kf_res_mean=results.mean()*100.0
kf_res_std=results.std()*100.0
Example #30
losstype = [
    'linear',
    'square',
    'exponential',
]

min_mean = 999999
minloss = ''
min_n = 0

data_list = []

for loss in losstype:
    for n in range(100, 5000, 100):

        ada_1 = AdaBoostRegressor(n_estimators=n, loss=loss)
        ada_1.fit(X, Y)
        ysame = ada_1.predict(X)
        mean_diff = abs(Y - ysame).mean()

        print loss + ' ' + str(n) + ' ' + str(mean_diff)
        data_list.append([loss, n, mean_diff])

        if (mean_diff < min_mean):
            min_mean = mean_diff
            minloss = loss
            min_n = n

with open("ada_data.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(data_list)
Example #31
def price_predictions(ticker, start, end, forecast_out):
    file_path = symbol_to_path(ticker)
    df = pd.read_csv(file_path,
                     index_col="<DTYYYYMMDD>",
                     parse_dates=True,
                     usecols=[
                         "<DTYYYYMMDD>", "<OpenFixed>", "<HighFixed>",
                         "<LowFixed>", "<CloseFixed>", "<Volume>"
                     ],
                     na_values="nan")
    df = df.rename(
        columns={
            '<DTYYYYMMDD>': 'Date',
            "<OpenFixed>": 'Open',
            '<HighFixed>': 'High',
            '<LowFixed>': 'Low',
            '<CloseFixed>': 'Close',
            '<Volume>': 'Volume'
        })

    # columns order for backtrader type
    columnsOrder = ["Open", "High", "Low", "Close", "Volume", "OpenInterest"]
    # change the index by new index
    df = df.reindex(columns=columnsOrder)
    # change date index to increasing order
    df = df.sort_index()
    # take a part of dataframe
    df = df.loc[start:end]

    df['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
    bbwindow = 25
    vlwindow = 10
    mmtum = 10
    df['BB_Value'] = compute_indicator_bb(df, window=bbwindow)
    df['Volatility'] = compute_indicator_volatility(df, timeperiod=vlwindow)
    df['Momentum'] = talib.MOM(df['Close'].values, timeperiod=mmtum)
    df['OBV'] = talib.OBV(df['Close'].values,
                          df['Volume'].values.astype(np.float64))
    df['MACD'], _, _ = talib.MACD(df['Close'].values,
                                  fastperiod=12,
                                  slowperiod=26,
                                  signalperiod=9)
    _, df['STOCH'] = talib.STOCH(df['High'].values,
                                 df['Low'].values,
                                 df['Close'].values,
                                 fastk_period=14,
                                 slowk_period=1,
                                 slowd_period=5)
    df['MFI'] = talib.MFI(df['High'].values,
                          df['Low'].values,
                          df['Close'].values,
                          df['Volume'].values.astype(np.float64),
                          timeperiod=14)
    #    df['EMA3'] = pd.Series(pd.Series.ewm(df['Close'], span = 3, min_periods = 3-1).mean())
    #    df['EMA6'] = pd.Series(pd.Series.ewm(df['Close'], span = 6, min_periods = 6-1).mean())
    #    df['EMA18'] = pd.Series(pd.Series.ewm(df['Close'], span = 18,  min_periods = 18-1).mean())
    df['PDI'] = talib.PLUS_DI(df['High'].values,
                              df['Low'].values,
                              df['Close'].values,
                              timeperiod=14)
    df['NDI'] = talib.MINUS_DI(df['High'].values,
                               df['Low'].values,
                               df['Close'].values,
                               timeperiod=14)
    #    df = df[['Close', 'HL_PCT', 'PCT_change', 'Volume','BB_Value',
    #                        'Volatility', 'Momentum', 'MACD', 'STOCH', 'MFI', 'OBV']]
    #
    df = df[['Close', 'HL_PCT', 'PCT_change', 'Volume', 'BB_Value']]
    df.fillna(method="ffill", inplace=True)
    df.fillna(method="backfill", inplace=True)

    forecast_col = 'Close'

    #inplace : boolean, default False
    # If True, fill in place. Note: this will modify any other views on this object,
    # (e.g. a no-copy slice for a column in a DataFrame).
    # Forecast 1% of the data
    # Copy data from the Adj. Close column into a new column
    # Shift operation
    df['Target'] = df[forecast_col].shift(-forecast_out)
    # Drop removes the label column
    #axis : int or axis name: column
    # Whether to drop labels from the index (0 / ‘index’) or columns (1 / ‘columns’).
    X = np.array(df.drop(['Target'], 1))
    y_true = df[forecast_col][-forecast_out:]
    # Preprocessing Input Data
    X = preprocessing.scale(X)

    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler()
    #X = scaler.fit_transform(X)

    # Split X and X_lately out of the series
    X_lately = X[-forecast_out:]

    X = X[:-forecast_out]
    # Remove NA values
    # df.dropna(inplace=True)
    # Target is the y vector taken from the label column
    y = np.array(df['Target'].dropna())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #X_train, X_test, y_train, y_test = train_test_split(X, y)

    #from sklearn.preprocessing import MinMaxScaler
    #from sklearn.preprocessing import StandardScaler
    #scaler = MinMaxScaler()
    #scaler = StandardScaler()
    #X_train = scaler.fit_transform(X_train)
    #X_test = scaler.transform(X_test)
    #X_lately = scaler.transform(X_lately)

    n_neighbors = 5
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights='uniform')
    knn.fit(X_train, y_train)
    print('Train score KNN: ', knn.score(X_train, y_train),
          'Test score KNN : ', knn.score(X_test, y_test))
    forecast_set = knn.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)

    bagging = BaggingRegressor(DecisionTreeRegressor(),
                               n_estimators=50,
                               random_state=50)
    bagging.fit(X_train, y_train)
    print('Train score BAG: ', bagging.score(X_train, y_train),
          'Test score BAG : ', bagging.score(X_test, y_test))
    forecast_set = bagging.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)

    rf = RandomForestRegressor(n_estimators=50, random_state=50)
    rf.fit(X_train, y_train)
    print('Train score RF: ', rf.score(X_train, y_train), 'Test score RF : ',
          rf.score(X_test, y_test))
    forecast_set = rf.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)

    adaboost = AdaBoostRegressor(neighbors.KNeighborsRegressor(n_neighbors=5),
                                 n_estimators=30,
                                 random_state=0)

    #adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
    #                          n_estimators=30, random_state=0)
    adaboost.fit(X_train, y_train)
    print('Train score Ada: ', adaboost.score(X_train, y_train),
          'Test score Ada : ', adaboost.score(X_test, y_test))
    forecast_set = adaboost.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)
    X, y = shuffle(housing_data.data, housing_data.target, random_state=7)

    # Split the data 80/20 (80% for training, 20% for testing)
    num_training = int(0.8 * len(X))
    X_train, y_train = X[:num_training], y[:num_training]
    X_test, y_test = X[num_training:], y[num_training:]

    # Fit decision tree regression model
    dt_regressor = DecisionTreeRegressor(max_depth=4)
    dt_regressor.fit(X_train, y_train)

    # Fit decision tree regression model with AdaBoost
    ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                     n_estimators=400,
                                     random_state=7)
    ab_regressor.fit(X_train, y_train)

    # Evaluate performance of Decision Tree regressor
    y_pred_dt = dt_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred_dt)
    evs = explained_variance_score(y_test, y_pred_dt)
    print("\n#### Decision Tree performance ####")
    print("Mean squared error =", round(mse, 2))

    # Evaluate performance of AdaBoost
    y_pred_ab = ab_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred_ab)
    evs = explained_variance_score(y_test, y_pred_ab)
    print("\n#### AdaBoost performance ####")
    print("Mean squared error =", round(mse, 2))
    print("Explained variance score =", round(evs, 2))
Example #33
feature_cols = names[0:-1]  #names will be replaced by features directly taken from user selection
X = data[feature_cols]
y = data[names[-1]]  #names replaced by target taken from user selection
#print(X.shape)
#print(y.shape)
'''#preprocessing output in integers
le = preprocessing.LabelEncoder()
le.fit(y)
Encoded_classes = list(le.classes_)
y = list(map(int, le.transform(y)))'''

validation_size = 0.20

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=validation_size,
                                                    random_state=10)

# Instantiate
abc = AdaBoostRegressor()

# Fit
abc.fit(X_train, y_train)

# Predict
y_pred = abc.predict(X_test)

accuracy = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(accuracy)
Example #34
import pickle

df = pd.read_csv('hdf_denorm.csv')

## preprocess
#df.rename(columns={'instant':'rec_id',
#'dteday':'datetime',
#'holiday':'is_holiday',
#'workingday':'is_workingday',
#'weathersit':'weather_condition',
#'hum':'humidity',
#'atemp':'felt_temperature',
#'mnth':'month',
#'cnt':'total_count',
#'hr':'hour',
#'yr':'year'},inplace=True)

df = df[['hour','is_holiday', 'weekday','felt_temperature_actual','humidity_actual','users_total']]
#df.is_holiday = df.is_holiday.astype('category')
#df.weekday = df.weekday.astype('category')
df = pd.get_dummies(df)

## modelling
x = df.drop(columns = ['users_total'])
y = df['users_total']

ada = AdaBoostRegressor()
ada.fit(x,y)
pickle.dump(ada, open('my_model.pkl','wb'))
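A short sketch of loading the pickled model back for inference (the file name matches the dump above; predicting on the first row of x is just illustrative):

# Sketch: reload the saved AdaBoost model and score one row of features.
import pickle

with open('my_model.pkl', 'rb') as f:
    loaded_ada = pickle.load(f)
print(loaded_ada.predict(x.iloc[[0]]))   # predicted users_total for the first row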

Example #35
gbr = GradientBoostingRegressor(max_depth=8,
                                min_samples_leaf=4,
                                random_state=2)
#scoring(gbr)
gbr = gbr.fit(X_train, y_train)
gbr_accuracy = evaluate(gbr, X_test, y_test)

dtr = DecisionTreeRegressor(min_samples_leaf=3, max_depth=8, random_state=2)
dtr = dtr.fit(X_train, y_train)
dtr_accuracy = evaluate(dtr, X_test, y_test)

abr = AdaBoostRegressor(n_estimators=100,
                        learning_rate=0.1,
                        loss='linear',
                        random_state=2)
abr = abr.fit(X_train, y_train)
abr_accuracy = evaluate(abr, X_test, y_test)


def plot_importances(model, model_name):
    importances = model.feature_importances_
    std = np.std([est.feature_importances_ for est in model.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Plot the feature importances of the forest
    plt.figure(figsize=(8, 5))
    plt.title("Feature importances of " + model_name)
    plt.bar(range(X_train.shape[1]),
            importances[indices],
            color="r",
Example #36
                              n_estimators=n_estimators,
                              sample_size=sample_size,
                              steps=steps,
                              fold=fold,
                              random_state=random_state)
print(X)
print(type(X))
#regr_1.fit(X, y)
y_pred1 = regr_1.predict(x_target_test)

# 4.3 As comparision, use AdaBoostR2 without transfer learning
#==============================================================================
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6),
                           n_estimators=n_estimators)
#==============================================================================
regr_2.fit(x_target_train, y_target_train)
y_pred2 = regr_2.predict(x_target_test)

# 4.4 Plot the results
plt.figure()
plt.scatter(x_target_train, y_target_train, c="k", label="target_train")
plt.plot(x_target_test,
         y_target_test,
         c="b",
         label="target_test",
         linewidth=0.5)
plt.plot(x_target_test,
         y_pred1,
         c="r",
         label="TwoStageTrAdaBoostR2",
         linewidth=2)
Example #37
def test_regression_toy():
    # Check regression on a toy dataset.
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)

print('SVR')
mod_SVR = model_validation(dat, SVR(), param_RF, t_s=test_size)

#%%
''' Standardize '''
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

''' Regression '''
# Use numpy array from now on
data = df.values[:100]
new_data = df.values[100:]
X = data[:, 1:]
y = data[:, 0]
X_new = new_data[:, 1:]

model = AdaBoostRegressor(n_estimators=50, learning_rate=0.1)

fitter = model.fit(X_train, y_train) # Use all the data from case1Data

predict = fitter.predict(X_new) # Use data from Case1Data.txt







def cal_MSE_firm_onedate_svr(start_date_dt,firm_macro_2,firstdate):
    h=5
    c = 1
    beforeYears=3
    n_steps = 5 # the length of X data 
    n_inputs_1 = 10   # the number of variables
    n_inputs_2 = 7
    num_layers_0=30
    keep_prob=0.5
    #n_neurons = 10
    n_outputs = 1
    learning_rate = 0.0001
    n_epochs = 400
#    global numpy_array_1_1
#    global numpy_array_bias_1_1
#    global numpy_array_1_2
#    global numpy_array_bias_1_2
#    global numpy_array_2_1
#    global numpy_array_bias_2_1
#    global numpy_array_2_2
#    global numpy_array_bias_2_2
        
    
    back_date =  (start_date_dt-relativedelta(years=beforeYears,months=0,days=0)).strftime('%Y-%m-%d')
    start_date=start_date_dt.strftime('%Y-%m-%d')
    firm_hist_t = firm_macro_2[back_date:start_date]

    xydata=firm_hist_t
    #xydata_train =xydata.dropna()
    xydata=xydata.dropna(subset=filter(lambda x: x not in ['VaR_sp500','VaR_sse','VaR_hsi',
                                                           'VaR_sti'],xydata.columns))
    
    xydata_train =xydata
#    xdata = xydata_train[['VOL', 'RET', 'X3T_change', 'change_slope', 'ted', 'cre_spread',
#           'STI','re_excess','equ_vol', 'VOL_2','RET_2','X3T_change_2','change_slope_2','ted_2','cre_spread_2','STI_2',
#           're_excess_2','equ_vol_2','Rsysh']].iloc[:-5,:]
    xdata = xydata_train[['VOL', 'RET', 'HK_equ_vol', 'HIBOR', 'VOL_2', 
           'HIBOR_2','HK_equ_vol_2','Rhsih','Rhsi_t']].iloc[:-5,:]
    std_scale_x = preprocessing.StandardScaler().fit(xdata)
    xdata = std_scale_x.transform(xdata)
    
    ydata = xydata_train[['Rjh']].iloc[:-5,:]

    std_scale_y = preprocessing.StandardScaler().fit(ydata)
    ydata = std_scale_y.transform(ydata)
    
#    xdata_predict = xydata[['VOL', 'RET', 'X3T_change', 'change_slope', 'ted', 'cre_spread',
#           'STI','re_excess','equ_vol', 'VOL_2','RET_2','X3T_change_2','change_slope_2','ted_2','cre_spread_2','STI_2',
#           're_excess_2','equ_vol_2','Rsysh','Varh']].iloc[xydata.shape[0]-1:xydata.shape[0],:]
    xdata_predict = xydata[['VOL', 'RET', 'HK_equ_vol', 'HIBOR', 'VOL_2', 
           'HIBOR_2','HK_equ_vol_2','Rhsih','Rhsi_t',
           'VaR_hsi']].iloc[xydata.shape[0]-1:xydata.shape[0],:]

    xdata_predict2 = xydata[['VOL', 'RET', 'HK_equ_vol', 'HIBOR', 'VOL_2', 
           'HIBOR_2','HK_equ_vol_2','Rhsih',
           'Rhsi_t']].iloc[xydata.shape[0]-1:xydata.shape[0],:]
    
    
    
    y2 = xydata[['Rjh']].iloc[xydata.shape[0]-1:xydata.shape[0],:]
    
    if not math.isnan(xdata_predict.VaR_hsi):
        xdata_predict.Rhsih=xdata_predict.VaR_hsi
#    if not math.isnan(xdata_predict.VaR_hsi):
#        xdata_predict.Rhsih=xdata_predict.VaR_hsi
    
    #drop column Varh
    xdata_predict=xdata_predict.drop(labels=['VaR_hsi'], axis=1)
    
    xdata_predict  = std_scale_x.transform(xdata_predict)
    xdata_predict2  = std_scale_x.transform(xdata_predict2)
    
    
    
    
    #xdata_predict = xdata_predict.reshape(1,n_steps,xdata.shape[2])
    

    xdata_startdate = xdata_predict

    
    np.random.seed(0)
    regr = AdaBoostRegressor(random_state=0, n_estimators=100)
    
    regr.fit(xdata,ydata.ravel())
    MES_startdate=regr.predict(xdata_startdate)
    y_predict=regr.predict(xdata_predict2)
    
    MES_startdate = std_scale_y.inverse_transform(MES_startdate)
    print(MES_startdate)
    
    y_predict = std_scale_y.inverse_transform(y_predict)
    loss_y = pow((y2.iat[0,0]-y_predict[0]),2)
    print(loss_y)

    
    if MES_startdate[0]<-1:
        MES_startdate[0]=-1
    #if MES_startdate[0,0]>0:
    #    MES_startdate[0,0]=0
    return (-MES_startdate[0],loss_y)
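The scale-the-target-then-inverse-transform pattern above can also be wrapped with scikit-learn's TransformedTargetRegressor; a minimal sketch on made-up data (not the original firm/macro inputs):

# Sketch: TransformedTargetRegressor standardizes y internally and
# inverse-transforms predictions, replacing the manual std_scale_y steps.
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 9)                            # stand-in for the 9 features
y_demo = X_demo @ rng.randn(9) + 0.1 * rng.randn(200)

model = TransformedTargetRegressor(
    regressor=AdaBoostRegressor(random_state=0, n_estimators=100),
    transformer=StandardScaler())
model.fit(X_demo, y_demo)
print(model.predict(X_demo[:1]))                      # already back on the original scale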
Example #41
def train_adboost_cart(data, avg={}):
    test_X, test_Y = load_data(data, avg)
    adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), loss="square", learning_rate=0.01, n_estimators=500)
    adaboost.fit(test_X, test_Y)
    return adaboost
Example #42
# Convert the data to vectors to build the training model
AB_X1 = AB_m_Inputdata.values
AB_Y1 = AB_m_Outputdata.values

# Split into training data and test data
AB_X1_train, AB_X1_test, AB_Y1_train, AB_Y1_test = train_test_split(
    AB_X1, AB_Y1, test_size=0.33, random_state=42)

########################################################################################################################
# Build the AdaBoost training model
making_adaboost_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10),
                                          n_estimators=100,
                                          learning_rate=0.5,
                                          random_state=42)

making_adaboost_model.fit(AB_X1_train, AB_Y1_train)

AB_m_predicted = making_adaboost_model.predict(AB_X1_test)

# Reshape the array from [1, n] to [n, 1]
AB_length_x1test = len(AB_X1_test)
AB_m_predicted = AB_m_predicted.reshape(AB_length_x1test, 1)

# Check the performance of the trained model
AB_m_mae = abs(AB_m_predicted - AB_Y1_test).mean(axis=0)
AB_m_mape = (np.abs((AB_m_predicted - AB_Y1_test) / AB_Y1_test).mean(axis=0))
AB_m_rmse = np.sqrt(((AB_m_predicted - AB_Y1_test)**2).mean(axis=0))
AB_m_rmsle = np.sqrt(
    (((np.log(AB_m_predicted + 1) - np.log(AB_Y1_test + 1))**2).mean(axis=0)))

print(AB_m_mae)
Example #43
def AdaBoost(train_features, test_feat, train_labels):
    clf = AdaBoostRegressor()
    clf.fit(train_features, train_labels)
    pred_test_labels = clf.predict(test_feat)
    return [pred_test_labels, clf]
Example #44
print(u'score accuracy: %.4lf' % acc_decision_tree)
# Measure decision tree accuracy with K-fold cross-validation
print(u'cross_val_score accuracy: %.4lf' %
      np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))

# House price prediction
# Load the data
data = load_boston()
# Split the data
train_x, test_x, train_y, test_y = train_test_split(data.data,
                                                    data.target,
                                                    test_size=0.25,
                                                    random_state=33)
# Use the AdaBoost regression model
regressor = AdaBoostRegressor()
regressor.fit(train_x, train_y)
pred_y = regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print(" 房价预测结果 ", pred_y)
print(" 均方误差 = ", round(mse, 2))

# Use the decision tree regression model
dec_regressor = DecisionTreeRegressor()
dec_regressor.fit(train_x, train_y)
pred_y = dec_regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print(" 决策树均方误差 = ", round(mse, 2))
# 使用 KNN 回归模型
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(train_x, train_y)
pred_y = knn_regressor.predict(test_x)
Example #45
from sklearn.tree import DecisionTreeRegressor

# Create the dataset
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])
# dataArr, labelArr = loadDataSet("7. AdaBoost/horseColicTraining2.txt")

# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300,
                           random_state=rng)

regr_1.fit(X, y)
regr_2.fit(X, y)

# Predict
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)

# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.legend(["XG Boost Regression"])
plt.show()

#Fitting AdaBoostRegressor
min = 1000
for dep in [10, 15, 18, 25]:
    for esti in [550, 575, 600]:
        for lr in [0.01, 0.3, 1.25, 1.5]:
            regr_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=dep),
                                         n_estimators=esti,
                                         random_state=0,
                                         learning_rate=lr,
                                         loss="exponential")
            regr_ada.fit(X_train, Y_train)
            ada_pred = regr_ada.predict(X_CV)
            RMLSE = np.sqrt(mean_squared_log_error(Y_CV, ada_pred))
            #print ("Error (AdaBoostRegressor)=",RMLSE," for depth=",dep," for estimators=",esti," and learning rate=",lr)
            if (min > RMLSE):
                min = RMLSE
                lr_f = lr
                esti_f = esti
                dep_f = dep

print(
    "Root Mean Square Logarithmic Cross Validation Error (AdaBoostRegressor)=",
    min, " for depth=", dep_f, " for estimators=", esti_f, " and learning rate=",
    lr_f)
regr_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=dep_f),
                             n_estimators=esti_f,
Example #47
wderrn = np.array(errn)[np.array(errn) <= 20]
wderrn = wderrn[wderrn >= -20]
wderrn.size / len(errn)

np.median(wderrn)
np.mean(wderrn)

plt.figure()
plt.plot(wderrn)

x = np.linspace(0, 8760, num=8760)[:, np.newaxis]
y = nord['FABBISOGNO REALE'].ix[nord.index.year == 2015].values.ravel()
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=24),
                         n_estimators=3000)

regr.fit(x, y)
yhat = regr.predict(x)

plt.figure()
plt.plot(yhat, color='blue', marker='o')
plt.plot(y, color='red')

plt.figure()
plt.plot(y - yhat)

#### fabbisogno 2009
sbil2009 = pd.read_excel(
    'C:/Users/utente/Documents/misure/aggregato_sbilanciamento2009.xlsx')
nord2009 = sbil2009.ix[sbil2009['CODICE RUC'] == 'UC_DP1608_NORD']
nord2009.index = pd.date_range('2009-01-01', '2010-01-02',
                               freq='H')[:nord2009.shape[0]]
Example #48
        dt.predict(X[:10])


print("profiling...")
txt = profile(runlocaldt, pyinst_format='text')
print(txt[1])

###########################################
# Profiling for AdaBoostRegressor
# +++++++++++++++++++++++++++++++
#
# The next example shows how long the python runtime
# spends in each operator.

ada = AdaBoostRegressor()
ada.fit(X, y)
onx = to_onnx(ada, X[:1].astype(numpy.float32), target_opset=11)
oinf = OnnxInference(onx, runtime='python_compiled')
print(oinf)

########################################
# The profiling.


def runlocal():
    for i in range(0, 500):
        oinf.run({'X': X32})


print("profiling...")
txt = profile(runlocal, pyinst_format='text')
Example #49
model_parameter = "adaboost_lr0p1_lossSquare_nest1000_minsamplesplit2_maxdepth5_sqrt_random_skipdatacolumn012"

# In[ ]:

#estimator for adaboost
ada_tree_estimator = DecisionTreeRegressor(min_samples_split=2,
                                           max_depth=5,
                                           max_features='sqrt',
                                           splitter='random')
#adaboost regressor
ab = AdaBoostRegressor(ada_tree_estimator,
                       learning_rate=0.1,
                       loss='square',
                       n_estimators=1000)
#fit
ab.fit(X_train, Y_train)

# ## Validation

# In[ ]:


def visualize_loss_curves(model, X_train, Y_train, X_test, Y_test, cut):
    plt.close('all')

    Y_train_pred = np.zeros_like(Y_train)
    Y_test_pred = np.zeros_like(Y_test)
    losses_train = []
    losses_test = []

    # For each added classifier, store the new training and test losses.
Example #50
clf_1 = ensemble.GradientBoostingClassifier()
clf_2 = AdaBoostRegressor(ensemble.GradientBoostingClassifier(),
                          n_estimators=50,
                          random_state=None)

# CV Loop
for train_index, test_index in kf:
    # for each iteration of the for loop we'll do a test train split
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    t = StandardScaler()
    X_train = t.fit_transform(X_train)
    clf_1.fit(X_train, y_train)  # Train clf_1 on the training data
    clf_2.fit(X_train, y_train)  # Train clf_2 on the training data

    X_test = t.transform(X_test)
    y_pred1[test_index] = clf_1.predict(
        X_test)  # Predict clf_1 using the test and store in y_pred
    y_pred2[test_index] = clf_2.predict(
        X_test)  # Predict clf_2 using the test and store in y_pred

plot_r2(y, y_pred1, "Performance of CV DecisionTreeRegressor")
plt.show()
r2_score(y, y_pred1)
rmse = sqrt(mean_squared_error(y, y_pred1))

print("GradientBoostingClassifier CV 1 rmse: ", rmse)

plot_r2(y, y_pred2, "Performance of CV AdaBoost")
Example #51
# --> 0.37737
r2_score(y_test, y_pred)
# --> 0.62263

############################################################
## Decision tree regression with AdaBoost
from sklearn.ensemble import AdaBoostRegressor
regressor2 = AdaBoostRegressor(DecisionTreeRegressor(
    criterion='mse',
    max_depth=5,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    presort=False,
    splitter='best'),
                               n_estimators=500,
                               random_state=42)

regressor2.fit(X_train, y_train)

# predict
y_pred = regressor2.predict(X_test)
MSE = mean_squared_error(y_test, y_pred)
# --> 0.37569
r2_score(y_test, y_pred)
# --> 0.6243
Example #52
# MAE in $
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)
# checking r^2
from sklearn.metrics import r2_score

print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

#Adaboosting
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

#Decision
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

#gradientBoost
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
gb.score(X_test, y_test)
Example #53
svr = SVR(kernel='rbf')
svr_fit = svr.fit(Rtrain_X, Rtrain_y)

# Predict
#svr = svr.predict(train_X)
svr_test_y_predicted = svr_fit.predict(Rtest_X)
# Performance metrics
Train_r2 = r2_score(Rtrain_y, svr_fit.predict(Rtrain_X))
print('Train R2: ', Train_r2)
PCCs = np.corrcoef(svr_test_y_predicted, Rtest_y)
RMSE = (mean_squared_error(Rtest_y, svr_test_y_predicted))**(1 / 2)
R_squared = r2_score(Rtest_y, svr_test_y_predicted)
print('r2:', R_squared)
print(PCCs)
print(RMSE)
#__________________________________________________________________
'''Adaboost Regression'''
from sklearn.ensemble import AdaBoostRegressor
abtR = AdaBoostRegressor()  #n_estimators=1000)

abtR.fit(Rtrain_X, Rtrain_y)

abtR_predicted = abtR.predict(Rtest_X)
abtR_PCCs = np.corrcoef(abtR_predicted, Rtest_y)
abtR_RMSE = (mean_squared_error(Rtest_y, abtR_predicted))**(1 / 2)
abtR_R_squared = r2_score(Rtest_y, abtR_predicted)
print('TRAIN SCORE: ', abtR.score(Rtrain_X, Rtrain_y), ' TEST SCORE: ',
      abtR.score(Rtest_X, Rtest_y), '\n')
print('#_____________________________________________________', '\n')

#__________________________________________________________________
Example #54
h_GbrModel = GradientBoostingRegressor()
h_rdModel = RandomForestRegressor()
h_sgdModel = SGDRegressor()
h_elnModel = ElasticNet()
x_train, x_test, y_train, y_test = model_selection.train_test_split(h_data.data,h_data.target, test_size = 0.3)

#h_normalizer = Normalizer()
h_scaler = MinMaxScaler()
#h_data.data = h_normalizer.fit_transform(h_data.data)
h_scaler.fit(x_train)
x_train = h_scaler.transform(x_train)
x_test = h_scaler.transform(x_test)

h_LnModel.fit(x_train,y_train)
h_SVRModel.fit(x_train,y_train)
h_nnModel.fit(x_train,y_train)
h_adaModel.fit(x_train,y_train)
h_GbrModel.fit(x_train,y_train)
h_rdModel.fit(x_train,y_train)
h_sgdModel.fit(x_train,y_train)
h_elnModel.fit(x_train,y_train)


print(metrics.r2_score(y_test, h_LnModel.predict(x_test)))
print(metrics.r2_score(y_test, h_SVRModel.predict(x_test)))
print(metrics.r2_score(y_test, h_nnModel.predict(x_test)))
print(metrics.r2_score(y_test, h_adaModel.predict(x_test)))
print(metrics.r2_score(y_test, h_GbrModel.predict(x_test)))
print(metrics.r2_score(y_test, h_rdModel.predict(x_test)))
print(metrics.r2_score(y_test, h_sgdModel.predict(x_test)))
print(metrics.r2_score(y_test, h_elnModel.predict(x_test)))
Example #55
from sklearn.model_selection import train_test_split

X_dev,X_eval, y_dev,y_eval = train_test_split(X, y,
                                              test_size=0.33, random_state=42)
X_train,X_test, y_train,y_test = train_test_split(X_dev, y_dev,
                                                  test_size=0.33, random_state=492)


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
rng = np.random.RandomState(1)
bdt = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                          n_estimators=300, random_state=rng)

bdt.fit(X_train, y_train)
print('BDT fitted')

import pickle
pickle.dump(bdt,open("bdt.joblib","wb"))

bdt = pickle.load(open("bdt.joblib","rb"))


y_predicted = bdt.predict(X)
y_predicted.dtype = [('Trigger_correction', 'float64')]

print(type(y_predicted))
print(len(y_predicted))
from root_numpy import array2root
Example #56
from sklearn.ensemble import AdaBoostRegressor

train = pd.read_csv('parkinsons_train.csv')
test = pd.read_csv('parkinsons_test.csv')

features = ["MDVP:Fo(Hz)", "MDVP:Fhi(Hz)", "MDVP:Flo(Hz)", "MDVP:Jitter(%)", "MDVP:Jitter(Abs)", "MDVP:RAP", "MDVP:PPQ", "Jitter:DDP", "MDVP:Shimmer", "MDVP:Shimmer(dB)", "Shimmer:APQ3", "Shimmer:APQ5", "MDVP:APQ", "Shimmer:DDA", "NHR", "HNR", "RPDE", "DFA", "spread1", "spread2", "D2", "PPE"]
# 64%
X = train[features]
y = train['status']

temp = test['status']

i = 1
j = 1

results = []
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=100), n_estimators=100)
regr.fit(X,y)
'''
for i,j in range(1000):
	acc = accuracy_score(regr.predict(test[jitter], y))
	if results.length() == 0 or acc > results[2]:
		results = [i, j, acc]

print('Use max_depth: ' + results[0] + ' , n_estimators: ' + results[1] + ' to get a maximum accuracy score of ' + results[2])
i = results[0]
j = results[1]
'''
pred5 = regr.predict(test[features])

print(str(accuracy_score(pred5, temp)))
X, y = shuffle(boston.data, boston.target)
offset = int(0.7*len(X))
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# We will vary the number of base learners from 2 to 300
max_learners = arange(2, 300)
train_err = zeros(len(max_learners))
test_err = zeros(len(max_learners))

for i, l in enumerate(max_learners):
    # Set up an AdaBoost Regression Learner with l base learners
    regressor = AdaBoostRegressor(n_estimators=l)

    # Fit the learner to the training data
    regressor.fit(X_train, y_train)

    # Find the MSE on the training set
    train_err[i] = mean_squared_error(y_train, regressor.predict(X_train))
    # Find the MSE on the testing set
    test_err[i] = mean_squared_error(y_test, regressor.predict(X_test))

# Plot training and test error as a function of the number of base learners
pl.figure()
pl.title('Boosting: Performance vs Number of Learners')
pl.plot(max_learners, test_err, lw=2, label = 'test error')
pl.plot(max_learners, train_err, lw=2, label = 'training error')
pl.legend()
pl.xlabel('Number of Learners')
pl.ylabel('Mean Squared Error')
pl.show()
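staged_predict (used in an earlier example above) gives the same error-vs-learners curves from a single fit instead of refitting for every learner count; a hedged sketch using the names already defined in this snippet:

# Sketch: one fit, then per-stage train/test error via staged_predict.
regressor = AdaBoostRegressor(n_estimators=300)
regressor.fit(X_train, y_train)

train_err_staged = [mean_squared_error(y_train, p)
                    for p in regressor.staged_predict(X_train)]
test_err_staged = [mean_squared_error(y_test, p)
                   for p in regressor.staged_predict(X_test)]

pl.figure()
pl.title('Boosting: Performance vs Number of Learners (staged_predict)')
pl.plot(range(1, len(test_err_staged) + 1), test_err_staged, lw=2, label='test error')
pl.plot(range(1, len(train_err_staged) + 1), train_err_staged, lw=2, label='training error')
pl.legend()
pl.xlabel('Number of Learners')
pl.ylabel('Mean Squared Error')
pl.show()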
Example #58
	def def_AdaBoostRegressor(self, estimators):
		abr_ = AdaBoostRegressor(n_estimators=estimators)
		abr_.fit(self.Xtrain, self.Ytrain)
		pred = abr_.predict(self.Xtest)

		return pred
Example #59
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, spredictions)))

coeffecients = pd.DataFrame(sgd.coef_,X.columns)
coeffecients.columns = ['Coeffecient']
#print(coeffecients)





#ADABOOST
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
from sklearn.ensemble import AdaBoostRegressor
abreg = AdaBoostRegressor(random_state=0, n_estimators=100)
abreg.fit(X_train,y_train)

abpredictions = abreg.predict( X_test)
#print(abpredictions)

plt.scatter(y_test,abpredictions)
plt.title('ADABOOST')
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

from sklearn import metrics

print('MAE:', metrics.mean_absolute_error(y_test, abpredictions))
print('MSE:', metrics.mean_squared_error(y_test, abpredictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, abpredictions)))
def decision_tree(X, y1, y2, y3):
    n, _ = X.shape
    nTrain = int(0.5 * n)  #training on 50% of the data
    Xtrain = X[:nTrain, :]
    ytrain = y1[:nTrain]
    ytrain_registered = y2[:nTrain]
    ytest_registered = y2[nTrain:]
    ytrain_casual = y3[:nTrain]
    ytest_casual = y3[nTrain:]
    Xtest = X[nTrain:, :]
    ytest = y1[nTrain:]

    #regular

    clf_1 = DecisionTreeRegressor(max_depth=None)
    clf_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None),
                              n_estimators=500)
    clf_4 = RandomForestRegressor(n_estimators=500,
                                  max_depth=None,
                                  min_samples_split=1,
                                  random_state=0)
    clf_5 = ExtraTreesRegressor(n_estimators=500,
                                max_depth=None,
                                min_samples_split=1,
                                random_state=0)
    clf_3 = GradientBoostingRegressor(n_estimators=500,
                                      max_depth=None,
                                      random_state=0)

    print "finished generating tree"

    clf_1.fit(Xtrain, ytrain_registered)
    clf_2.fit(Xtrain, ytrain_registered)
    clf_3.fit(Xtrain, ytrain_registered)
    clf_4.fit(Xtrain, ytrain_registered)
    clf_5.fit(Xtrain, ytrain_registered)

    print 'Finished fitting'

    dt_regular = clf_1.predict(Xtest)
    ada_regular = clf_2.predict(Xtest)
    grad_regular = clf_3.predict(Xtest)
    rf_regular = clf_4.predict(Xtest)
    et_regular = clf_5.predict(Xtest)

    #casual
    print "finished generating tree"

    clf_1.fit(Xtrain, ytrain_casual)
    clf_2.fit(Xtrain, ytrain_casual)
    clf_3.fit(Xtrain, ytrain_casual)
    clf_4.fit(Xtrain, ytrain_casual)
    clf_5.fit(Xtrain, ytrain_casual)

    print 'Finished fitting'

    dt_casual = clf_1.predict(Xtest)
    ada_casual = clf_2.predict(Xtest)
    grad_casual = clf_3.predict(Xtest)
    rf_casual = clf_4.predict(Xtest)
    et_casual = clf_5.predict(Xtest)
    feature_imps = clf_4.feature_importances_

    print "regular decision tree"
    print rmsle(ytest, dt_regular + dt_casual)
    print "boosted decision tree"
    print rmsle(ytest, ada_regular + ada_casual)
    print "gradient tree boosting"
    print rmsle(ytest, grad_regular + grad_casual)
    print "random forest classifier"
    print rmsle(ytest, rf_regular + rf_casual)
    print "extra trees classifier"
    print rmsle(ytest, et_casual + et_regular)

    print "feature importances"
    print feature_imps