class Regressor(BaseEstimator):
    """AdaBoost ensemble over random-forest base learners, exposed as a sklearn estimator."""

    def __init__(self):
        # 100 boosting rounds over 100-tree forests (depth 40, 25 features per split).
        self.clf = AdaBoostRegressor(
            RandomForestRegressor(n_estimators=100, max_depth=40, max_features=25),
            n_estimators=100)

    def fit(self, X, y):
        """Fit the underlying ensemble on (X, y).

        BUG FIX: now returns self, as the sklearn estimator API requires
        (enables chaining and use inside Pipeline/GridSearchCV).
        """
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return predictions from the fitted ensemble."""
        return self.clf.predict(X)
def fit(self, start_date, end_date):
    """Grid-search an AdaBoostRegressor per ticker and keep the model with the
    lowest MSE on the chronological CV split.

    Returns self (sklearn convention).
    """
    for ticker in self.tickers:
        self.stocks[ticker] = Stock(ticker)
    params_ada = [{
        'n_estimators': [25, 50, 100],
        'learning_rate': [0.01, 0.1, 1, 10],
        'loss': ['linear', 'square', 'exponential'],
    }]
    params = ParameterGrid(params_ada)
    # Find the split for training and CV
    mid_date = train_test_split(start_date, end_date)
    for ticker, stock in self.stocks.items():
        X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
        X_train = X_train.values
        X_cv, y_cv = stock.get_data(mid_date, end_date)
        X_cv = X_cv.values
        lowest_mse = np.inf
        for param in params:
            ada = AdaBoostRegressor(**param)
            ada.fit(X_train, y_train.values)
            mse = mean_squared_error(y_cv, ada.predict(X_cv))
            if mse <= lowest_mse:
                # BUG FIX: lowest_mse was never updated, so every candidate
                # passed the test and the last grid point always overwrote
                # the stored model regardless of its error.
                lowest_mse = mse
                self.models[ticker] = ada
    return self
def Round2(X, y):
    """Select the AdaBoost loss ('linear'/'square'/'exponential') with the lowest mean 5-fold RMSE.

    Returns a dict with keys 'loss' (best loss name) and 'scores' (its per-fold RMSE list).
    NOTE(review): Python 2 print statements and the old KFold(n, n_folds=...) API.
    """
    # Set parameters
    min_score = {}
    for loss in ['linear', 'square', 'exponential']:
        model = AdaBoostRegressor(loss=loss)
        n = len(y)
        # Perform 5-fold cross validation
        scores = []
        # shuffle=True without a random_state: fold assignment differs per call.
        kf = KFold(n, n_folds=5, shuffle=True)
        # Calculate mean absolute deviation for train/test for each fold
        for train_idx, test_idx in kf:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            # score = model.score(X_test, y_test)
            scores.append(rmse)
        # Keep the loss whose mean fold RMSE is the lowest seen so far.
        if len(min_score) == 0:
            min_score['loss'] = loss
            min_score['scores'] = scores
        else:
            if np.mean(scores) < np.mean(min_score['scores']):
                min_score['loss'] = loss
                min_score['scores'] = scores
        print "Loss:", loss
        print scores
        print np.mean(scores)
    return min_score
def round2(X_df, featurelist):
    """5-fold CV RMSE for AdaBoost after per-fold feature engineering.

    X_df must contain a 'target' column; applyFeatures/dfToArray (defined
    elsewhere) build the train/test matrices for each fold.
    Returns the list of per-fold RMSEs.
    NOTE(review): Python 2 print statements; old KFold(n, n_folds=...) API.
    """
    # Set parameters
    model = AdaBoostRegressor()
    y_df = X_df['target']
    n = len(y_df)
    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n, n_folds=5, shuffle=True)
    # Calculate mean absolute deviation for train/test for each fold
    for train_idx, test_idx in kf:
        X_train, X_test = X_df.iloc[train_idx, :], X_df.iloc[test_idx, :]
        # y_train, y_test = y_df[train_idx], y_df[test_idx]
        # Feature engineering is applied per fold so the test fold never
        # leaks into fitted transforms.
        X_train, X_test = applyFeatures(X_train, X_test, featurelist)
        Xtrain_array, ytrain_array, Xtest_array, ytest_array = dfToArray(X_train, X_test)
        model.fit(Xtrain_array, ytrain_array)
        prediction = model.predict(Xtest_array)
        rmse = np.sqrt(mean_squared_error(ytest_array, prediction))
        scores.append(rmse)
        print rmse
        print "Finish fold"
    return scores
def ada_boost_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """Fit AdaBoost on log-transformed targets and write a Kaggle submission CSV.

    :param train_x: training features
    :param train_y: training target (vote counts; trained on log(y+1))
    :param pred_x: test set to predict
    :param review_id: review ids aligned with pred_x rows (submission key)
    :param v_curve: if True, plot a validation curve over n_estimators
    :param l_curve: if True, plot a learning curve
    :param get_model: if True, fit and write submission_ada.csv
    :return: None; side effects are the CSV file and the plots
    NOTE(review): Python 2 print statements.
    """
    ada = AdaBoostRegressor(n_estimators=5)
    if get_model:
        print "Fitting Ada..."
        # Train in log space; invert with exp(.)-1 at prediction time.
        ada.fit(train_x, np.log(train_y+1))
        ada_pred = np.exp(ada.predict(pred_x))-1
        # Column vectors so they can be concatenated side by side.
        Votes = ada_pred[:,np.newaxis]
        Id = np.array(review_id)[:,np.newaxis]
        # create submission csv for Kaggle
        submission_ada = np.concatenate((Id,Votes),axis=1)
        np.savetxt("submission_ada.csv", submission_ada, header="Id,Votes",
                   delimiter=',', fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(AdaBoostRegressor(), "Learning curve: Adaboost",
                            train_x, np.log(train_y+1.0))
    if v_curve:
        print "Working on Validation Curves"
        plot_validation_curve(AdaBoostRegressor(), "Validation Curve: Adaboost",
                              train_x, np.log(train_y+1.0),
                              param_name="n_estimators",
                              param_range=[2, 5, 10, 15, 20, 25, 30])
def train_learning_model_decision_tree_ada_boost(df):
    """Train a plain depth-6 decision tree and an AdaBoosted depth-6 tree on df,
    then print train/test MSE for both models."""
    #code taken from sklearn
    features, target = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(features, target)

    plain_tree = DecisionTreeRegressor(max_depth=6)
    boosted_tree = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6),
                                     n_estimators=500,
                                     learning_rate=0.01,
                                     random_state=1)

    plain_tree.fit(X_train, y_train)
    boosted_tree.fit(X_train, y_train)

    # Generalization error on the held-out split.
    mse_tree = mean_squared_error(y_test, plain_tree.predict(X_test))
    mse_ada = mean_squared_error(y_test, boosted_tree.predict(X_test))
    # Training error, for an overfitting check.
    mse_tree_train = mean_squared_error(y_train, plain_tree.predict(X_train))
    mse_ada_train = mean_squared_error(y_train, boosted_tree.predict(X_train))

    print ("MSE tree: %.4f " %mse_tree)
    print ("MSE ada: %.4f " %mse_ada)
    print ("MSE tree train: %.4f " %mse_tree_train)
    print ("MSE ada train: %.4f " %mse_ada_train)
def backTest(trainEndDate, code, testDate, predictDate):
    """Back-test an AdaBoost model for one stock code.

    Trains on all history up to trainEndDate (target = close shifted -2 days),
    then loads the row(s) for testDate to predict from.
    NOTE(review): Python 2 syntax (`print x`, `except Exception, e`); the
    deprecated DataFrame.ix indexer; `predictDate` is unused in this view —
    confirm against the rest of the file.
    """
    conn = db.get_history_data_db('D')
    df = None
    # train more date
    # model = pickle.load(open('%s/%s.pkl' % (config.model_dir, code), 'r'))
    rng = np.random.RandomState(1)
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                              n_estimators=1000, random_state=rng, loss='square')
    df = pd.read_sql_query(
        "select * from history_data where date([date])<='%s' and code='%s' order by code, date([date]) asc" % (
            trainEndDate, code), conn)
    # Target is the close price two trading days ahead.
    shift_1 = df['close'].shift(-2)
    df['target'] = shift_1
    # Drop rows whose shifted target is NaN (NaN fails the comparison).
    data = df[df['target'] > -1000]
    X_train = data.ix[:, 'code':'turnover']
    y_train = data.ix[:, 'target']
    # Require a minimum amount of history before fitting.
    if len(X_train) < 500:
        return
    print len(X_train)
    # print data
    # for i in range(0, 10):
    #     model.fit(X_train, y_train)
    model.fit(X_train, y_train)
    # predict tomorrow
    try:
        df = pd.read_sql_query(config.sql_history_data_by_code_date % (code, testDate), conn)
        # print df
    except Exception, e:
        print e
def main():
    """Fit an AdaBoost regressor and report MSE/RMSE/log-loss on the
    train and validation splits (X_train/y_train/X_val/y_val are globals)."""
    ab = AdaBoostRegressor(base_estimator=None, n_estimators=50,
                           learning_rate=1.0, loss='exponential',
                           random_state=None)
    ab.fit(X_train, y_train)
    #Evaluation in train set
    pred_proba_train = ab.predict(X_train)
    mse_train = mean_squared_error(y_train, pred_proba_train)
    rmse_train = np.sqrt(mse_train)
    # NOTE(review): log_loss is a classification metric; it only makes sense
    # here if y and the predictions are valid probabilities — confirm.
    logloss_train = log_loss(y_train, pred_proba_train)
    #Evaluation in validation set
    pred_proba_val = ab.predict(X_val)
    mse_val = mean_squared_error(y_val, pred_proba_val)
    rmse_val = np.sqrt(mse_val)
    logloss_val = log_loss(y_val, pred_proba_val)
    # BUG FIX: these four were bare expressions with no effect; print the
    # metrics so the evaluation is actually visible.
    print(rmse_train)
    print(rmse_val)
    print(logloss_train)
    print(logloss_val)
def predict_volatility_1year_ahead(rows, day, num_days):
    """
    SUMMARY: Predict volatility 1 year into the future

    ALGORITHM:
        a) The predictor trains on all data up to exactly 1 year (252 trading days) before `day`
        b) The newest `num_days` days up to and including `day` form the prediction feature vector
    INPUT: minimum of (1 year + num_days) of data before `day` (newest data is day=0);
           rows columns 7-13 are the per-day features, column 9 is the target.
    """
    '''enforce that `day` is in the required range'''
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    '''Compile features for fitting'''
    feature_sets = []
    value_sets = []
    for ii in range(day+num_days+252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            # Columns 7..13 of each day in the window.
            features += [
                float(rows[day_index][7]),
                float(rows[day_index][8]),
                float(rows[day_index][9]),
                float(rows[day_index][10]),
                float(rows[day_index][11]),
                float(rows[day_index][12]),
                float(rows[day_index][13]),
            ]
        feature_sets += [features]
        # Target: column 9 exactly 252 trading days *after* the window start
        # (newest rows first, so "-252" moves forward in time).
        value_sets += [float(rows[ii-252][9])]

    '''Create Regressor and fit'''
    # BUG FIX: removed unused local `num_features = 16` (dead code).
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=3, random_state=rng)
    regr.fit(feature_sets, value_sets)

    '''Get prediction features'''
    features = []
    for jj in range(num_days):
        day_index = day + jj
        features += [
            float(rows[day_index][7]),
            float(rows[day_index][8]),
            float(rows[day_index][9]),
            float(rows[day_index][10]),
            float(rows[day_index][11]),
            float(rows[day_index][12]),
            float(rows[day_index][13]),
        ]
    return float(regr.predict([features]))
class Regressor(BaseEstimator):
    """AdaBoost (40 rounds) over large random forests (500 trees, depth 78)."""

    def __init__(self):
        self.clf = AdaBoostRegressor(
            RandomForestRegressor(n_estimators=500, max_depth=78, max_features=10),
            n_estimators=40)

    def fit(self, X, y):
        """Fit the ensemble; BUG FIX: return self per the sklearn estimator API."""
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return predictions from the fitted ensemble."""
        return self.clf.predict(X)
def train_predict(train_id, test_id):
    """Train 10 per-output AdaBoost models and log cosine-similarity per boosting stage.

    One model is trained per output dimension (10 total); staged_predict gives
    predictions after each boosting round, and the loop at the end rebuilds the
    10-dim prediction vectors stage by stage to log train/test similarity.
    Returns 0 on completion.
    NOTE(review): Python 2 print statements; `params`, load_libsvm_files,
    t_now and mean_cos_similarity are defined elsewhere.
    """
    # load libsvm files for training dataset
    Xs_train = []
    ys_train = []
    n_train = load_libsvm_files(train_id, Xs_train, ys_train)
    # load libsvm files for testing dataset
    Xs_test = []
    ys_test = []
    n_test = load_libsvm_files(test_id, Xs_test, ys_test)
    # models
    model = []
    # ans: per-output lists of staged predictions
    ans_train = []
    ans_test = []
    # generate predictions for training dataset
    ps_train = []
    for i in range(0, n_train):
        ps_train.append([0.0 for j in range(10)])
    # generate predictions for testing dataset
    ps_test = []
    for i in range(0, n_test):
        ps_test.append([0.0 for j in range(10)])
    # fit models: one AdaBoost regressor per output dimension
    for i in range(10):
        l = np.array([ys_train[j][i] for j in range(n_train)])
        clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=params['max_depth']),
                                n_estimators=params['n_estimators'],
                                learning_rate=params['learning_rate'])
        clf.fit(Xs_train[i].toarray(), l)
        print "[%s] [INFO] %d model training done" % (t_now(), i)
        # staged_predict yields one prediction array per boosting round.
        preds_train = clf.staged_predict(Xs_train[i].toarray())
        ans_train.append([item for item in preds_train])
        # print "len(ans_train[%d]) = %d" % (i, len(ans_train[i]))
        print "[%s] [INFO] %d model predict for training data set done" % (t_now(), i)
        preds_test = clf.staged_predict(Xs_test[i].toarray())
        ans_test.append([item for item in preds_test])
        print "[%s] [INFO] %d model predict for testing data set done" % (t_now(), i)
    #print "len_ans_train=%d" % len(ans_train[0])
    # predict for testing data set, stage by stage
    for i in range(params['n_estimators']):
        for j in range(10):
            # Early stopping may leave fewer stages than n_estimators; clamp.
            tmp = min(i, len(ans_train[j]) - 1)
            for k in range(n_train):
                ps_train[k][j] = ans_train[j][tmp][k]
            tmp = min(i, len(ans_test[j]) - 1)
            for k in range(n_test):
                ps_test[k][j] = ans_test[j][tmp][k]
        print "%s,%d,%f,%f" % (t_now(), i + 1,
                               mean_cos_similarity(ys_train, ps_train, n_train),
                               mean_cos_similarity(ys_test, ps_test, n_test))
    return 0
def AdaBoost(xTrain, yTrain, xTest, yTest, treeNum):
    """Try each n_estimators value in treeNum and return the best one.

    Returns (best n_estimators, its test RMSE).
    """
    rms = dict()
    for trees in treeNum:
        ab = AdaBoostRegressor(n_estimators = trees)
        ab.fit(xTrain, yTrain)
        yPred = ab.predict(xTest)
        rms[trees] = sqrt(mean_squared_error(yTest, yPred))
    # BUG FIX: dict.iteritems() is Python-2-only, and sorting the whole dict
    # just to take the first element is O(n log n). min() with a key is O(n),
    # picks the same (first) minimum on ties, and works on Python 2 and 3.
    (bestRegressor, rmse) = min(rms.items(), key = operator.itemgetter(1))
    return bestRegressor, rmse
class Regressor(BaseEstimator):
    """AdaBoost (100 rounds) over small random forests (10 trees, depth 10)."""

    def __init__(self):
        cl = RandomForestRegressor(n_estimators=10, max_depth=10, max_features=10)
        self.clf = AdaBoostRegressor(base_estimator=cl, n_estimators=100)

    def fit(self, X, y):
        """Fit the ensemble; BUG FIX: return self per the sklearn estimator API."""
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return predictions from the fitted ensemble."""
        return self.clf.predict(X)
def ada_boost(data, classifier, sample):
    """Fit an AdaBoost regressor with a KMeans base estimator and print the
    prediction for `sample` via print_result (defined elsewhere)."""
    from sklearn.ensemble import AdaBoostRegressor
    from sklearn.cluster import KMeans
    # BUG FIX: `func` was assigned three times in a row (GaussianNB,
    # DecisionTreeRegressor, then KMeans); only the last assignment survived,
    # so the first two constructions (and their imports) were dead code.
    func = KMeans(n_clusters=2)
    # NOTE(review): the stdlib `random` module has no RandomState; this only
    # works if `random` is numpy.random here — confirm the file's imports.
    clf = AdaBoostRegressor(func, n_estimators=300, random_state=random.RandomState(1))
    clf.fit(data, classifier)
    print_result(clf, [sample])
def test_boston():
    """Consistency checks for AdaBoostRegressor on the Boston housing data."""
    reg = AdaBoostRegressor(random_state=0)
    reg.fit(boston.data, boston.target)
    # In-sample R^2 should be high on this easy dataset.
    assert reg.score(boston.data, boston.target) > 0.85
    # More than one boosting round must have been kept.
    assert len(reg.estimators_) > 1
    # Every fitted estimator gets a distinct random state (see issue #7408).
    states = set(est.random_state for est in reg.estimators_)
    assert_equal(len(states), len(reg.estimators_))
def performAdaBoostReg(train, test, features, output):
    """
    Ada Boost Regression

    Fits a default AdaBoostRegressor on train[features] -> train[output],
    plots actual vs predicted for the test set, and returns (MSE, R^2).
    """
    model = AdaBoostRegressor()
    model.fit(train[features], train[output])
    predictions = model.predict(test[features])
    # Overlay the actual series (default colour) and the predictions (red).
    plt.plot(test[output])
    plt.plot(predictions, color='red')
    plt.show()
    actual = test[output]
    return mean_squared_error(actual, predictions), r2_score(actual, predictions)
def do_adaboost(filename):
    """Fit AdaBoost on the merged dataset for `filename` and return a frame of
    predictions for the first 200 rows (driver, trip, probs)."""
    merged, target = create_merged_dataset(filename)
    # Ideas:
    # Create a feature for accelerations e deacceleration.
    # Default base regressor (decision tree) is kept on purpose; extra trees
    # were tried with catastrophic results.
    booster = AdaBoostRegressor(n_estimators=500, learning_rate=1)
    features = merged.drop(['driver', 'trip'], 1)
    booster.fit(features, target)
    head_probs = booster.predict(features[:200])
    return pd.DataFrame({
        'driver': merged['driver'][:200],
        'trip': merged['trip'][:200],
        'probs': head_probs,
    })
def predict_volatility_1year_ahead(rows, day):
    """
    SUMMARY: Predict volatility 1 year into the future

    ALGORITHM:
        a) The predictor will train on all data up to exactly 1 year (252 trading days) before `day`
        b) The newest 10 days up to and including `day` will be used as the feature vector
           for the prediction
    INPUT: minimum of (1 year + 10 days) of data before `day` (newest data is day=0);
           rows columns 1,2,3,5,7,8,9 are the per-day features, column 9 is the target.
    """
    #num_days = 10
    num_days = 10
    # enforce that `day` is in the required range
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0
    # compile features (X) and values (Y)
    feature_sets = []
    value_sets = []
    value_sets_index = []
    for ii in range(day+252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            # Seven selected columns for each day in the window.
            features += [float(rows[day_index][1]), float(rows[day_index][2]),
                         float(rows[day_index][3]), float(rows[day_index][5]),
                         float(rows[day_index][7]), float(rows[day_index][8]),
                         float(rows[day_index][9])]
        feature_sets += [features]
        # Target: column 9 exactly 252 rows earlier (newest-first ordering,
        # so "-252" is one trading year ahead in time).
        value_sets += [float(rows[ii-252][9])]
        value_sets_index.append([ii-252])
    # fit
    #regr = linear_model.Lasso(alpha=0.01,fit_intercept=False,normalize=False,max_iter=10000000) # they call lambda alpha
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=4, random_state=rng)
    #regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=2, random_state=rng)
    #regr = DecisionTreeRegressor(max_depth=4)
    regr.fit(feature_sets, value_sets)
    #print "Adaboost weights:", regr.estimator_weights_
    ii = day
    features = []
    for jj in range( num_days ):
        # NOTE(review): unlike the training windows, the prediction window is
        # offset by +252 here — confirm this asymmetry is intentional.
        day_index = ii + jj +252
        features += [float(rows[day_index][1]), float(rows[day_index][2]),
                     float(rows[day_index][3]),
                     float(rows[day_index][5]), float(rows[day_index][7]),
                     float(rows[day_index][8]), float(rows[day_index][9])]
    return float(regr.predict([features]))
def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work without sample_weights in the base estimator

    The random weighted sampling is done internally in the _boost method in
    AdaBoostRegressor.
    """
    class MinimalEstimator(BaseEstimator):
        # Base learner whose fit() accepts no sample_weight at all.
        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    booster = AdaBoostRegressor(MinimalEstimator(), n_estimators=3)
    booster.fit(X, y_regr)

    # One weight and one error per fitted boosting round.
    assert_equal(len(booster.estimator_weights_), len(booster.estimator_errors_))
def ada_learning(labels, train, test):
    """Fit AdaBoost over a gradient-boosting base model on log1p-transformed
    labels and return expm1-inverted predictions for `test`."""
    log_labels = np.log1p(labels)
    # try 50 / 1.0
    #boost GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1)
    base = GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1)
    booster = AdaBoostRegressor(base, n_estimators=50, learning_rate=1.0)
    fitted = booster.fit(train, log_labels)
    raw_preds = fitted.predict(test)
    # Invert the log1p transform applied to the labels.
    return np.expm1(raw_preds)
def initGrid(X, y):
    """Grid-search a DecisionTreeRegressor on (X, y), print the best params,
    then fit an AdaBoostRegressor wrapped around the search and return it."""
    # BUG FIX: removed unused locals `n_estimators` and `bootstrap` — they
    # were defined but never placed in the grid (dead code).
    grid = {
        'min_samples_split': [2, 4, 6, 8],
        'max_depth': [2, 4, 6, 8],
        'min_samples_leaf': [2, 4, 6, 8],
    }
    model = DecisionTreeRegressor()
    gs = GridSearchCV(estimator=model, param_grid=grid, verbose=10, n_jobs=-1)
    gs.fit(X, y)
    print(gs.best_params_)
    # NOTE(review): boosting the GridSearchCV object itself re-runs the whole
    # search on every boosting round; gs.best_estimator_ was likely intended.
    search = AdaBoostRegressor(gs)
    search.fit(X, y)
    return search
def run_tree_regressor():
    """Sweep AdaBoost n_estimators over depth-35 trees on the Kaggle data and
    print the test R^2 for each setting.

    NOTE(review): Python 2 print statements; hard-coded local file paths.
    """
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import train_test_split
    import numpy as np
    from sklearn.ensemble import AdaBoostRegressor
    print "running me"
    X = np.genfromtxt("/home/john/Downloads/kaggle.X1.train.txt",delimiter=",")  # load the text file
    Y = np.genfromtxt("/home/john/Downloads/kaggle.Y.train.txt",delimiter=",")
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)
    rng = np.random.RandomState(1)
    depth = 35  # current lowest
    # NOTE: the same rng instance is shared across iterations, so each fit
    # consumes from a single random stream.
    for estimators in [130,235,300,345,450]:
        treeAdaBoost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=depth),
                                         n_estimators=estimators, random_state=rng)
        treeAdaBoost.fit(x_train, y_train)
        print "adabost estimators @ " + str(estimators) + ":", treeAdaBoost.score(x_test, y_test)
def round1(X, y):
    """Return the list of per-fold RMSEs from 5-fold CV of a default AdaBoostRegressor."""
    model = AdaBoostRegressor()
    # Old-style KFold API: first argument is the sample count.
    folds = KFold(len(y), n_folds=5, shuffle=True)
    scores = []
    for train_idx, test_idx in folds:
        model.fit(X[train_idx], y[train_idx])
        predicted = model.predict(X[test_idx])
        fold_rmse = np.sqrt(mean_squared_error(y[test_idx], predicted))
        scores.append(fold_rmse)
    return scores
def svm_smooth(data, residual_imf, period):
    """Smooth `data` with an AdaBoost-wrapped SVR trained on sliding windows,
    labelled by the corresponding residual_imf values.

    The first period+1 points are passed through unchanged; every later point
    is replaced by the model's prediction for its trailing window.
    """
    windows = []
    targets = []
    # Build (period+1)-long trailing windows ending at i, skipping the last 20
    # residual samples.
    for i in range(period, len(residual_imf) - 20):
        windows.append(data[i - period:i + 1])
        targets.append(residual_imf[i])

    rng = np.random.RandomState(1)
    model = AdaBoostRegressor(svm.SVR(), n_estimators=1, random_state=rng)
    model.fit(windows, targets)

    smooth_data = []
    for i in range(len(data)):
        if i <= period:
            # Not enough history for a full window yet; keep the raw value.
            smooth_data.append(data[i])
        else:
            smooth_data.append(model.predict([data[i - period:i + 1]])[0])
    return smooth_data
def train_model(training, testing, window=5, n=5):
    """Fit four regressors (RF, GBR, AdaBoost-tree, bagged KNN), print each MSE,
    blend them 0.1/0.2/0.1/0.6, and return the testing frame annotated with
    trend/pred/pred_date columns.

    NOTE(review): Python 2 print statements; deprecated Series.sort() and
    DataFrame.ix; `window` and `n` are unused in this view — confirm.
    """
    X_train, y_train = prepare_data(training)
    X_test, y_test = prepare_data(testing)
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    predrf = rf.predict(X_test)
    print "mse for random forest regressor: ", mean_squared_error(predrf, y_test)
    gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
    gb.fit(X_train, y_train)
    predgb = gb.predict(X_test)
    print "mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test)
    ## plot feature importance using GBR results
    fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility'])
    fx_imp /= fx_imp.max()  # normalize
    fx_imp.sort()
    ax = fx_imp.plot(kind='barh')
    fig = ax.get_figure()
    fig.savefig("output/feature_importance.png")
    adb = AdaBoostRegressor(DecisionTreeRegressor())
    adb.fit(X_train, y_train)
    predadb = adb.predict(X_test)
    print "mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test)
    # KNN needs standardized features; fit the scaler on train only.
    scale = StandardScaler()
    scale.fit(X_train)
    X_trainscale = scale.transform(X_train)
    X_testscale = scale.transform(X_test)
    knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5)
    knn.fit(X_trainscale, y_train)
    predknn = knn.predict(X_testscale)
    print "mse for bagging knn regressor: ", mean_squared_error(predknn, y_test)
    # Fixed-weight blend of the four model predictions.
    pred_test = 0.1*predrf+0.2*predgb+0.1*predadb+0.6*predknn
    print "mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test)
    result = testing.copy()
    result.ix[5:-5, 'trend'] = pred_test
    result.ix[10:, 'pred'] = pred_test * result.ix[5:-5, 'IBM'].values
    result.ix[:-5, 'pred_date'] = result.index[5:]
    return result
def xgb_train(x_train, x_label, x_test): model = 'xgb' #model = 'adaboost' #if model.count('xgb') >0: params = {} params["objective"] = "reg:linear" params["eta"] = 0.005 # [0,1] params["min_child_weight"] = 6 params["subsample"] = 0.7 params["colsample_bytree"] = 0.7 params["scale_pos_weight"] = 1.0 params["silent"] = 1 params["max_depth"] = 9 if config.nthread > 1: params["nthread"] = 1 num_rounds = 10000 xgtrain = xgb.DMatrix(x_train, label=x_label) xgval = xgb.DMatrix(x_test) #train using early stopping and predict watchlist = [(xgtrain, "train")] #model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120, feval=gini_metric) model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120) pred1 = model.predict( xgval ) #clf = RandomForestRegressor() #clf = LogisticRegression() #clf = GradientBoostingRegressor() clf = AdaBoostRegressor( ExtraTreesRegressor(max_depth=9), n_estimators=200 ) clf.fit(x_train, x_label) pred2 = clf.predict(x_test) #pred = pred1 * pred2 / (pred1 + pred2) #pred = 0.7 * (pred1**0.01) + 0.3 * (pred2**0.01) #pred = (pred1.argsort() + pred2.argsort()) / 2 pred = 0.6 * pred1 + 0.4 * pred2 return pred
def train_and_predict_adab_stacked_gbr (train, labels, test, feature_names = None) :
    """Train AdaBoost with a GradientBoostingRegressor base model and return
    predictions for `test`.

    BUG(review): in the `gridSearch` branch `model` is never assigned (the
    GridSearchCV construction is commented out), so model.fit below raises
    NameError when gridSearch is truthy; only the else branch works.
    `gridSearch` and `randomState` are globals defined elsewhere.
    """
    print ("Training ADABoost with GBR as base model")
    t0 = time.clock()
    if (gridSearch) :
        params_dict = {'adab__learning_rate' : [0.1, 0.3]}
        #model = GridSearchCV(regr, params_dict, n_jobs = 3, cv = kfObject, verbose = 10, scoring = 'mean_squared_error')
    else :
        base = GradientBoostingRegressor(random_state = randomState, learning_rate = 0.1,
                                         n_estimators = 1500, max_depth = 6,
                                         subsample = 0.95, max_features = 1, verbose = 10)
        model = AdaBoostRegressor(random_state = randomState, base_estimator = base,
                                  n_estimators = 3, learning_rate = 0.005)
    model.fit(train, labels)
    print ("Model fit completed in %.3f sec " %(time.clock() - t0))
    if (gridSearch) :
        print ("Best estimator: ", model.best_estimator_)
        print ("Best MSLE scores: %.4f" %(model.best_score_))
        print ("Best RMSLE score: %.4f" %(math.sqrt(-model.best_score_)))
    else :
        # Format each importance to 4 decimals before pairing with its name.
        float_formatter = lambda x: "%.4f" %(x)
        print ("Feature importances: ", sorted(zip([float_formatter(x) for x in model.feature_importances_], feature_names), reverse=True))
    return model.predict(test)
ridge_model_full_data = ridge.fit(X_train, y_train) #print('Fitting SVR...') #svr_model_full_data = svr.fit(X_train,y_train) print('Fitting GradientBoosting...') gbr_model_full_data = gbr.fit(X_train, y_train) print('Fitting XGBoost...') xgb_model_full_data = xgboost.fit(X_train, y_train) print('Fitting LightGBM...') lgb_model_full_data = lightgbm.fit(X_train, y_train) print('Fitting AdaBoost...') adaboost_model_full_data = adaboost.fit(X_train, y_train) #print('Fitting extratrees...') #extratrees_model_full_data = extratrees.fit(X_train,y_train) #print('Fitting Bagging...') #bagging_model_full_data = bagging.fit(X_train,y_train) print('Done fitting all models') # In[57]: #Blending model predictions print('Blending model predictions...') #def blend_models_predict(X): # return ((0.045 * elastic_model_full_data.predict(X))+(0.5 * lasso_model_full_data.predict(X))
#Store the accuracy results for each model in a dataframe for final comparison tempResultsDf = pd.DataFrame({'Model':['Bagging'],'Training_Score': acc_BG_train, 'Test_Score': acc_BG, 'K_Fold_Mean': kf_res_mean, 'K_Fold_Std': kf_res_std}) resultsDf = pd.concat([resultsDf, tempResultsDf]) resultsDf = resultsDf[['Model', 'Training_Score','Test_Score','K_Fold_Mean','K_Fold_Std']] resultsDf # # Ensemble Technique - AdaBoosting # In[201]: from sklearn.ensemble import AdaBoostRegressor abcl = AdaBoostRegressor(n_estimators=50,random_state=1) abcl = abcl.fit(X_train, y_train) # In[204]: y_predict = abcl.predict(X_test) abcl_train=abcl.score(X_train , y_train) print("Ada Boosting - Train Accuracy:",abcl_train) abcl_test = abcl.score(X_test , y_test) print("Ada Boosting - Test Accuracy:",abcl_test) results = cross_val_score(abcl, X, y, cv=kfold, scoring='r2') print(results) kf_res_mean=results.mean()*100.0 kf_res_std=results.std()*100.0
'linear', 'square', 'exponential', ] min_mean = 999999 minloss = '' min_n = 0 data_list = [] for loss in losstype: for n in range(100, 5000, 100): ada_1 = AdaBoostRegressor(n_estimators=n, loss=loss) ada_1.fit(X, Y) ysame = ada_1.predict(X) mean_diff = abs(Y - ysame).mean() print loss + ' ' + str(n) + ' ' + str(mean_diff) data_list.append([loss, n, mean_diff]) if (mean_diff < min_mean): min_mean = mean_diff minloss = loss min_n = n with open("ada_data.csv", "w") as f: writer = csv.writer(f) writer.writerows(data_list)
def price_predictions(ticker, start, end, forecast_out):
    """Load a ticker's OHLCV CSV, build technical-indicator features, and print
    train/test scores plus `forecast_out`-day-ahead forecasts from four models
    (KNN, bagged trees, random forest, AdaBoost-KNN).

    NOTE(review): deprecated fillna(method=...) / DataFrame.drop(axis as
    positional) usage; helper functions and talib come from elsewhere.
    """
    file_path = symbol_to_path(ticker)
    df = pd.read_csv(file_path, index_col="<DTYYYYMMDD>", parse_dates=True,
                     usecols=["<DTYYYYMMDD>", "<OpenFixed>", "<HighFixed>",
                              "<LowFixed>", "<CloseFixed>", "<Volume>"],
                     na_values="nan")
    df = df.rename(
        columns={'<DTYYYYMMDD>': 'Date', "<OpenFixed>": 'Open',
                 '<HighFixed>': 'High', '<LowFixed>': 'Low',
                 '<CloseFixed>': 'Close', '<Volume>': 'Volume'})
    # columns order for backtrader type
    columnsOrder = ["Open", "High", "Low", "Close", "Volume", "OpenInterest"]
    # change the index by new index
    df = df.reindex(columns=columnsOrder)
    # change date index to increasing order
    df = df.sort_index()
    # take a part of dataframe
    df = df.loc[start:end]
    # Derived percentage features.
    df['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
    # Indicator window lengths.
    bbwindow = 25
    vlwindow = 10
    mmtum = 10
    df['BB_Value'] = compute_indicator_bb(df, window=bbwindow)
    df['Volatility'] = compute_indicator_volatility(df, timeperiod=vlwindow)
    df['Momentum'] = talib.MOM(df['Close'].values, timeperiod=mmtum)
    df['OBV'] = talib.OBV(df['Close'].values, df['Volume'].values.astype(np.float64))
    df['MACD'], _, _ = talib.MACD(df['Close'].values, fastperiod=12,
                                  slowperiod=26, signalperiod=9)
    _, df['STOCH'] = talib.STOCH(df['High'].values, df['Low'].values,
                                 df['Close'].values, fastk_period=14,
                                 slowk_period=1, slowd_period=5)
    df['MFI'] = talib.MFI(df['High'].values, df['Low'].values,
                          df['Close'].values,
                          df['Volume'].values.astype(np.float64), timeperiod=14)
    # df['EMA3'] = pd.Series(pd.Series.ewm(df['Close'], span = 3, min_periods = 3-1).mean())
    # df['EMA6'] = pd.Series(pd.Series.ewm(df['Close'], span = 6, min_periods = 6-1).mean())
    # df['EMA18'] = pd.Series(pd.Series.ewm(df['Close'], span = 18, min_periods = 18-1).mean())
    df['PDI'] = talib.PLUS_DI(df['High'].values, df['Low'].values,
                              df['Close'].values, timeperiod=14)
    df['NDI'] = talib.MINUS_DI(df['High'].values, df['Low'].values,
                               df['Close'].values, timeperiod=14)
    # df = df[['Close', 'HL_PCT', 'PCT_change', 'Volume','BB_Value',
    #          'Volatility', 'Momentum', 'MACD', 'STOCH', 'MFI', 'OBV']]
    # df = df[['Close', 'HL_PCT', 'PCT_change', 'Volume', 'BB_Value']]
    df.fillna(method="ffill", inplace=True)
    df.fillna(method="backfill", inplace=True)
    forecast_col = 'Close'
    #inplace : boolean, default False
    # If True, fill in place. Note: this will modify any other views on this object,
    # (e.g. a no-copy slice for a column in a DataFrame).
    # Forecast horizon: the Close price forecast_out rows ahead becomes the label.
    df['Target'] = df[forecast_col].shift(-forecast_out)
    # drop() removes the label column so X holds only the features
    #axis : int or axis name: column
    # Whether to drop labels from the index (0 / 'index') or columns (1 / 'columns').
    X = np.array(df.drop(['Target'], 1))
    y_true = df[forecast_col][-forecast_out:]
    # Preprocessing Input Data
    X = preprocessing.scale(X)
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler()
    #X = scaler.fit_transform(X)
    # Split off X_lately (the last forecast_out rows, which have no label).
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    # Drop the NA rows
    # df.dropna(inplace=True)
    # y is the label vector taken from the Target column
    y = np.array(df['Target'].dropna())
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #X_train, X_test, y_train, y_test = train_test_split(X, y)
    #from sklearn.preprocessing import MinMaxScaler
    #from sklearn.preprocessing import StandardScaler
    #scaler = MinMaxScaler()
    #scaler = StandardScaler()
    #X_train = scaler.fit_transform(X_train)
    #X_test = scaler.transform(X_test)
    #X_lately = scaler.transform(X_lately)
    n_neighbors = 5
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights='uniform')
    knn.fit(X_train, y_train)
    print('Train score KNN: ', knn.score(X_train, y_train),
          'Test score KNN : ', knn.score(X_test, y_test))
    forecast_set = knn.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)
    bagging = BaggingRegressor(DecisionTreeRegressor(),
                               n_estimators=50, random_state=50)
    bagging.fit(X_train, y_train)
    print('Train score BAG: ', bagging.score(X_train, y_train),
          'Test score BAG : ', bagging.score(X_test, y_test))
    forecast_set = bagging.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)
    rf = RandomForestRegressor(n_estimators=50, random_state=50)
    rf.fit(X_train, y_train)
    print('Train score RF: ', rf.score(X_train, y_train),
          'Test score RF : ', rf.score(X_test, y_test))
    forecast_set = rf.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)
    adaboost = AdaBoostRegressor(neighbors.KNeighborsRegressor(n_neighbors=5),
                                 n_estimators=30, random_state=0)
    #adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
    #                             n_estimators=30, random_state=0)
    adaboost.fit(X_train, y_train)
    print('Train score Ada: ', adaboost.score(X_train, y_train),
          'Test score Ada : ', adaboost.score(X_test, y_test))
    forecast_set = adaboost.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)
X, y = shuffle(housing_data.data, housing_data.target, random_state=7) # Split the data 80/20 (80% for training, 20% for testing) num_training = int(0.8 * len(X)) X_train, y_train = X[:num_training], y[:num_training] X_test, y_test = X[num_training:], y[num_training:] # Fit decision tree regression model dt_regressor = DecisionTreeRegressor(max_depth=4) dt_regressor.fit(X_train, y_train) # Fit decision tree regression model with AdaBoost ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=400, random_state=7) ab_regressor.fit(X_train, y_train) # Evaluate performance of Decision Tree regressor y_pred_dt = dt_regressor.predict(X_test) mse = mean_squared_error(y_test, y_pred_dt) evs = explained_variance_score(y_test, y_pred_dt) print("\n#### Decision Tree performance ####") print("Mean squared error =", round(mse, 2)) # Evaluate performance of AdaBoost y_pred_ab = ab_regressor.predict(X_test) mse = mean_squared_error(y_test, y_pred_ab) evs = explained_variance_score(y_test, y_pred_ab) print("\n#### AdaBoost performance ####") print("Mean squared error =", round(mse, 2)) print("Explained variance score =", round(evs, 2))
0: -1] #names will be replaced by features directly taken from user selection X = data[feature_cols] y = data[names[-1]] #names replaced by target taken from user selection #print(X.shape) #print(y.shape) '''#preprocessing output in integers le = preprocessing.LabelEncoder() le.fit(y) Encoded_classes = list(le.classes_) y = list(map(int, le.transform(y)))''' validation_size = 0.20 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=validation_size, random_state=10) # Instantiate abc = AdaBoostRegressor() # Fit abc.fit(X_train, y_train) # Predict y_pred = abc.predict(X_test) accuracy = np.sqrt(metrics.mean_squared_error(y_test, y_pred)) print(accuracy)
import pickle df = pd.read_csv('hdf_denorm.csv') ## preprocess #df.rename(columns={'instant':'rec_id', #'dteday':'datetime', #'holiday':'is_holiday', #'workingday':'is_workingday', #'weathersit':'weather_condition', #'hum':'humidity', #'atemp':'felt_temperature', #'mnth':'month', #'cnt':'total_count', #'hr':'hour', #'yr':'year'},inplace=True) df = df[['hour','is_holiday', 'weekday','felt_temperature_actual','humidity_actual','users_total']] #df.is_holiday = df.is_holiday.astype('category') #df.weekday = df.weekday.astype('category') df = pd.get_dummies(df) ## modelling x = df.drop(columns = ['users_total']) y = df['users_total'] ada = AdaBoostRegressor() ada.fit(x,y) pickle.dump(ada, open('my_model.pkl','wb'))
max_depth=8, min_samples_leaf=4, random_state=2) #scoring(gbr) gbr = gbr.fit(X_train, y_train) gbr_accuracy = evaluate(gbr, X_test, y_test) dtr = DecisionTreeRegressor(min_samples_leaf=3, max_depth=8, random_state=2) dtr = dtr.fit(X_train, y_train) dtr_accuracy = evaluate(dtr, X_test, y_test) abr = AdaBoostRegressor(n_estimators=100, learning_rate=0.1, loss='linear', random_state=2) abr = abr.fit(X_train, y_train) abr_accuracy = evaluate(abr, X_test, y_test) def plot_importances(model, model_name): importances = model.feature_importances_ std = np.std([model.feature_importances_ for feature in model.estimators_], axis=0) indices = np.argsort(importances)[::-1] # Plot the feature importances of the forest plt.figure(figsize=(8, 5)) plt.title("Feature importances of " + model_name) plt.bar(range(X_train.shape[1]), importances[indices], color="r",
n_estimators=n_estimators, sample_size=sample_size, steps=steps, fold=fold, random_state=random_state) print(X) print(type(X)) #regr_1.fit(X, y) y_pred1 = regr_1.predict(x_target_test) # 4.3 As comparision, use AdaBoostR2 without transfer learning #============================================================================== regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), n_estimators=n_estimators) #============================================================================== regr_2.fit(x_target_train, y_target_train) y_pred2 = regr_2.predict(x_target_test) # 4.4 Plot the results plt.figure() plt.scatter(x_target_train, y_target_train, c="k", label="target_train") plt.plot(x_target_test, y_target_test, c="b", label="target_test", linewidth=0.5) plt.plot(x_target_test, y_pred1, c="r", label="TwoStageTrAdaBoostR2", linewidth=2)
def test_regression_toy():
    """Smoke-test regression on the toy dataset.

    Fits an AdaBoostRegressor with a fixed seed and checks that its
    predictions on ``T`` exactly match the expected ``y_t_regr``.
    """
    model = AdaBoostRegressor(random_state=0)
    model.fit(X, y_regr)
    predictions = model.predict(T)
    assert_array_equal(predictions, y_t_regr)
print('SVR') mod_SVR = model_validation(dat, SVR(), param_RF, t_s=test_size) #%% ''' Standardize ''' scaler = StandardScaler() df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns) ''' Regression ''' # Use numpy array from now on data = df.values[:100] new_data = df.values[100:] X = data[:, 1:] y = data[:, 0] X_new = new_data[:, 1:] model = AdaBoostRegressor(n_estimators=50, learning_rate=0.1) fitter = model.fit(X_train, y_train) # Use all the data from case1Data predict = fitter.predict(X_new) # Use data from Case1Data.txt
def cal_MSE_firm_onedate_svr(start_date_dt,firm_macro_2,firstdate):
    """Estimate a firm's MES-style risk number for one evaluation date.

    Fits an AdaBoost regressor on the preceding ``beforeYears`` years of the
    firm/macro panel and predicts the firm return column ``'Rjh'`` twice:
    once with the market VaR substituted into the market-return regressor
    (stressed input) and once with the plain regressors.

    Parameters
    ----------
    start_date_dt : datetime-like; evaluation date (end of training window).
    firm_macro_2 : pandas DataFrame indexed by date containing the columns
        used below ('VOL', 'RET', 'HK_equ_vol', 'HIBOR', ..., 'Rjh', 'VaR_hsi').
    firstdate : unused in this function body.

    Returns
    -------
    tuple ``(-MES_startdate[0], loss_y)`` — the negated (floored) stressed
    prediction, and the squared error of the plain prediction against the
    realised 'Rjh' value.
    """
    # NOTE(review): most of these constants (h, c, n_steps, n_inputs_*,
    # num_layers_0, keep_prob, n_outputs, learning_rate, n_epochs) look like
    # leftovers from an earlier neural-network variant — none are used below.
    h=5
    c = 1
    beforeYears=3  # length of the rolling training window, in years
    n_steps = 5  # the length of X data
    n_inputs_1 = 10  # the number of variables
    n_inputs_2 = 7
    num_layers_0=30
    keep_prob=0.5
    #n_neurons = 10
    n_outputs = 1
    learning_rate = 0.0001
    n_epochs = 400
    # global numpy_array_1_1
    # global numpy_array_bias_1_1
    # global numpy_array_1_2
    # global numpy_array_bias_1_2
    # global numpy_array_2_1
    # global numpy_array_bias_2_1
    # global numpy_array_2_2
    # global numpy_array_bias_2_2
    # Training window: [start_date - beforeYears, start_date].
    back_date = (start_date_dt-relativedelta(years=beforeYears,months=0,days=0)).strftime('%Y-%m-%d')
    start_date=start_date_dt.strftime('%Y-%m-%d')
    firm_hist_t = firm_macro_2[back_date:start_date]
    xydata=firm_hist_t
    #xydata_train =xydata.dropna()
    # Drop rows with NaNs in any column EXCEPT the VaR_* columns — those may
    # legitimately be missing and are handled explicitly further down.
    xydata=xydata.dropna(subset=filter(lambda x: x not in ['VaR_sp500','VaR_sse','VaR_hsi', 'VaR_sti'],xydata.columns))
    xydata_train =xydata
    # xdata = xydata_train[['VOL', 'RET', 'X3T_change', 'change_slope', 'ted', 'cre_spread',
    #     'STI','re_excess','equ_vol', 'VOL_2','RET_2','X3T_change_2','change_slope_2','ted_2','cre_spread_2','STI_2',
    #     're_excess_2','equ_vol_2','Rsysh']].iloc[:-5,:]
    # Training regressors: every row except the last 5 (matches h = 5 above).
    xdata = xydata_train[['VOL', 'RET', 'HK_equ_vol',
                          'HIBOR', 'VOL_2', 'HIBOR_2','HK_equ_vol_2','Rhsih','Rhsi_t']].iloc[:-5,:]
    # Standardise X and y with scalers fit on the training rows only, so the
    # prediction rows below are transformed with training statistics.
    std_scale_x = preprocessing.StandardScaler().fit(xdata)
    xdata = std_scale_x.transform(xdata)
    ydata = xydata_train[['Rjh']].iloc[:-5,:]
    std_scale_y = preprocessing.StandardScaler().fit(ydata)
    ydata = std_scale_y.transform(ydata)
    # xdata_predict = xydata[['VOL', 'RET', 'X3T_change', 'change_slope', 'ted', 'cre_spread',
    #     'STI','re_excess','equ_vol', 'VOL_2','RET_2','X3T_change_2','change_slope_2','ted_2','cre_spread_2','STI_2',
    #     're_excess_2','equ_vol_2','Rsysh','Varh']].iloc[xydata.shape[0]-1:xydata.shape[0],:]
    # Last row of the window = the evaluation date's regressors, once WITH the
    # VaR_hsi column attached (for the stressed prediction) ...
    xdata_predict = xydata[['VOL', 'RET', 'HK_equ_vol', 'HIBOR', 'VOL_2', 'HIBOR_2','HK_equ_vol_2','Rhsih','Rhsi_t', 'VaR_hsi']].iloc[xydata.shape[0]-1:xydata.shape[0],:]
    # ... and once with the plain regressor set (for the baseline prediction).
    xdata_predict2 = xydata[['VOL', 'RET', 'HK_equ_vol',
                             'HIBOR', 'VOL_2', 'HIBOR_2','HK_equ_vol_2','Rhsih', 'Rhsi_t']].iloc[xydata.shape[0]-1:xydata.shape[0],:]
    # Realised target on the evaluation date (used only to compute loss_y).
    y2 = xydata[['Rjh']].iloc[xydata.shape[0]-1:xydata.shape[0],:]
    # When the market VaR is available, substitute it for the market-return
    # regressor — this is what makes the first prediction the "stressed" one.
    if not math.isnan(xdata_predict.VaR_hsi):
        xdata_predict.Rhsih=xdata_predict.VaR_hsi
    # if not math.isnan(xdata_predict.VaR_hsi):
    #     xdata_predict.Rhsih=xdata_predict.VaR_hsi
    # Drop the VaR_hsi column so the columns match the training design matrix
    # before applying the fitted scaler.
    xdata_predict=xdata_predict.drop(labels=['VaR_hsi'], axis=1)
    xdata_predict = std_scale_x.transform(xdata_predict)
    xdata_predict2 = std_scale_x.transform(xdata_predict2)
    #xdata_predict = xdata_predict.reshape(1,n_steps,xdata.shape[2])
    xdata_startdate = xdata_predict
    np.random.seed(0)
    regr = AdaBoostRegressor(random_state=0, n_estimators=100)
    regr.fit(xdata,ydata.ravel())
    MES_startdate=regr.predict(xdata_startdate)  # stressed prediction
    y_predict=regr.predict(xdata_predict2)       # plain (baseline) prediction
    # Undo the y standardisation so both predictions are on the raw scale.
    MES_startdate = std_scale_y.inverse_transform(MES_startdate)
    print(MES_startdate)
    y_predict = std_scale_y.inverse_transform(y_predict)
    # Squared error of the plain prediction vs. the realised 'Rjh'.
    loss_y = pow((y2.iat[0,0]-y_predict[0]),2)
    print(loss_y)
    # Floor the stressed prediction at -1 (presumably a -100% return bound —
    # TODO confirm with the callers).
    if MES_startdate[0]<-1:
        MES_startdate[0]=-1
    #if MES_startdate[0,0]>0:
    #    MES_startdate[0,0]=0
    return (-MES_startdate[0],loss_y)
def train_adboost_cart(data, avg=None):
    """Train an AdaBoost ensemble of depth-4 CART trees.

    Parameters
    ----------
    data : raw input, forwarded unchanged to ``load_data``.
    avg : optional mapping forwarded to ``load_data``; defaults to a fresh
        empty dict per call.

    Returns
    -------
    The fitted ``AdaBoostRegressor``.
    """
    # BUG FIX: the original signature used a mutable default (``avg={}``),
    # which is one shared dict across all calls; create a fresh one instead.
    if avg is None:
        avg = {}
    test_X, test_Y = load_data(data, avg)
    adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                 loss="square",
                                 learning_rate=0.01,
                                 n_estimators=500)
    adaboost.fit(test_X, test_Y)
    return adaboost
# 학습모델 구축을 위해 data형식을 Vector로 변환 AB_X1 = AB_m_Inputdata.values AB_Y1 = AB_m_Outputdata.values # Training Data, Test Data 분리 AB_X1_train, AB_X1_test, AB_Y1_train, AB_Y1_test = train_test_split( AB_X1, AB_Y1, test_size=0.33, random_state=42) ######################################################################################################################## # AdaBoost 학습 모델 구축 making_adaboost_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10), n_estimators=100, learning_rate=0.5, random_state=42) making_adaboost_model.fit(AB_X1_train, AB_Y1_train) AB_m_predicted = making_adaboost_model.predict(AB_X1_test) # [1,n]에서 [n,1]로 배열을 바꿔주는 과정을 추가 AB_length_x1test = len(AB_X1_test) AB_m_predicted = AB_m_predicted.reshape(AB_length_x1test, 1) # 학습 모델 성능 확인 AB_m_mae = abs(AB_m_predicted - AB_Y1_test).mean(axis=0) AB_m_mape = (np.abs((AB_m_predicted - AB_Y1_test) / AB_Y1_test).mean(axis=0)) AB_m_rmse = np.sqrt(((AB_m_predicted - AB_Y1_test)**2).mean(axis=0)) AB_m_rmsle = np.sqrt( (((np.log(AB_m_predicted + 1) - np.log(AB_Y1_test + 1))**2).mean(axis=0))) print(AB_m_mae)
def AdaBoost(train_features, test_feat, train_labels):
    """Train a default AdaBoostRegressor and score the test features.

    Returns a two-element list: [predictions for test_feat, fitted model].
    """
    model = AdaBoostRegressor()
    model.fit(train_features, train_labels)
    return [model.predict(test_feat), model]
print(u'score 准确率为 %.4lf' % acc_decision_tree) # K 折交叉验证统计决策树准确率 print(u'cross_val_score 准确率为 %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10))) # 房价预测 # 加载数据 data = load_boston() # 分割数据 train_x, test_x, train_y, test_y = train_test_split(data.data, data.target, test_size=0.25, random_state=33) # 使用 AdaBoost 回归模型 regressor = AdaBoostRegressor() regressor.fit(train_x, train_y) pred_y = regressor.predict(test_x) mse = mean_squared_error(test_y, pred_y) print(" 房价预测结果 ", pred_y) print(" 均方误差 = ", round(mse, 2)) # 使用决策树回归模型 dec_regressor = DecisionTreeRegressor() dec_regressor.fit(train_x, train_y) pred_y = dec_regressor.predict(test_x) mse = mean_squared_error(test_y, pred_y) print(" 决策树均方误差 = ", round(mse, 2)) # 使用 KNN 回归模型 knn_regressor = KNeighborsRegressor() knn_regressor.fit(train_x, train_y) pred_y = knn_regressor.predict(test_x)
from sklearn.tree import DecisionTreeRegressor # Create the dataset rng = np.random.RandomState(1) X = np.linspace(0, 6, 100)[:, np.newaxis] y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0]) # dataArr, labelArr = loadDataSet("7. AdaBoost/horseColicTraining2.txt") # Fit regression model regr_1 = DecisionTreeRegressor(max_depth=4) regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=rng) regr_1.fit(X, y) regr_2.fit(X, y) # Predict y_1 = regr_1.predict(X) y_2 = regr_2.predict(X) # Plot the results plt.figure() plt.scatter(X, y, c="k", label="training samples") plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2) plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2) plt.xlabel("data") plt.ylabel("target") plt.title("Boosted Decision Tree Regression") plt.legend() plt.show()
ax.set_xlabel('Measured') ax.set_ylabel('Predicted') ax.legend(["XG Boost Regression"]) plt.show() #Fitting AdaBoostRegressor min = 1000 for dep in [10, 15, 18, 25]: for esti in [550, 575, 600]: for lr in [0.01, 0.3, 1.25, 1.5]: regr_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=dep), n_estimators=esti, random_state=0, learning_rate=lr, loss="exponential") regr_ada.fit(X_train, Y_train) ada_pred = regr_ada.predict(X_CV) RMLSE = np.sqrt(mean_squared_log_error(Y_CV, ada_pred)) #print ("Error (AdaBoostRegressor)=",RMLSE," for depth=",dep," for estimators=",esti," and learning rate=",lr) if (min > RMLSE): min = RMLSE lr_f = lr esti_f = esti dep_f = dep print( "Root Mean Square Logarithmic Cross Validation Error (AdaBoostRegressor)=", RMLSE, " for depth=", dep, " for estimators=", esti, " and learning rate=", lr) regr_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=dep_f), n_estimators=esti_f,
wderrn = np.array(errn)[np.array(errn) <= 20] wderrn = wderrn[wderrn >= -20] wderrn.size / len(errn) np.median(wderrn) np.mean(wderrn) plt.figure() plt.plot(wderrn) x = np.linspace(0, 8760, num=8760)[:, np.newaxis] y = nord['FABBISOGNO REALE'].ix[nord.index.year == 2015].values.ravel() regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=24), n_estimators=3000) regr.fit(x, y) yhat = regr.predict(x) plt.figure() plt.plot(yhat, color='blue', marker='o') plt.plot(y, color='red') plt.figure() plt.plot(y - yhat) #### fabbisogno 2009 sbil2009 = pd.read_excel( 'C:/Users/utente/Documents/misure/aggregato_sbilanciamento2009.xlsx') nord2009 = sbil2009.ix[sbil2009['CODICE RUC'] == 'UC_DP1608_NORD'] nord2009.index = pd.date_range('2009-01-01', '2010-01-02', freq='H')[:nord2009.shape[0]]
dt.predict(X[:10]) print("profiling...") txt = profile(runlocaldt, pyinst_format='text') print(txt[1]) ########################################### # Profiling for AdaBoostRegressor # +++++++++++++++++++++++++++++++ # # The next example shows how long the python runtime # spends in each operator. ada = AdaBoostRegressor() ada.fit(X, y) onx = to_onnx(ada, X[:1].astype(numpy.float32), target_opset=11) oinf = OnnxInference(onx, runtime='python_compiled') print(oinf) ######################################## # The profiling. def runlocal(): for i in range(0, 500): oinf.run({'X': X32}) print("profiling...") txt = profile(runlocal, pyinst_format='text')
model_parameter = "adaboost_lr0p1_lossSquare_nest1000_minsamplesplit2_maxdepth5_sqrt_random_skipdatacolumn012" # In[ ]: #estimator for adaboost ada_tree_estimator = DecisionTreeRegressor(min_samples_split=2, max_depth=5, max_features='sqrt', splitter='random') #adaboost regressor ab = AdaBoostRegressor(ada_tree_estimator, learning_rate=0.1, loss='square', n_estimators=1000) #fit ab.fit(X_train, Y_train) # ## Validation # In[ ]: def visualize_loss_curves(model, X_train, Y_train, X_test, Y_test, cut): plt.close('all') Y_train_pred = np.zeros_like(Y_train) Y_test_pred = np.zeros_like(Y_test) losses_train = [] losses_test = [] # For each added classifier, store the new training and test losses.
clf_1 = ensemble.GradientBoostingClassifier() clf_2 = AdaBoostRegressor(ensemble.GradientBoostingClassifier(), n_estimators=50, random_state=None) # CV Loop for train_index, test_index in kf: # for each iteration of the for loop we'll do a test train split X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] t = StandardScaler() X_train = t.fit_transform(X_train) clf_1.fit(X_train, y_train) # Train clf_1 on the training data clf_2.fit(X_train, y_train) # Train clf_2 on the training data X_test = t.transform(X_test) y_pred1[test_index] = clf_1.predict( X_test) # Predict clf_1 using the test and store in y_pred y_pred2[test_index] = clf_2.predict( X_test) # Predict clf_2 using the test and store in y_pred plot_r2(y, y_pred1, "Performance of CV DecisionTreeRegressor") plt.show() r2_score(y, y_pred1) rmse = sqrt(mean_squared_error(y, y_pred1)) print("GradientBoostingClassifier CV 1 rmse: ", rmse) plot_r2(y, y_pred2, "Performance of CV AdaBoost")
# --> 0.37737 r2_score(y_test, y_pred) # --> 0.62263 ############################################################ ## Decision tree regression with AdaBoost from sklearn.ensemble import AdaBoostRegressor regressor2 = AdaBoostRegressor(DecisionTreeRegressor( criterion='mse', max_depth=5, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, splitter='best'), n_estimators=500, random_state=42) regressor2.fit(X_train, y_train) # predict y_pred = regressor2.predict(X_test) MSE = mean_squared_error(y_test, y_pred) # --> 0.37569 r2_score(y_test, y_pred) # --> 0.6243
#mse in $ mse = mean_absolute_error(y_test, y_pred) print("The mean absolute error is:$", mse) #chceking r^2 from sklearn.metrics import r2_score print("r_Score:", r2_score(y_test, y_pred)) bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10) bg.fit(X_train, y_train) bg.score(X_train, y_train) bg.score(X_test, y_test) #Adaboosting regr = AdaBoostRegressor() regr.fit(X_train, y_train) regr.score(X_test, y_test) #Decision from sklearn.tree import DecisionTreeRegressor dt = DecisionTreeRegressor() dt.fit(X_train, y_train) dt.score(X_test, y_test) #gradientBoost from sklearn.ensemble import GradientBoostingRegressor gb = GradientBoostingRegressor() gb.fit(X_train, y_train) gb.score(X_train, y_train) gb.score(X_test, y_test)
svr = SVR(kernel='rbf') svr_fit = svr.fit(Rtrain_X, Rtrain_y) # 預測 #svr = svr.predict(train_X) svr_test_y_predicted = svr_fit.predict(Rtest_X) # 績效 Train_r2 = r2_score(Rtrain_y, svr_fit.predict(Rtrain_X)) print('Train R2: ', Train_r2) PCCs = np.corrcoef(svr_test_y_predicted, Rtest_y) RMSE = (mean_squared_error(Rtest_y, svr_test_y_predicted))**(1 / 2) R_squared = r2_score(Rtest_y, svr_test_y_predicted) print('r2:', R_squared) print(PCCs) print(RMSE) #__________________________________________________________________ '''Adaboost Regression''' from sklearn.ensemble import AdaBoostRegressor abtR = AdaBoostRegressor() #n_estimators=1000) abtR.fit(Rtrain_X, Rtrain_y) abtR_predicted = abtR.predict(Rtest_X) abtR_PCCs = np.corrcoef(abtR_predicted, Rtest_y) abtR_RMSE = (mean_squared_error(Rtest_y, abtR_predicted))**(1 / 2) abtR_R_squared = r2_score(Rtest_y, abtR_predicted) print('TRAIN SCORE: ', abtR.score(Rtrain_X, Rtrain_y), ' TEST SCORE: ', abtR.score(Rtest_X, Rtest_y), '\n') print('#_____________________________________________________', '\n') #__________________________________________________________________
h_GbrModel = GradientBoostingRegressor() h_rdModel = RandomForestRegressor() h_sgdModel = SGDRegressor() h_elnModel = ElasticNet() x_train, x_test, y_train, y_test = model_selection.train_test_split(h_data.data,h_data.target, test_size = 0.3) #h_normalizer = Normalizer() h_scaler = MinMaxScaler() #h_data.data = h_normalizer.fit_transform(h_data.data) h_scaler.fit(x_train) h_scaler.transform(x_test) h_LnModel.fit(x_train,y_train) h_SVRModel.fit(x_train,y_train) h_nnModel.fit(x_train,y_train) h_adaModel.fit(x_train,y_train) h_GbrModel.fit(x_train,y_train) h_rdModel.fit(x_train,y_train) h_sgdModel.fit(x_train,y_train) h_elnModel.fit(x_train,y_train) print(metrics.r2_score(h_LnModel.predict(x_test),y_test)) print(metrics.r2_score(h_SVRModel.predict(x_test),y_test)) print(metrics.r2_score(h_nnModel.predict(x_test),y_test)) print(metrics.r2_score(h_adaModel.predict(x_test),y_test)) print(metrics.r2_score(h_GbrModel.predict(x_test),y_test)) print(metrics.r2_score(h_rdModel.predict(x_test),y_test)) print(metrics.r2_score(h_sgdModel.predict(x_test),y_test)) print(metrics.r2_score(h_elnModel.predict(x_test),y_test))
from sklearn.model_selection import train_test_split X_dev,X_eval, y_dev,y_eval = train_test_split(X, y, test_size=0.33, random_state=42) X_train,X_test, y_train,y_test = train_test_split(X_dev, y_dev, test_size=0.33, random_state=492) from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor from sklearn.ensemble import GradientBoostingRegressor rng = np.random.RandomState(1) bdt = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=rng) bdt.fit(X_train, y_train) print('BDT fitted') import pickle pickle.dump(bdt,open("bdt.joblib","wb")) bdt = pickle.load(open("bdt.joblib","rb")) y_predicted = bdt.predict(X) y_predicted.dtype = [('Trigger_correction', 'float64')] print(type(y_predicted)) print(len(y_predicted)) from root_numpy import array2root
# Train an AdaBoost ensemble of deep decision trees on the Parkinson's
# voice-measurement data and report accuracy on the held-out test file.
from sklearn.ensemble import AdaBoostRegressor

train = pd.read_csv('parkinsons_train.csv')
test = pd.read_csv('parkinsons_test.csv')

# Voice-measurement columns used as model input.
features = ["MDVP:Fo(Hz)", "MDVP:Fhi(Hz)", "MDVP:Flo(Hz)", "MDVP:Jitter(%)",
            "MDVP:Jitter(Abs)", "MDVP:RAP", "MDVP:PPQ", "Jitter:DDP",
            "MDVP:Shimmer", "MDVP:Shimmer(dB)", "Shimmer:APQ3", "Shimmer:APQ5",
            "MDVP:APQ", "Shimmer:DDA", "NHR", "HNR", "RPDE", "DFA",
            "spread1", "spread2", "D2", "PPE"]
# 64%
X = train[features]
y = train['status']
temp = test['status']
i = 1
j = 1
results = []
# BUG FIX: the original read ``random_state=`` with no value, which is a
# SyntaxError; pin the seed to 0 so results are reproducible.
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=100),
                         n_estimators=100, random_state=0)
regr.fit(X,y)
'''
for i,j in range(1000):
    acc = accuracy_score(regr.predict(test[jitter], y))
    if results.length() == 0 or acc > results[2]:
        results = [i, j, acc]
print('Use max_depth: ' + results[0] + ' , n_estimators: ' + results[1] + ' to get a maximum accuracy score of ' + results[2])
i = results[0]
j = results[1]
'''
# BUG FIX: ``jitter`` was never defined — the intended column list is
# ``features`` (the same columns the model was trained on).
pred5 = regr.predict(test[features])
print(str(accuracy_score(pred5, temp)))
X, y = shuffle(boston.data, boston.target) offset = int(0.7*len(X)) X_train, y_train = X[:offset], y[:offset] X_test, y_test = X[offset:], y[offset:] # We will vary the number of base learners from 2 to 300 max_learners = arange(2, 300) train_err = zeros(len(max_learners)) test_err = zeros(len(max_learners)) for i, l in enumerate(max_learners): # Set up a Adaboost Regression Learner with l base learners regressor = AdaBoostRegressor(n_estimators=l) # Fit the learner to the training data regressor.fit(X_train, y_train) # Find the MSE on the training set train_err[i] = mean_squared_error(y_train, regressor.predict(X_train)) # Find the MSE on the testing set test_err[i] = mean_squared_error(y_test, regressor.predict(X_test)) # Plot training and test error as a function of the number of base learners pl.figure() pl.title('Boosting: Performance vs Number of Learners') pl.plot(max_learners, test_err, lw=2, label = 'test error') pl.plot(max_learners, train_err, lw=2, label = 'training error') pl.legend() pl.xlabel('Number of Learners') pl.ylabel('RMS Error') pl.show()
def def_AdaBoostRegressor(self, estimators):
    """Fit AdaBoostRegressor with ``estimators`` boosting rounds on the
    stored training split and return predictions for the test split."""
    booster = AdaBoostRegressor(n_estimators=estimators)
    booster.fit(self.Xtrain, self.Ytrain)
    return booster.predict(self.Xtest)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, spredictions))) coeffecients = pd.DataFrame(sgd.coef_,X.columns) coeffecients.columns = ['Coeffecient'] #print(coeffecients) #ADABOOST from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101) from sklearn.ensemble import AdaBoostRegressor abreg = AdaBoostRegressor(random_state=0, n_estimators=100) abreg.fit(X_train,y_train) abpredictions = abreg.predict( X_test) #print(abpredictions) plt.scatter(y_test,abpredictions) plt.title('ADABOOST') plt.xlabel('Y Test') plt.ylabel('Predicted Y') plt.show() from sklearn import metrics print('MAE:', metrics.mean_absolute_error(y_test, abpredictions)) print('MSE:', metrics.mean_squared_error(y_test, abpredictions)) print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, abpredictions)))
def decision_tree(X, y1, y2, y3):
    """Compare five sklearn tree/ensemble regressors on a two-part target.

    Trains each model separately on the y2 ('registered') and y3 ('casual')
    components — presumably bike-share rental counts, TODO confirm — sums the
    two predictions, and prints the RMSLE of that sum against the total
    target y1 on the held-out half.  Python 2 code (print statements).

    Parameters
    ----------
    X  : 2-D feature array, shape (n, d).
    y1 : total target — used only for evaluation.
    y2 : 'registered' component target.
    y3 : 'casual' component target.

    Returns nothing; scores are printed via the module-level ``rmsle``.
    """
    n, _ = X.shape
    nTrain = int(0.5 * n) #training on 50% of the data
    # Unshuffled split: first half trains, second half evaluates.
    Xtrain = X[:nTrain, :]
    ytrain = y1[:nTrain]              # NOTE(review): unused below
    ytrain_registered = y2[:nTrain]
    ytest_registered = y2[nTrain:]    # NOTE(review): unused below
    ytrain_casual = y3[:nTrain]
    ytest_casual = y3[nTrain:]        # NOTE(review): unused below
    Xtest = X[nTrain:, :]
    ytest = y1[nTrain:]
    #regular
    # Five regressors, 500 estimators each, unbounded tree depth.
    clf_1 = DecisionTreeRegressor(max_depth=None)
    clf_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None), n_estimators=500)
    clf_4 = RandomForestRegressor(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_5 = ExtraTreesRegressor(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_3 = GradientBoostingRegressor(n_estimators=500, max_depth=None, random_state=0)
    print "finished generating tree"
    # Round 1: fit every model on the 'registered' component.
    clf_1.fit(Xtrain, ytrain_registered)
    clf_2.fit(Xtrain, ytrain_registered)
    clf_3.fit(Xtrain, ytrain_registered)
    clf_4.fit(Xtrain, ytrain_registered)
    clf_5.fit(Xtrain, ytrain_registered)
    print 'Finished fitting'
    dt_regular = clf_1.predict(Xtest)
    ada_regular = clf_2.predict(Xtest)
    grad_regular = clf_3.predict(Xtest)
    rf_regular = clf_4.predict(Xtest)
    et_regular = clf_5.predict(Xtest)
    #casual
    print "finished generating tree"
    # Round 2: re-fit the SAME estimator objects on the 'casual' component
    # (this overwrites the 'registered' fits, which were already predicted).
    clf_1.fit(Xtrain, ytrain_casual)
    clf_2.fit(Xtrain, ytrain_casual)
    clf_3.fit(Xtrain, ytrain_casual)
    clf_4.fit(Xtrain, ytrain_casual)
    clf_5.fit(Xtrain, ytrain_casual)
    print 'Finished fitting'
    dt_casual = clf_1.predict(Xtest)
    ada_casual = clf_2.predict(Xtest)
    grad_casual = clf_3.predict(Xtest)
    rf_casual = clf_4.predict(Xtest)
    et_casual = clf_5.predict(Xtest)
    # Importances come from the random forest's most recent ('casual') fit.
    feature_imps = clf_4.feature_importances_
    # Score each model: predicted total = registered part + casual part.
    print "regular decision tree"
    print rmsle(ytest, dt_regular + dt_casual)
    print "boosted decision tree"
    print rmsle(ytest, ada_regular + ada_casual)
    print "gradient tree boosting"
    print rmsle(ytest, grad_regular + grad_casual)
    print "random forest classifier"
    print rmsle(ytest, rf_regular + rf_casual)
    print "extra trees classifier"
    print rmsle(ytest, et_casual + et_regular)
    print "feature importances"
    print feature_imps