from sklearn.base import BaseEstimator
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor


class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = AdaBoostRegressor(
            RandomForestRegressor(n_estimators=100, max_depth=40, max_features=25),
            n_estimators=100)
        # self.clf_Boost = GradientBoostingRegressor(n_estimators=500, max_features=20)
        # self.clf_Regression = LinearRegression()

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
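# A minimal usage sketch for the Regressor wrapper above; the synthetic dataset
# from make_regression is an assumption for illustration (n_features matches the
# max_features=25 used by the inner forest).
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=25, random_state=0)
reg = Regressor()
reg.fit(X_demo, y_demo)
print(reg.predict(X_demo[:5]))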
def backTest(trainEndDate, code, testDate, predictDate):
    conn = db.get_history_data_db('D')
    df = None
    # train more date
    # model = pickle.load(open('%s/%s.pkl' % (config.model_dir, code), 'r'))
    rng = np.random.RandomState(1)
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                              n_estimators=1000, random_state=rng, loss='square')
    df = pd.read_sql_query(
        "select * from history_data where date([date])<='%s' and code='%s' order by code, date([date]) asc" % (
            trainEndDate, code), conn)
    shift_1 = df['close'].shift(-2)
    df['target'] = shift_1
    data = df[df['target'] > -1000]
    X_train = data.loc[:, 'code':'turnover']
    y_train = data.loc[:, 'target']
    if len(X_train) < 500:
        return
    print(len(X_train))
    # print(data)
    # for i in range(0, 10):
    #     model.fit(X_train, y_train)
    model.fit(X_train, y_train)
    # predict tomorrow
    try:
        df = pd.read_sql_query(config.sql_history_data_by_code_date % (code, testDate), conn)
        # print(df)
    except Exception as e:
        print(e)
def round2(X_df, featurelist):
    # Set parameters
    model = AdaBoostRegressor()
    y_df = X_df['target']
    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n_splits=5, shuffle=True)
    # Calculate RMSE for train/test for each fold
    for train_idx, test_idx in kf.split(X_df):
        X_train, X_test = X_df.iloc[train_idx, :], X_df.iloc[test_idx, :]
        # y_train, y_test = y_df[train_idx], y_df[test_idx]
        X_train, X_test = applyFeatures(X_train, X_test, featurelist)
        Xtrain_array, ytrain_array, Xtest_array, ytest_array = dfToArray(X_train, X_test)
        model.fit(Xtrain_array, ytrain_array)
        prediction = model.predict(Xtest_array)
        rmse = np.sqrt(mean_squared_error(ytest_array, prediction))
        scores.append(rmse)
        print(rmse)
        print("Finish fold")
    return scores
def fit(self, start_date, end_date):
    for ticker in self.tickers:
        self.stocks[ticker] = Stock(ticker)
    params_ada = [{
        'n_estimators': [25, 50, 100],
        'learning_rate': [0.01, 0.1, 1, 10],
        'loss': ['linear', 'square', 'exponential']
    }]
    params = ParameterGrid(params_ada)
    # Find the split for training and CV
    mid_date = train_test_split(start_date, end_date)
    for ticker, stock in self.stocks.items():
        X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
        # X_train = self.pca.fit_transform(X_train.values)
        X_train = X_train.values
        # pdb.set_trace()
        X_cv, y_cv = stock.get_data(mid_date, end_date)
        # X_cv = self.pca.transform(X_cv.values)
        X_cv = X_cv.values
        lowest_mse = np.inf
        for i, param in enumerate(params):
            ada = AdaBoostRegressor(**param)
            ada.fit(X_train, y_train.values)
            mse = mean_squared_error(y_cv, ada.predict(X_cv))
            if mse <= lowest_mse:
                lowest_mse = mse  # track the best CV score seen so far
                self.models[ticker] = ada
    return self
def predict(tour_data):
    vec = DictVectorizer()
    tour_data = get_tour_data()
    transformed = vec.fit_transform(tour_data).toarray()
    categories = vec.get_feature_names()
    y = transformed[:, [categories.index('rating')]]
    X = transformed[:, np.arange(transformed.shape[1]) != categories.index('rating')]
    rng = np.random.RandomState(1)
    reg_tree = DecisionTreeRegressor()
    adaboost_tree = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                      n_estimators=300, random_state=rng)
    reg_tree.fit(X, y)
    adaboost_tree.fit(X, y)
    # Predict
    y_1 = reg_tree.predict(X)
    y_2 = adaboost_tree.predict(X)
    return y_1, y_2
def ada_boost_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: training features
    :param train_y: training target values
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values, learning curve, validation curve
    """
    ada = AdaBoostRegressor(n_estimators=5)
    if get_model:
        print("Fitting Ada...")
        ada.fit(train_x, np.log(train_y + 1))
        ada_pred = np.exp(ada.predict(pred_x)) - 1
        Votes = ada_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        # create submission csv for Kaggle
        submission_ada = np.concatenate((Id, Votes), axis=1)
        np.savetxt("submission_ada.csv", submission_ada, header="Id,Votes",
                   delimiter=',', fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if l_curve:
        print("Working on Learning Curves")
        plot_learning_curve(AdaBoostRegressor(), "Learning curve: Adaboost",
                            train_x, np.log(train_y + 1.0))
    if v_curve:
        print("Working on Validation Curves")
        plot_validation_curve(AdaBoostRegressor(), "Validation Curve: Adaboost",
                              train_x, np.log(train_y + 1.0),
                              param_name="n_estimators",
                              param_range=[2, 5, 10, 15, 20, 25, 30])
def Round2(X, y):
    # Set parameters
    min_score = {}
    for loss in ['linear', 'square', 'exponential']:
        model = AdaBoostRegressor(loss=loss)
        # Perform 5-fold cross validation
        scores = []
        kf = KFold(n_splits=5, shuffle=True)
        # Calculate RMSE for train/test for each fold
        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            # score = model.score(X_test, y_test)
            scores.append(rmse)
        if len(min_score) == 0:
            min_score['loss'] = loss
            min_score['scores'] = scores
        else:
            if np.mean(scores) < np.mean(min_score['scores']):
                min_score['loss'] = loss
                min_score['scores'] = scores
        print("Loss:", loss)
        print(scores)
        print(np.mean(scores))
    return min_score
def main():
    ab = AdaBoostRegressor(base_estimator=None, n_estimators=50, learning_rate=1.0,
                           loss='exponential', random_state=None)
    ab.fit(X_train, y_train)

    # Evaluation on train set
    pred_train = ab.predict(X_train)
    mse_train = mean_squared_error(y_train, pred_train)
    rmse_train = np.sqrt(mse_train)
    # log_loss is a classification metric defined on predicted probabilities and
    # is not applicable to continuous regression targets, so it is omitted here.

    # Evaluation on validation set
    pred_val = ab.predict(X_val)
    mse_val = mean_squared_error(y_val, pred_val)
    rmse_val = np.sqrt(mse_val)

    print(rmse_train)
    print(rmse_val)
def train_learning_model_decision_tree_ada_boost(df):
    # code taken from sklearn
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)
    tree_regressor = DecisionTreeRegressor(max_depth=6)
    ada_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6),
                                      n_estimators=500, learning_rate=0.01, random_state=1)
    tree_regressor.fit(X_train, y_train)
    ada_regressor.fit(X_train, y_train)
    y_pred_tree = tree_regressor.predict(X_test)
    y_pred_ada = ada_regressor.predict(X_test)
    mse_tree = mean_squared_error(y_test, y_pred_tree)
    mse_ada = mean_squared_error(y_test, y_pred_ada)
    mse_tree_train = mean_squared_error(y_train, tree_regressor.predict(X_train))
    mse_ada_train = mean_squared_error(y_train, ada_regressor.predict(X_train))
    print("MSE tree: %.4f " % mse_tree)
    print("MSE ada: %.4f " % mse_ada)
    print("MSE tree train: %.4f " % mse_tree_train)
    print("MSE ada train: %.4f " % mse_ada_train)
def predict_volatility_1year_ahead(rows, day, num_days):
    """
    SUMMARY: Predict volatility 1 year into the future
    ALGORITHM:
    a) The predictor will train on all data up to exactly 1 year (252 trading days) before `day`
    b) The newest 10 days up to and including `day` will be used as the feature vector for the prediction
       i.e. if day = 0, the feature vector for prediction will consist of days (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
            if day = 10, the feature vector for predictor input will be days (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
    INPUT: minimum of (1 year + 10 days) of data before `day` (newest data is day=0)
    """
    # enforce that `day` is in the required range
    assert len(rows) >= 252 + num_days + day, \
        'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    # Compile features for fitting
    feature_sets = []
    value_sets = []
    for ii in range(day + num_days + 252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            features += [
                float(rows[day_index][7]),
                float(rows[day_index][8]),
                float(rows[day_index][9]),
                float(rows[day_index][10]),
                float(rows[day_index][11]),
                float(rows[day_index][12]),
                float(rows[day_index][13]),
            ]
        # print("issue here: " + str(rows[day_index][0]))
        feature_sets += [features]
        value_sets += [float(rows[ii - 252][9])]

    # Create regressor and fit
    num_features = 16  # unused
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=3, random_state=rng)
    regr.fit(feature_sets, value_sets)

    # Get prediction features
    ii = day
    features = []
    for jj in range(num_days):
        day_index = ii + jj
        features += [
            float(rows[day_index][7]),
            float(rows[day_index][8]),
            float(rows[day_index][9]),
            float(rows[day_index][10]),
            float(rows[day_index][11]),
            float(rows[day_index][12]),
            float(rows[day_index][13]),
        ]
    return float(regr.predict([features]))
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = AdaBoostRegressor(
            RandomForestRegressor(n_estimators=500, max_depth=78, max_features=10),
            n_estimators=40)

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
def ada_learning(labels, train, test):
    label_log = np.log1p(labels)
    # try 50 / 1.0
    # boost GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1)
    clf = AdaBoostRegressor(
        GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1),
        n_estimators=50, learning_rate=1.0)
    model = clf.fit(train, label_log)
    preds1 = model.predict(test)
    preds = np.expm1(preds1)
    return preds
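# An equivalent formulation of the log1p/expm1 target transform above, as a
# sketch using sklearn.compose.TransformedTargetRegressor (scikit-learn >= 0.20);
# the wrapped regressor applies the transform around fit and predict
# automatically, so the manual np.log1p/np.expm1 calls disappear.
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

log_ada = TransformedTargetRegressor(
    regressor=AdaBoostRegressor(
        GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1),
        n_estimators=50, learning_rate=1.0),
    func=np.log1p, inverse_func=np.expm1)
# log_ada.fit(train, labels); preds = log_ada.predict(test)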
def train_predict(train_id, test_id):
    # load libsvm files for training dataset
    Xs_train = []
    ys_train = []
    n_train = load_libsvm_files(train_id, Xs_train, ys_train)
    # load libsvm files for testing dataset
    Xs_test = []
    ys_test = []
    n_test = load_libsvm_files(test_id, Xs_test, ys_test)
    # models
    model = []
    # ans
    ans_train = []
    ans_test = []
    # generate predictions for training dataset
    ps_train = []
    for i in range(0, n_train):
        ps_train.append([0.0 for j in range(10)])
    # generate predictions for testing dataset
    ps_test = []
    for i in range(0, n_test):
        ps_test.append([0.0 for j in range(10)])
    # fit models
    for i in range(10):
        l = np.array([ys_train[j][i] for j in range(n_train)])
        clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=params['max_depth']),
                                n_estimators=params['n_estimators'],
                                learning_rate=params['learning_rate'])
        clf.fit(Xs_train[i].toarray(), l)
        print("[%s] [INFO] %d model training done" % (t_now(), i))
        preds_train = clf.staged_predict(Xs_train[i].toarray())
        ans_train.append([item for item in preds_train])
        # print("len(ans_train[%d]) = %d" % (i, len(ans_train[i])))
        print("[%s] [INFO] %d model predict for training data set done" % (t_now(), i))
        preds_test = clf.staged_predict(Xs_test[i].toarray())
        ans_test.append([item for item in preds_test])
        print("[%s] [INFO] %d model predict for testing data set done" % (t_now(), i))
    # print("len_ans_train=%d" % len(ans_train[0]))
    # predict for testing data set
    for i in range(params['n_estimators']):
        for j in range(10):
            tmp = min(i, len(ans_train[j]) - 1)
            for k in range(n_train):
                ps_train[k][j] = ans_train[j][tmp][k]
            tmp = min(i, len(ans_test[j]) - 1)
            for k in range(n_test):
                ps_test[k][j] = ans_test[j][tmp][k]
        print("%s,%d,%f,%f" % (t_now(), i + 1,
                               mean_cos_similarity(ys_train, ps_train, n_train),
                               mean_cos_similarity(ys_test, ps_test, n_test)))
    return 0
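# A minimal sketch of staged_predict, which the loop above relies on: it yields
# one prediction array per boosting iteration, so early stages can be inspected
# without refitting. The synthetic data and sizes are illustrative.
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

X_s, y_s = make_regression(n_samples=100, n_features=5, random_state=0)
booster = AdaBoostRegressor(DecisionTreeRegressor(max_depth=3), n_estimators=10).fit(X_s, y_s)
for stage, pred in enumerate(booster.staged_predict(X_s), start=1):
    print(stage, pred[:3])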
import operator
from math import sqrt

from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error


def AdaBoost(xTrain, yTrain, xTest, yTest, treeNum):
    rms = dict()
    for trees in treeNum:
        ab = AdaBoostRegressor(n_estimators=trees)
        ab.fit(xTrain, yTrain)
        yPred = ab.predict(xTest)
        rms[trees] = sqrt(mean_squared_error(yTest, yPred))
    (bestRegressor, rmse) = sorted(rms.items(), key=operator.itemgetter(1))[0]
    return bestRegressor, rmse
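# A self-contained sketch exercising AdaBoost() above on synthetic data; the
# sample sizes and candidate tree counts are assumptions for illustration.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

Xd, yd = make_regression(n_samples=300, n_features=8, noise=5.0, random_state=0)
xTr, xTe, yTr, yTe = train_test_split(Xd, yd, random_state=0)
best_n, best_rmse = AdaBoost(xTr, yTr, xTe, yTe, [25, 50, 100])
print(best_n, best_rmse)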
def ada_boost(data, classifier, sample):
    import numpy as np
    from sklearn.ensemble import AdaBoostRegressor
    from sklearn.tree import DecisionTreeRegressor

    # Alternative base estimators tried previously; GaussianNB and KMeans are
    # not regressors, so the decision tree is the one actually used:
    # func = GaussianNB()
    # func = KMeans(n_clusters=2)
    func = DecisionTreeRegressor()
    clf = AdaBoostRegressor(func, n_estimators=300,
                            random_state=np.random.RandomState(1))
    clf.fit(data, classifier)
    print_result(clf, [sample])
class Regressor(BaseEstimator):
    def __init__(self):
        cl = RandomForestRegressor(n_estimators=10, max_depth=10, max_features=10)
        self.clf = AdaBoostRegressor(base_estimator=cl, n_estimators=100)

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
        # RandomForestClassifier
def test_boston():
    # Check consistency on dataset boston house prices.
    reg = AdaBoostRegressor(random_state=0)
    reg.fit(boston.data, boston.target)
    score = reg.score(boston.data, boston.target)
    assert score > 0.85

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert_equal(len(set(est.random_state for est in reg.estimators_)),
                 len(reg.estimators_))
def test_sparse_regression():
    """Check regression with sparse input."""

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit carries data type for later verification."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=100, n_features=50, n_targets=1,
                                    random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format (note: SVR no longer accepts a `probability`
        # argument, so it is not passed here)
        sparse_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        sparse_type = type(X_train_sparse)
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
def performAdaBoostReg(train, test, features, output):
    """
    Ada Boost Regression
    """
    clf = AdaBoostRegressor()
    clf.fit(train[features], train[output])
    Predicted = clf.predict(test[features])
    plt.plot(test[output])
    plt.plot(Predicted, color='red')
    plt.show()
    return mean_squared_error(test[output], Predicted), r2_score(test[output], Predicted)
def do_adaboost(filename):
    df, Y = create_merged_dataset(filename)
    # Ideas:
    # Create a feature for accelerations and decelerations.
    # Leave the default base regressor for AdaBoost (a decision tree). Extra trees were tried with catastrophic results.
    # ada = AdaBoostRegressor(n_estimators=350, learning_rate=0.05)
    ada = AdaBoostRegressor(n_estimators=500, learning_rate=1)
    # X = df.drop(['driver', 'trip', 'prob_points', 'prob_speed', 'prob_distance', 'prob_acceleration'], 1)
    X = df.drop(['driver', 'trip'], 1)
    ada.fit(X, Y)
    probs = ada.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
def predict_volatility_1year_ahead(rows, day):
    """
    SUMMARY: Predict volatility 1 year into the future
    ALGORITHM:
    a) The predictor will train on all data up to exactly 1 year (252 trading days) before `day`
    b) The newest 10 days up to and including `day` will be used as the feature vector for the prediction
       i.e. if day = 0, the feature vector for prediction will consist of days (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
            if day = 10, the feature vector for predictor input will be days (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
    INPUT: minimum of (1 year + 10 days) of data before `day` (newest data is day=0)
    """
    num_days = 10

    # enforce that `day` is in the required range
    assert len(rows) >= 252 + num_days + day, \
        'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    # compile features (X) and values (Y)
    feature_sets = []
    value_sets = []
    value_sets_index = []
    for ii in range(day + 252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            features += [float(rows[day_index][1]), float(rows[day_index][2]),
                         float(rows[day_index][3]), float(rows[day_index][5]),
                         float(rows[day_index][7]), float(rows[day_index][8]),
                         float(rows[day_index][9])]
        feature_sets += [features]
        value_sets += [float(rows[ii - 252][9])]
        value_sets_index.append([ii - 252])

    # fit
    # regr = linear_model.Lasso(alpha=0.01, fit_intercept=False, normalize=False, max_iter=10000000)  # they call lambda alpha
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=4, random_state=rng)
    # regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=2, random_state=rng)
    # regr = DecisionTreeRegressor(max_depth=4)
    regr.fit(feature_sets, value_sets)
    # print("Adaboost weights:", regr.estimator_weights_)

    ii = day
    features = []
    for jj in range(num_days):
        day_index = ii + jj + 252
        features += [float(rows[day_index][1]), float(rows[day_index][2]),
                     float(rows[day_index][3]), float(rows[day_index][5]),
                     float(rows[day_index][7]), float(rows[day_index][8]),
                     float(rows[day_index][9])]
    return float(regr.predict([features]))
def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work without sample_weights in the base estimator.
    The random weighted sampling is done internally in the _boost method in
    AdaBoostRegressor.
    """
    class DummyEstimator(BaseEstimator):

        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    boost.fit(X, y_regr)

    assert_equal(len(boost.estimator_weights_), len(boost.estimator_errors_))
def initGrid(X, y):
    min_samples_split = [2, 4, 6, 8]
    max_depth = [2, 4, 6, 8]
    n_estimators = [50, 100, 150]
    bootstrap = [False, True]
    min_samples_leaf = [2, 4, 6, 8]
    grid = {
        'min_samples_split': min_samples_split,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf
    }
    model = DecisionTreeRegressor()
    gs = GridSearchCV(estimator=model, param_grid=grid, verbose=10, n_jobs=-1)
    gs.fit(X, y)
    print(gs.best_params_)
    # boost the tree found by the grid search, not the search object itself
    search = AdaBoostRegressor(gs.best_estimator_)
    search.fit(X, y)
    return search
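# An illustrative call to initGrid above on synthetic data; the imports cover
# the names the function relies on, and all sizes are arbitrary.
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

Xg, yg = make_regression(n_samples=200, n_features=6, random_state=0)
boosted_search = initGrid(Xg, yg)
print(boosted_search.predict(Xg[:3]))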
def train_model(training, testing, window=5, n=5):
    X_train, y_train = prepare_data(training)
    X_test, y_test = prepare_data(testing)

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    predrf = rf.predict(X_test)
    print("mse for random forest regressor: ", mean_squared_error(predrf, y_test))

    gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
    gb.fit(X_train, y_train)
    predgb = gb.predict(X_test)
    print("mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test))

    # plot feature importance using GBR results
    fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility'])
    fx_imp /= fx_imp.max()  # normalize
    fx_imp = fx_imp.sort_values()
    ax = fx_imp.plot(kind='barh')
    fig = ax.get_figure()
    fig.savefig("output/feature_importance.png")

    adb = AdaBoostRegressor(DecisionTreeRegressor())
    adb.fit(X_train, y_train)
    predadb = adb.predict(X_test)
    print("mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test))

    scale = StandardScaler()
    scale.fit(X_train)
    X_trainscale = scale.transform(X_train)
    X_testscale = scale.transform(X_test)
    knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5)
    knn.fit(X_trainscale, y_train)
    predknn = knn.predict(X_testscale)
    print("mse for bagging knn regressor: ", mean_squared_error(predknn, y_test))

    pred_test = 0.1 * predrf + 0.2 * predgb + 0.1 * predadb + 0.6 * predknn
    print("mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test))

    result = testing.copy()
    result.loc[result.index[5:-5], 'trend'] = pred_test
    result.loc[result.index[10:], 'pred'] = pred_test * result.loc[result.index[5:-5], 'IBM'].values
    result.loc[result.index[:-5], 'pred_date'] = result.index[5:]
    return result
def svm_smooth(data, residual_imf, period):
    train_data = []
    labels = []
    for i in range(period, len(residual_imf) - 20):
        tmp = data[i - period:i + 1]
        train_data.append(tmp)
        labels.append(residual_imf[i])
    rng = np.random.RandomState(1)
    clf = AdaBoostRegressor(svm.SVR(), n_estimators=1, random_state=rng)
    clf.fit(train_data, labels)
    smooth_data = []
    for i in range(len(data)):
        if i <= period:
            smooth_data.append(data[i])
        else:
            smooth_data.append(clf.predict([data[i - period:i + 1]])[0])
    return smooth_data
def run_tree_regressor():
    import numpy as np
    from sklearn.ensemble import AdaBoostRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor

    print("running me")
    X = np.genfromtxt("/home/john/Downloads/kaggle.X1.train.txt", delimiter=",")  # load the text file
    Y = np.genfromtxt("/home/john/Downloads/kaggle.Y.train.txt", delimiter=",")
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    rng = np.random.RandomState(1)
    depth = 35  # current lowest
    for estimators in [130, 235, 300, 345, 450]:
        treeAdaBoost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=depth),
                                         n_estimators=estimators, random_state=rng)
        treeAdaBoost.fit(x_train, y_train)
        print("adaboost estimators @ " + str(estimators) + ":",
              treeAdaBoost.score(x_test, y_test))
def round1(X, y):
    # Set parameters
    model = AdaBoostRegressor()
    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n_splits=5, shuffle=True)
    # Calculate RMSE for train/test for each fold
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, prediction))
        scores.append(rmse)
    return scores
def __init__(self, isTrain):
    super(RegressionUniformBlending, self).__init__(isTrain)
    # data preprocessing
    # self.dataPreprocessing()
    self.net1 = NeuralNet(
        layers=[  # three layers: one hidden layer
            ('input', layers.InputLayer),
            ('hidden', layers.DenseLayer),
            # ('hidden2', layers.DenseLayer),
            # ('hidden3', layers.DenseLayer),
            ('output', layers.DenseLayer),
        ],
        # layer parameters:
        input_shape=(None, 13),  # input dimension is 13
        hidden_num_units=6,  # number of units in hidden layer
        # hidden2_num_units=8,  # number of units in hidden layer
        # hidden3_num_units=4,  # number of units in hidden layer
        output_nonlinearity=None,  # output layer uses the identity (linear) function
        output_num_units=1,  # output dimension is 1

        # objective function
        objective_loss_function=lasagne.objectives.squared_error,

        # optimization method:
        update=lasagne.updates.nesterov_momentum,
        update_learning_rate=0.002,
        update_momentum=0.4,

        # use 20% as validation
        train_split=TrainSplit(eval_size=0.2),

        regression=True,  # flag to indicate we're dealing with a regression problem
        max_epochs=100,  # we want to train this many epochs
        verbose=0,
    )

    # Create linear regression object
    self.linRegr = linear_model.LinearRegression()

    # Create KNN regression object
    self.knn = neighbors.KNeighborsRegressor(86, weights='distance')

    # Create Decision Tree regression object
    self.decisionTree = DecisionTreeRegressor(max_depth=7, max_features=None)

    # Create AdaBoost regression object
    decisionReg = DecisionTreeRegressor(max_depth=10)
    rng = np.random.RandomState(1)
    self.adaReg = AdaBoostRegressor(decisionReg,
                                    n_estimators=400,
                                    random_state=rng)

    # Create random forest regression object
    self.model = RandomForestRegressor(max_features='sqrt', n_estimators=32, max_depth=39)
def __init__(self, isTrain):
    super(RegressionAdaBoost, self).__init__(isTrain)
    # data preprocessing
    # self.dataPreprocessing()
    # Create AdaBoost regression object
    decisionReg = DecisionTreeRegressor(max_depth=10)
    rng = np.random.RandomState(1)
    self.adaReg = AdaBoostRegressor(decisionReg,
                                    n_estimators=400,
                                    random_state=rng)
def xgb_train(x_train, x_label, x_test):
    model = 'xgb'
    # model = 'adaboost'
    # if model.count('xgb') > 0:
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.005  # [0,1]
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.7
    params["scale_pos_weight"] = 1.0
    params["silent"] = 1
    params["max_depth"] = 9
    if config.nthread > 1:
        params["nthread"] = 1
    num_rounds = 10000

    xgtrain = xgb.DMatrix(x_train, label=x_label)
    xgval = xgb.DMatrix(x_test)

    # train using early stopping and predict
    watchlist = [(xgtrain, "train")]
    # model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120, feval=gini_metric)
    model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
    pred1 = model.predict(xgval)

    # clf = RandomForestRegressor()
    # clf = LogisticRegression()
    # clf = GradientBoostingRegressor()
    clf = AdaBoostRegressor(ExtraTreesRegressor(max_depth=9), n_estimators=200)
    clf.fit(x_train, x_label)
    pred2 = clf.predict(x_test)

    # pred = pred1 * pred2 / (pred1 + pred2)
    # pred = 0.7 * (pred1**0.01) + 0.3 * (pred2**0.01)
    # pred = (pred1.argsort() + pred2.argsort()) / 2
    pred = 0.6 * pred1 + 0.4 * pred2
    return pred
dat1 = df.loc[:, ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']]
X_train, X_test, y_train, y_test = train_test_split(dat1, target, test_size=0.2, random_state=42)
y_train = y_train.values.ravel()

models = []
models.append(('SVR', SVR()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('RF', RandomForestRegressor()))
models.append(('L', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('R', Ridge()))
models.append(('BR', BayesianRidge()))
models.append(('GBR', GradientBoostingRegressor()))
models.append(('AB', AdaBoostRegressor()))
models.append(('ET', ExtraTreesRegressor()))
models.append(('BgR', BaggingRegressor()))

scoring = 'neg_mean_squared_error'
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=42)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Clear MORE unused variables to free memory
del globals()['unqLikesUIDs']
del globals()['unqLikesLIDs']
del globals()['profilesDF']
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

# Training Model
###############
print("training started")
nEST = 100
lR = 1.0
adaBoost = AdaBoostRegressor(n_estimators=nEST, learning_rate=lR)
adaBoost.fit(likesMAT, neusARR)
print("training completed")

# Save model
###############
joblib.dump(adaBoost, "/Users/jamster/adaBoost-A-neus.xz", compress=9)
print("model saved to disk")
print("DONE")
label_list = []
for i in range(len(Candidates)):
    if output_piece.iloc[i, 1] == training.iloc[L, 10]:
        label_list.append(1)
    else:
        label_list.append(0)
output_piece["lab"] = label_list
Output = Output.append(output_piece)

weight = len(Output) / sum(Output.lab)
Output['Weight'] = Output['lab'] * weight

from sklearn.ensemble import AdaBoostRegressor

m = AdaBoostRegressor()
X = Output.drop(["We", "Wc", "lab", "Weight"], axis=1)
y = Output.lab
m.fit(X, y, sample_weight=Output.Weight)

for L in range(len(test)):
    if test.iloc[L, 11] == False:
        We = test.iloc[L, 5]
        output_piece = pd.DataFrame()
        for Threshold in range(20):
            Candidates = p4.candidate_search(Dictionary, We, Threshold)
            if len(Candidates) >= 10:
                break
def test_boston():
    """Check consistency on dataset boston house prices."""
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
    assert score > 0.85
    model = DecisionTreeRegressor(criterion="mse")
    model_name = "REGRESSION_TREE"
elif selected_model == Model.RANDOM_FOREST:
    model = RandomForestRegressor(criterion="mse", n_estimators=20,
                                  min_samples_split=4, min_weight_fraction_leaf=0.01)
    model_name = "FOREST"
elif selected_model == Model.EXTRA_TREE_REGRESSOR:
    model = ExtraTreesRegressor(criterion="mse")
    model_name = "EXTRA_TREE_REGRESSOR"
elif selected_model == Model.GRADIENT_BOOSTING_REGRESSOR:
    model = GradientBoostingRegressor(loss="lad", n_estimators=200)
    model_name = "GRADIENT_BOOSTING_REGRESSOR"
elif selected_model == Model.BAGGING_REGRESSOR:
    model = BaggingRegressor(oob_score=True)
    model_name = "BAGGING_REGRESSOR"
elif selected_model == Model.ADABOOST_REGRESSOR:
    model = AdaBoostRegressor(loss="linear")
    model_name = "ADABOOST_REGRESSOR"
else:
    Support.colored_print("No method selected!", "red")
    sys.exit(0)

Support.colored_print("Method selected: " + model_name, "green")
Support.colored_print("Training...", "green")
t0 = time.time()
model.fit(X[:train_size], y[:train_size])
model_fit = time.time() - t0
print(model_name + " complexity and bandwidth selected and model fitted in %.3f s" % model_fit)

t0 = time.time()
y_model = model.predict(X_plot)
model_predict = time.time() - t0
print(model_name + " prediction for %d inputs in %.3f s" % (X_plot.shape[0], model_predict))
# Applied the DecisionTreeRegressor technique and achieved an R^2 score of 72.76%
from sklearn.tree import DecisionTreeRegressor

tree_ = DecisionTreeRegressor()
tree_.fit(X_train, y_train)
y_pred_2 = tree_.predict(X_test)
print("R^2 score is: " + str(tree_.score(X_test, y_test) * 100) + "%")
print("Mean Absolute Error: {}".format(mean_absolute_error(y_test, y_pred_2)))
print("Mean Squared Error: {}".format(mean_squared_error(y_test, y_pred_2)))
print("R Squared: {}".format(r2_score(y_test, y_pred_2)))

# Applied the AdaBoostRegressor technique and achieved an R^2 score of 40.82%
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(loss="exponential")
ada.fit(X_train, y_train)
y_pred_3 = ada.predict(X_test)
print("R^2 score is: " + str(ada.score(X_test, y_test) * 100) + "%")
print("Mean Absolute Error: {}".format(mean_absolute_error(y_test, y_pred_3)))
print("Mean Squared Error: {}".format(mean_squared_error(y_test, y_pred_3)))
print("R Squared: {}".format(r2_score(y_test, y_pred_3)))

# Applied the XGBRegressor technique and achieved an R^2 score of 87.05%
from xgboost import XGBRegressor

xgb = XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                   colsample_bytree=1, max_depth=7)
xgb.fit(X_train, y_train)
y_pred_8 = xgb.predict(X_test)
print("R^2 score is: " + str(xgb.score(X_test, y_test) * 100) + "%")
# (truncated assignment) ...train)[:, 20], np.array(test)[:, :20], np.array(data_10)[:, :20], np.array(data_10)[:, 20],
print('train', np.array(train).shape)
print(xtrain[1])
print('xtrain', xtrain.shape)
print('ytrain', ytrain.shape)
print('test', xtest.shape)

estimators = 100
# sup_vec = svm.SVC(C=11000, verbose=2, probability=True)
# sup_vec = RandomForestRegressor(n_estimators=estimators, verbose=2, n_jobs=-1, max_leaf_nodes=100)
# sup_vec = ExtraTreesRegressor(n_estimators=estimators, verbose=2, n_jobs=-1, max_leaf_nodes=100)
# sup_vec = AdaBoostRegressor(RandomForestRegressor(n_estimators=100, verbose=2, n_jobs=-1), n_estimators=100)
sup_vec = AdaBoostRegressor(ExtraTreesRegressor(n_estimators=100, verbose=2, n_jobs=-1),
                            n_estimators=160, loss='exponential')
# sup_vec = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10), n_estimators=300)
# dt_stump = DecisionTreeClassifier(max_depth=4, min_samples_leaf=1)
# dt_stump.fit(xtrain, ytrain)
# dt_stump_err = 1.0 - dt_stump.score(xtrain, ytrain)
# n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
# learning_rate = 1.
# sup_vec = AdaBoostClassifier(
#     base_estimator=dt_stump,
#     learning_rate=learning_rate,
filename = "blogData_train.csv" train_data = pd.read_csv(filename, header=None) #train_data = train_data.iloc[np.random.permutation(len(train_data))] train_output = train_data[len(train_data.columns) - 1] del train_data[len(train_data.columns) - 1] filename = "blogData_test-2012.02.01.00_00.csv" test_data = pd.read_csv(filename, header=None) #test_data = test_data.iloc[np.random.permutation(len(test_data))] test_output = test_data[len(test_data.columns) - 1] del test_data[len(test_data.columns) - 1] reg = LinearRegression() rf = RandomForestRegressor() gradBoost = GradientBoostingRegressor() ada = AdaBoostRegressor() #n_estimators=500 regressors = [reg, rf, gradBoost, ada] regressor_names = [ "Linear Regression", "Random Forests", "Gradient Boosting", "Adaboost" ] #regressors = ada #regressor_names = "Adaboost" for regressor, regressor_name in zip(regressors, regressor_names): regressor.fit(train_data, train_output) predicted_values = regressor.predict(test_data)
# importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

print(__doc__)

# Create the dataset
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])
# dataArr, labelArr = loadDataSet("input/7.AdaBoost/horseColicTraining2.txt")

# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300, random_state=rng)
regr_1.fit(X, y)
regr_2.fit(X, y)

# Predict
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)

# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

print()
print("Train data shape : ", x_train.shape)
print("Test data shape : ", x_test.shape)

# defining models
LR = LinearRegression()
SVM = SVR()
RF = RandomForestRegressor()
KNN = KNeighborsRegressor()
AB = AdaBoostRegressor()
GB = GradientBoostingRegressor()


# function for training and prediction
def train_predict(model, trainX, trainY, testX, testY):
    model.fit(trainX, trainY)
    y_pred = model.predict(testX)
    acc_test = mse(testY, y_pred)
    return acc_test


LR_acc = train_predict(LR, x_train, y_train, x_test, y_test)
print("The error score of Linear Regression is : %f" % LR_acc)
SVM_acc = train_predict(SVM, x_train, y_train, x_test, y_test)
def make_preds(start, stop, lookback):
    X = pd.read_pickle('Xdatanormalized.p')
    X = X.drop(['ebitdamultiple1', 'pctaffochange1', 'pctaffochange2',
                'pctaffochange3', 'pctaffochange4'], axis=1)
    X = X[X['PMSector1'] != 'Hotel']
    y = pd.read_pickle('Ydatanormalized.p')
    X = X.merge(y, how='left', on=['PMSector1', 'date_bom'])
    X = X.dropna(subset=['trt'])
    test_X = X[X['date_bom'] == stop]
    test_sectors = test_X['PMSector1']
    test_y = test_X['trt']
    test_X = test_X.drop(['trt', 'date_bom', 'PMSector1'], axis=1)
    X = X[(abs(X['trt']) < 0.20)]
    y = X['trt']
    y = np.log(1 + y)
    X = X.drop(['trt'], axis=1)
    X.reset_index(inplace=True)
    X_date_bom = pd.DataFrame(X['date_bom'], columns=['date_bom'])
    X = X[X.columns.difference(['date_bom', 'PMSector1', 'index'])]
    index_vals = X_date_bom

    pca = PCA()
    selection = SelectKBest()
    poly = PolynomialFeatures()
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection), ("poly", poly)])
    ada = AdaBoostRegressor()
    rfr = RandomForestRegressor()
    imputer = Imputer()
    pipeline = Pipeline(steps=[
        ('imputer', imputer),
        ('features', combined_features),
        ('regressor', ada)])
    param_grid = [
        {
            'features__pca__n_components': [5],
            'features__univ_select__k': [2],
            'features__poly__degree': [1],
            'regressor': [rfr],
            'regressor__n_estimators': [400],
            'regressor__criterion': ['mse'],
            'regressor__min_samples_leaf': [1, 2],
            'regressor__max_depth': [3, 5, 7],
        },
        {
            'features__pca__n_components': [5],
            'features__univ_select__k': [2],
            'features__poly__degree': [1],
            'regressor': [ada],
            'regressor__learning_rate': [1, 0.1],
            'regressor__n_estimators': [400]
        }
    ]
    tscv = custom_timeseries_within(index_vals=X_date_bom, lookback=lookback, test=stop)
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring=my_scorer, cv=tscv, verbose=1)
    grid_search.fit(X, y)
    print(grid_search.best_score_)
    print(grid_search.best_params_)
    preds = grid_search.predict(test_X)
    final = pd.concat([pd.DataFrame(preds.reshape(-1)), test_y.reset_index(drop=True)],
                      axis=1, ignore_index=True)
    final = pd.concat([final, test_sectors.reset_index(drop=True)], axis=1, ignore_index=True)
    final.columns = ['pred_trt', 'actual_trt', 'PMSector1']
    return final
grid_result = grid.fit(rescaledX, Y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f, (%f) with : %r" % (mean, stdev, param))

# Ensemble
ensembles = []
ensembles.append(('ScalesAB', Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])))
ensembles.append(('ScalesGBM', Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor())])))
ensembles.append(('ScalesRF', Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestRegressor())])))
ensembles.append(('ScalesET', Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesRegressor())])))

results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model,
from sklearn.ensemble import RandomForestRegressor

# In[54]:

rf = RandomForestRegressor(n_estimators=200, random_state=45)
rf.fit(x_train, y_train)

# In[55]:

pred = rf.predict(x_test)
pred

# In[56]:

from sklearn.ensemble import AdaBoostRegressor

model = AdaBoostRegressor()
model.fit(x_train, y_train)
print(model.score(x_train, y_train))
abpred = model.predict(x_test)
print(abpred)
model.score(x_test, y_test)

# In[57]:

import joblib  # sklearn.externals.joblib is deprecated; import joblib directly

joblib.dump(abpred, 'abpredsave.obj')

# # # Completed
                      min_samples_split=2,
                      min_samples_leaf=1,
                      min_weight_fraction_leaf=0.0,
                      max_features=1,
                      max_leaf_nodes=None,
                      min_impurity_decrease=0.0,
                      min_impurity_split=None,
                      bootstrap=True,
                      oob_score=True,
                      n_jobs=None,
                      random_state=random_state,
                      verbose=0,
                      warm_start=False),
AdaBoostRegressor(base_estimator=None,
                  n_estimators=50,
                  learning_rate=1.0,
                  loss='linear',
                  random_state=random_state),
GradientBoostingRegressor(loss='ls',
                          learning_rate=0.1,
                          n_estimators=100,
                          subsample=1.0,
                          criterion='friedman_mse',
                          min_samples_split=2,
                          min_samples_leaf=1,
                          min_weight_fraction_leaf=0.0,
                          max_depth=3,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          init=None,
                          random_state=random_state,
# Small Images
idxs = np.arange(540)[::2]
train_idxs, valid_idxs = train_test(idxs, 0.8)

images = [
    ki.img_to_array(
        ki.load_img('../data/images/%g.jpg' % id, target_size=(224, 224)))
    for id in np.arange(956)
]
images = np.array(images)

# X = train_idxs.reshape(-1,1)
X = idxs.reshape(-1, 1)
# X = images[train_idxs]
# y = l_activity[train_idxs][:,8]
y = l_activity[idxs][:, 8]
VX = images[valid_idxs]
Vy = l_activity[valid_idxs]

dor = SKDeepOracleRegressor()
abr = AdaBoostRegressor(base_estimator=dor, n_estimators=20)
scores = cross_val_score(abr, X, y)

import pdb
pdb.set_trace()
modelfit(alg6, train, test, predictors, target, IDcol, 'alg6.csv')
coef6 = pd.Series(alg6.feature_importances_, predictors).sort_values(ascending=False)
coef6.plot(kind='bar', title='Feature Importances')
plt.show()
alg6_accuracy = round(alg6.score(X_train, Y_train) * 100, 2)
alg6_accuracy

# In[85]:

# AdaBoost Model
from sklearn.ensemble import AdaBoostRegressor

predictors = [x for x in train.columns if x not in [target] + IDcol]
alg7 = AdaBoostRegressor(n_estimators=2000, learning_rate=0.05)
modelfit(alg7, train, test, predictors, target, IDcol, 'alg7.csv')
coef7 = pd.Series(alg7.feature_importances_, predictors).sort_values(ascending=False)
coef7.plot(kind='bar', title='Feature Importances')
alg7_accuracy = round(alg7.score(X_train, Y_train) * 100, 2)
alg7_accuracy

# In[86]:

# Gradient Boost Model
from sklearn.ensemble import GradientBoostingRegressor

predictors = [x for x in train.columns if x not in [target] + IDcol]
alg8 = GradientBoostingRegressor(n_estimators=50, learning_rate=0.03, max_depth=4)
def book_meta_clf(self, meta_name="GradientBoostingRegressor", **params):
    """
    Book the meta clf (set all the parameters)
    """
    if meta_name == "AdaBoostRegressor":
        if "base_estimator" not in params:  params["base_estimator"] = self.book_base_clf(base_name="DecisionTreeRegressor")
        if "n_estimators" not in params:    params["n_estimators"] = 100
        if "learning_rate" not in params:   params["learning_rate"] = 1.
        if "loss" not in params:            params["loss"] = "square"  # {'linear', 'square', 'exponential'}
        if "random_state" not in params:    params["random_state"] = None
        clf = AdaBoostRegressor(**params)

    if meta_name == "BaggingRegressor":
        if "base_estimator" not in params:  params["base_estimator"] = self.book_base_clf()
        if "n_estimators" not in params:    params["n_estimators"] = 50
        if "max_samples" not in params:     params["max_samples"] = 1.
        if "max_features" not in params:    params["max_features"] = 1.
        if "bootstrap" not in params:       params["bootstrap"] = True
        if "bootstrap_features" not in params: params["bootstrap_features"] = False
        if "oob_score" not in params:       params["oob_score"] = False
        if "warm_start" not in params:      params["warm_start"] = False
        if "n_jobs" not in params:          params["n_jobs"] = 20
        if "random_state" not in params:    params["random_state"] = None
        if "verbose" not in params:         params["verbose"] = 1
        clf = BaggingRegressor(**params)

    if meta_name == "ExtraTreesRegressor":
        if "n_estimators" not in params:    params["n_estimators"] = 200
        if "min_samples_split" not in params: params["min_samples_split"] = 2
        if "min_samples_leaf" not in params:  params["min_samples_leaf"] = 200
        if "min_weight_fraction_leaf" not in params: params["min_weight_fraction_leaf"] = 0.02
        if "max_features" not in params:    params["max_features"] = 'auto'
        if "max_leaf_nodes" not in params:  params["max_leaf_nodes"] = None
        if "bootstrap" not in params:       params["bootstrap"] = False
        if "oob_score" not in params:       params["oob_score"] = False
        if "warm_start" not in params:      params["warm_start"] = False
        if "n_jobs" not in params:          params["n_jobs"] = 20
        if "random_state" not in params:    params["random_state"] = None
        if "verbose" not in params:         params["verbose"] = 1
        clf = ExtraTreesRegressor(**params)

    if meta_name == "RandomForestRegressor":
        if "n_estimators" not in params:    params["n_estimators"] = 100
        if "min_samples_split" not in params: params["min_samples_split"] = 2
        if "min_samples_leaf" not in params:  params["min_samples_leaf"] = 200
        if "min_weight_fraction_leaf" not in params: params["min_weight_fraction_leaf"] = 0.02
        if "max_features" not in params:    params["max_features"] = 'auto'
        if "max_leaf_nodes" not in params:  params["max_leaf_nodes"] = None
        if "bootstrap" not in params:       params["bootstrap"] = True
        if "oob_score" not in params:       params["oob_score"] = False
        if "warm_start" not in params:      params["warm_start"] = False
        if "n_jobs" not in params:          params["n_jobs"] = 20
        if "random_state" not in params:    params["random_state"] = None
        if "verbose" not in params:         params["verbose"] = 1
        clf = RandomForestRegressor(**params)

    if meta_name == "GradientBoostingRegressor":
        if "n_estimators" not in params:    params["n_estimators"] = 200
        if "learning_rate" not in params:   params["learning_rate"] = 1.
        if "loss" not in params:            params["loss"] = "ls"  # or "huber"
        if "min_samples_split" not in params: params["min_samples_split"] = 2
        if "min_samples_leaf" not in params:  params["min_samples_leaf"] = 50
        if "min_weight_fraction_leaf" not in params: params["min_weight_fraction_leaf"] = 0.005  # 0.02
        if "max_features" not in params:    params["max_features"] = 'auto'
        if "max_leaf_nodes" not in params:  params["max_leaf_nodes"] = None
        if "warm_start" not in params:      params["warm_start"] = False
        if "random_state" not in params:    params["random_state"] = None
        if "verbose" not in params:         params["verbose"] = 1
        if "alpha" not in params:           params["alpha"] = 0.9
        if "init" not in params:            params["init"] = None
        if "presort" not in params:         params["presort"] = 'auto'
        clf = GradientBoostingRegressor(**params)

    return clf
# input data
housing_data = datasets.load_boston()

# Shuffle the data; random_state controls how it is shuffled
X, y = shuffle(housing_data.data, housing_data.target, random_state=7)

# 80% as training samples, 20% as test samples
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Train
dt_regressor = DecisionTreeRegressor(max_depth=4)
dt_regressor.fit(X_train, y_train)  # fit

ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                 n_estimators=400, random_state=7)
ab_regressor.fit(X_train, y_train)

# Predict
y_pred_dt = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_dt)
evs = explained_variance_score(y_test, y_pred_dt)
print("\n###### Decision tree performance ######")
print('Mean squared error = ', round(mse, 2))
print('Explained variance score = ', round(evs, 2))

y_pred_ad = ab_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_ad)
evs = explained_variance_score(y_test, y_pred_ad)
print('\n##### Improvement from AdaBoost #####')
print('Mean squared error = ', round(mse, 2))
def ContributingFeaturesByYear(model, year):
    fout.write('Years : ' + str(year) + ' - ' + str(year + 4) + '\n')
    print('Years : ', year, ' - ', year + 4)
    df_filtered = df[(df['year'] >= year) & (df['year'] <= year + 4)]
    X = df_filtered[names]
    Y = df_filtered['WinPercentage']

    # Convert the target to a numpy array
    arr_target = Y.as_matrix()
    # Convert the dataframe to a numpy array
    arr_X = X.as_matrix()

    arr_X_train, arr_X_val, arr_target_train, arr_target_val = train_test_split(
        arr_X, arr_target, test_size=0.1, random_state=20)

    # PCA
    num_comp = []
    per_variance = []
    fout.write('Variance vs No. Of Features\n')
    fout.write('-' * 30 + '\n')
    for n in (0.99, 0.95, 0.90, 0.75, 0.65, 0.5):
        p = PCA(n_components=n).fit(arr_X_train)
        # print(p.explained_variance_)
        print(n * 100, len(p.explained_variance_))
        fout.write(str(n * 100) + '\t' + str(len(p.explained_variance_)) + '\n')
        num_comp.append(len(p.explained_variance_))
        per_variance.append(n * 100)
    # num_comp.append(11)  # the entire feature set

    # hyperparameters
    pca_val = num_comp
    alpha_val = np.logspace(-5, 5, num=11, base=2)
    c_val = np.logspace(-5, 5, num=11, base=2)  # C for SVL and SVG
    g_val = np.logspace(-5, 5, num=11, base=2)  # gamma for SVG

    if model == 'lr':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('lr', linear_model.LinearRegression())])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val), cv=10)
    elif model == 'rr':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('rr', linear_model.Ridge())])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val, rr__alpha=alpha_val), cv=10)
    elif model == 'rf':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('rf', RandomForestRegressor(n_estimators=20, max_depth=4, random_state=5))])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val), cv=10)
    elif model == 'svrl':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('svr_lin', SVR(kernel='linear', C=1))])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val, svr_lin__C=c_val), cv=10)
    elif model == 'svrg':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('svr_gaussian', SVR(kernel='rbf', C=1, gamma=1))])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val,
                                     svr_gaussian__C=c_val,
                                     svr_gaussian__gamma=g_val), cv=10)
    elif model == 'adaboostRF':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('adaboost', AdaBoostRegressor(RandomForestRegressor(), random_state=0))])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val), cv=10)
    elif model == 'adaboostSVR':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('svr_adaboost', AdaBoostRegressor(SVR(), random_state=0))])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val), cv=10)

    gs.fit(arr_X_train, arr_target_train)
    # print(gs.predict(arr_X_val))
    predictions = gs.predict(arr_X_val)
    print(gs.score(arr_X_val, arr_target_val))
    print('best_score')
    print(gs.best_score_)
    fout.write('Accuracy : ' + str(gs.best_score_) + '\n')
    print('best_estimator')
    print(gs.best_estimator_)
    print('best_params')
    print(gs.best_params_)
    fout.write('Best Number of Features : ' + str(gs.best_params_) + '\n')

    fout.write('\n\nImportant Features :\n')
    fout.write('-' * 30 + '\n')
    # Find the k best features
    k_val = list(set(num_comp))
    for j in range(0, len(k_val)):
        # if k_val[j] != 9:
        contributingFeatures = []
        skb = SelectKBest(f_regression, k=k_val[j])
        arr_X_train_reshape = skb.fit(arr_X_train, arr_target_train)
        # arr_patrons_sales_events_val_reshape = skb.transform(arr_patrons_sales_events_val)
        print('The top ', k_val[j], ' features are: ')
        fout.write('The top ' + str(k_val[j]) + ' features are: \n')
        # get_support() returns True or False for each feature depending on
        # whether it matters for predicting the category or not
        get_features = skb.get_support()
        for i in range(0, len(get_features)):
            if get_features[i]:
                contributingFeatures.append(cols_to_keep[i])
                print(i, cols_to_keep[i])
                fout.write(cols_to_keep[i] + '\n')
        fout.write('\n')
def model_training_regressor(X, Y, test_ratio, verbose_mode, name):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_ratio, shuffle=False)
    if name == "MLP":
        model = MLPRegressor(hidden_layer_sizes=(200, 50), activation='relu', solver='adam',
                             alpha=0.0002, batch_size='auto', learning_rate='adaptive',
                             learning_rate_init=0.01, power_t=0.5, max_iter=10000,
                             shuffle=True, random_state=None, tol=0.0001,
                             verbose=verbose_mode, warm_start=False, momentum=0.9,
                             nesterovs_momentum=True, early_stopping=False,
                             validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                             epsilon=1e-08, n_iter_no_change=10).fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)
    elif name == "NaiveBayes":
        # note: naive Bayes has no regression variant, so Bayesian ridge regression is used here
        model = linear_model.BayesianRidge().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)
    elif name == "SVM":
        model = svm.LinearSVR(random_state=0, tol=1e-5).fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)
    elif name == "DT":
        model = tree.DecisionTreeRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)
    elif name == "KNN":
        model = neighbors.KNeighborsRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)
    elif name == "RandomForest":
        model = RandomForestRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)
    elif name == "Adaboost":
        model = AdaBoostRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)
    elif name == "GradientBoost":
        model = GradientBoostingRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)
    else:
        ret = dict()
        print("no available model")
        return ret
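# An illustrative call to model_training_regressor above; it assumes the
# extract's helpers (get_model_performance and the sklearn imports) are in
# scope, and X_f / y_f stand for feature and target arrays.
# X_f, y_f = make_regression(n_samples=300, n_features=10, random_state=0)
# perf = model_training_regressor(X_f, y_f, test_ratio=0.2, verbose_mode=False, name="Adaboost")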
plt.scatter(dates, y_test, c="#ADD8E6", label="Actual Closing Stock Price")
plt.title("Predicted Closing Price of Stock and Actual Closing Price of Stock using Random Forest Regressor", fontsize=20)
plt.xlabel('Time in Years', fontsize=20)
plt.ylabel('Stock Closing Price Predictions(Random Forest Regressor)', fontsize=20)
plt.legend(loc='best')
plt.show()

# In[23]:

# Test Case 1: with learning_rate set to 0.001, the MSE obtained is 2.58
# Test Case 2: with learning_rate set to 0.001, the MSE obtained is 2.5819
# Test Case 3: with learning_rate set to 0.000167, the MSE obtained is 2.580
print("AdaBoost Regressor")
regr_ada = AdaBoostRegressor(random_forest, n_estimators=30, random_state=1, learning_rate=0.000167)
regr_ada.fit(df_train_norm, y_train)
regr_ada_pred = regr_ada.predict(df_test_norm)
acc_ada_mse = mean_squared_error(y_test, regr_ada_pred)
print("MSE = " + str(acc_ada_mse))
r2_ada = r2_score(y_test, regr_ada_pred)
print("r2_score = " + str(r2_ada))

plt.figure(figsize=(20, 20))
plt.scatter(dates, regr_ada_pred, label="Predicted Closing Stock Price")
plt.scatter(dates, y_test, c="orange", label="Actual Closing Stock Price")
plt.title("Predicted Closing Price of Stock and Actual Closing Price of Stock using ADA Boost Regressor", fontsize=20)
plt.xlabel('Time in Years', fontsize=20)
plt.ylabel('Stock Closing Price Predictions(AdaBoost-RandomForest Regressor)', fontsize=20)
plt.legend(loc='best')
plt.show()
# mean absolute error in $
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)

# checking r^2
from sklearn.metrics import r2_score
print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
print(bg.score(X_train, y_train))
print(bg.score(X_test, y_test))

# AdaBoost
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))

# Decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
print(dt.score(X_test, y_test))

# Gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
print(gb.score(X_train, y_train))
print(gb.score(X_test, y_test))
def test_regression_toy():
    """Check regression on a toy dataset."""
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)
score = clf.score(X_test, Y_test)
print(score)
y_pred = clf.predict(X_test)

names = [
    "Decision Tree Regressor", "MLP Regressor", "Random Forest Regressor",
    "AdaBoost", "Bagging Regressor", "Extra Trees Regressor"
]

classifiers = [
    DecisionTreeRegressor(max_depth=5, max_features=1),
    MLPRegressor(alpha=1, max_iter=200, power_t=0.9, batch_size=50),
    RandomForestRegressor(max_depth=5, max_features=1, n_estimators=10),
    AdaBoostRegressor(n_estimators=10),
    BaggingRegressor(max_features=1, n_estimators=10, base_estimator=clf),
    ExtraTreesRegressor(max_depth=5)
]

for name, clf in zip(names, classifiers):
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    y_pred = clf.predict(X_test)
    print(name + ": " + str(score))
    msle = mean_squared_log_error(Y_test, y_pred)
    print('MSLE: %.4f' % msle)
    # print(confusion_matrix(Y_test, y_pred, labels=None))
    # print(cohen_kappa_score(Y_test, y_pred, labels=None))
    # print(classification_report(Y_test, y_pred, labels=None))
)

# instantiating AdaBoostClassifier
abc = AdaBoostClassifier(n_estimators=100, random_state=0)
abc.fit(trainFeat, trainLabels)
print("Feature importances for AdaBoostClassifier: ")
print(abc.feature_importances_)
# make predictions for test data
predictions = abc.predict(testFeat)
accuracy = accuracy_score(testLabels, predictions)
print("Accuracy of AdaBoostClassifier: %.2f%%" % (accuracy * 100.0))
cm = confusion_matrix(testLabels, predictions)
# the count of true negatives is A00, false negatives is A10, true positives is A11 and false positives is A01
print('confusion matrix:\n %s' % cm)

# instantiating AdaBoostRegressor (its continuous predictions are rounded to class labels below)
abr = AdaBoostRegressor(random_state=0, n_estimators=100)
abr.fit(trainFeat, trainLabels)
print("Feature importances for AdaBoostRegressor: ")
print(abr.feature_importances_)
# make predictions for test data
predictions = abr.predict(testFeat)
accuracy = accuracy_score(testLabels, predictions.round())
print("Accuracy of AdaBoostRegressor: %.2f%%" % (accuracy * 100.0))
cm = confusion_matrix(testLabels, predictions.round())
# the count of true negatives is A00, false negatives is A10, true positives is A11 and false positives is A01
print('confusion matrix:\n %s' % cm)

# instantiating XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(trainFeat, trainLabels)
print("Feature importances for XGBClassifier: ")
lgbm_early_stopping_rounds = 100
seed = 2017

#############################################################################################
# parameters : ensemble regressors
#############################################################################################
randomforest = RandomForestRegressor(n_estimators=600, max_depth=10, n_jobs=20,
                                     random_state=2017, max_features="auto", verbose=1)
adaboost = AdaBoostRegressor(n_estimators=30, random_state=2017, learning_rate=0.01)
gbdt = GradientBoostingRegressor(learning_rate=0.04, n_estimators=100, subsample=0.8,
                                 random_state=2017, max_depth=5, verbose=1)
extratree = ExtraTreesRegressor(n_estimators=600, max_depth=8, max_features="auto",
                                n_jobs=20, random_state=2017, verbose=1)
lr_reg = LinearRegression(n_jobs=-1)
def predict(self, X):
    return self.m.predict(X)[0]


regressors = [
    ("k-nearest Neighbors", None, KNeighborsRegressor(2)),
    ("SVM - Linear", None, SVR(kernel="linear")),
    ("SVM - RBF", None, SVR(gamma=2, C=1)),
    ("Decision Tree", None, DecisionTreeRegressor(min_samples_split=1024, max_depth=20)),
    ("Random Forest", None, RandomForestRegressor(n_estimators=10, min_samples_split=1024, max_depth=20)),
    ("AdaBoost", None, AdaBoostRegressor(random_state=13370)),
    ("Naive Bayes", None, GaussianNB()),
    # ("Bagging with DTRegg", ["All"], BaggingRegressor(DecisionTreeRegressor(min_samples_split=1024,
    #                                                                         max_depth=20))),
    # ("GP isotropic RBF", None, gp.GaussianProcessRegressor(kernel=gp.kernels.RBF())),
    # ("GP anisotropic RBF", ["All"], gp.GaussianProcessRegressor(kernel=gp.kernels.RBF(length_scale=np.array([1]*n_feats)))),
    ("GP ARD", ["All"], gp.GaussianProcessRegressor(
        kernel=ard_kernel(sigma=1.2, length_scale=np.array([1] * n_feats)))),
    # ("GP isotropic matern nu=0.5", None, gp.GaussianProcessRegressor(kernel=gp.kernels.Matern(nu=0.5))),
    # ("GP isotropic matern nu=1.5", None, gp.GaussianProcessRegressor(kernel=gp.kernels.Matern(nu=1.5))),
    ("GP Isotropic Matern", None, gp.GaussianProcessRegressor(kernel=gp.kernels.Matern(nu=2.5))),  # bad performance
    ("GP Dot Product", ["CFS", "CIFE", "MFCC", "All"],
     gp.GaussianProcessRegressor(kernel=gp.kernels.DotProduct())),
from joblib import dump
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor

X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)
regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regr.fit(X, y)
dump(regr, 'model.joblib')
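# A companion sketch to the dump above: reload the persisted regressor with
# joblib.load and reuse it; 'model.joblib' matches the filename used when saving.
from joblib import load

restored = load('model.joblib')
print(restored.predict(X[:3]))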
print(r2_score(y_train, model.predict(X_train)))
model.intercept_
model.coef_
mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)
r2_score(y_train, model.predict(X_train))

# ADABOOST WITH DECISION TREE BASE
adaboostmodel = AdaBoostRegressor(DecisionTreeRegressor(max_depth=3),
                                  learning_rate=4, n_estimators=450)
adaboostmodel.fit(xencoded, y)
adaboostmodel.score(x_test, y_test)

# PLOT ENCODED VALUES
sns.pairplot(xencoded)

# In[ ]:

model4 = RandomForestRegressor()
grid_params_RF = {
    'n_estimators': range(50, 90, 10),
    'max_depth': [15, 16, 17, 18, 19, 20, 21]
}
clf4 = GridSearchCV(model4, grid_params_RF, cv=4, scoring='r2')
clf4.fit(x_scaled, y)
# df, attributes = preprocess.preprocess(df)
attributes = list(df.columns.values)[1:]
attributes.remove('DateTime')
attributes.remove('PredDelay')

# Initialise Regressors
regressors = {
    'gbr_reg': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                         max_depth=1, random_state=0, loss='ls'),
    'ada_reg': AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                 n_estimators=300,
                                 random_state=np.random.RandomState(1))
}

# Initialise Classifiers
classifiers = {
    'svm_clf': svm.SVC(),
    'bernolli_rbm_clf': BernoulliRBM(n_components=2),
    'decision_tree_clf': tree.DecisionTreeClassifier()
}

window_size = 20
window_start = 0
window_end = window_start + window_size

print("Window Size: ", window_size)