def regression(self, metric="root_mean_squared_error", folds=10, alphas=None, graph=False):
    """Cross-validate a fixed panel of regressors and display a ranked report.

    Every model is scored with ``cross_val_score`` on ``self.Xt_train`` and
    ``self.yt_train`` using an unshuffled ``KFold``. The scores are negated,
    then the per-model mean, standard deviation and coefficient of variation
    are tabulated and shown. The panel is also stored on ``self.models``.

    Args:
        metric: Scoring name forwarded to ``cross_val_score``.
        folds: Number of ``KFold`` splits.
        alphas: Candidate regularisation strengths for the Ridge CV model.
            When omitted or empty, the estimator's own default grid is used.
        graph: When true, also draw a box plot of the per-fold scores.

    Returns:
        None. The report is shown with ``display`` and, optionally, a plot.
    """
    # NOTE(review): scores are multiplied by -1 below, which presumably expects
    # a "neg_*" scorer; the default metric name has no such prefix - confirm
    # that it is accepted by the installed scikit-learn.
    size = 1.3 * self.report_width // 10

    # A None default avoids sharing one mutable list object between calls.
    alphas = [] if alphas is None else alphas
    # An empty grid gives the CV search nothing to choose from, so fall back
    # to the estimator's built-in candidates in that case.
    ridge_cv = rcvr(alphas=alphas) if len(alphas) > 0 else rcvr()

    models = {
        "Linear regressor": lr(),
        "Lasso regressor": lassor(),
        "Lasso CV regressor": lassocvr(),
        "Ridge regressor": rr(alpha=0, normalize=True),
        "Ridge CV regressor": ridge_cv,
        "K nearest neighbors regressor K2u": knnr(n_neighbors=2, weights='uniform'),
        "K nearest neighbors regressor K2d": knnr(n_neighbors=2, weights='distance'),
        "K nearest neighbors regressor K5": knnr(n_neighbors=5),
        "K nearest neighbors regressor K10": knnr(n_neighbors=10),
        "SGD regressor": sgdr(max_iter=10000, warm_start=True),
        "Decision tree regressor": dtr(),
        "Decision tree regressor D3": dtr(max_depth=3),
        "Random forest regressor": rfr(),
        "Ada boost regressor": abr(),
        "Gradient boost regressor": gbr(),
        "Support vector regressor": svr(),
    }
    self.models = models

    print('\n')
    print(self.report_width * '*', '\n*')
    print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*')

    kf = KFold(n_splits=folds)
    target = self.yt_train.values.ravel()
    results = []
    names = []
    for model_name, model in models.items():
        # Negate so that lower values mean a better model in the report.
        cv_scores = -1 * cross_val_score(model, self.Xt_train, target,
                                         cv=kf, scoring=metric)
        results.append(cv_scores)
        names.append(model_name)
    print(self.report_width * '*', '')

    report = pd.DataFrame({'Regressor': names, 'Score': results})
    report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
    report['Score (std)'] = report.Score.apply(lambda x: x.std())
    # Coefficient of variation, in percent.
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True)
    report.drop('Score', axis=1, inplace=True)
    display(report)
    print('\n')

    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Regressor Comparison')
        # Boxes follow insertion order, which matches `names`.
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0)
        plt.show()
    return None
def best_model(xt, xv, yt, yv):
    """Fit three tree-based regressors and return the one with the lowest MAE.

    Each candidate is fitted on ``(xt, yt)`` and scored on ``(xv, yv)`` with
    ``get_mae``. The XGBoost candidate is also given the validation pair as
    its ``eval_set`` with early stopping after 10 rounds.

    Args:
        xt: Training features.
        xv: Validation features.
        yt: Training target.
        yv: Validation target.

    Returns:
        The fitted estimator whose validation MAE is smallest (the first one
        in fitting order when several tie).
    """
    # (name, estimator, extra keyword arguments for fit)
    candidates = [
        ("DecisionTreeRegressor", dtr(random_state=1), {}),
        ("RandomForestRegressor", rfr(random_state=1), {}),
        (
            "XGBRegressor",
            xgb(random_state=1, n_estimators=10000, learning_rate=0.01),
            {'early_stopping_rounds': 10, 'eval_set': [(xv, yv)], 'verbose': False},
        ),
    ]

    scored = []
    for label, estimator, fit_options in candidates:
        estimator.fit(xt, yt, **fit_options)
        scored.append({'name': label,
                       'model': estimator,
                       'mae': get_mae(estimator, xv, yv)})

    print("\n")
    for entry in scored:
        print("Model {} has MAE {}".format(entry.get('name'), entry.get('mae')))

    lowest = min(entry['mae'] for entry in scored)
    winners = [entry for entry in scored if entry.get('mae') == lowest]
    chosen = winners[0]
    print("\nBest model pick: ", chosen.get('name'))
    print("\n")
    return chosen.get('model')
def __init__(self, pathToData):
    """Set up the boosting experiment configuration.

    Args:
        pathToData: Stored unchanged on ``self.dataFilePath``.
    """
    # Descriptive labels for this experiment.
    self.algoname = 'Boosting'
    self.datasetName = 'Abalone'
    self.dataFilePath = pathToData

    # The boosted ensemble is built on top of a default decision tree.
    tree = dtr()
    self.baseEstimater = tree
    self.classifier = abr(base_estimator=tree)

    self.cv = 5
def regressor(file, X, Y, x, y):
    """Exhaustively try decision-tree hyper-parameter combinations.

    For every combination in the Cartesian product of the candidate lists, a
    tree is fitted on ``(X, Y)`` and scored on ``(x, y)``. All scores and the
    matching parameter lists are then handed to ``_results``.

    The candidate lists other than ``criterion`` (``splitter``, ``max_depth``,
    ...) are read from the enclosing module scope.

    Args:
        file: Passed through to ``_results``.
        X: Training features.
        Y: Training target.
        x: Evaluation features.
        y: Evaluation target.
    """
    criterion = ['mse', 'friedman_mse', 'mae']
    # Keyed by constructor argument name so each value reaches the right
    # parameter; the original passed the whole combination as a single
    # positional list, i.e. as the first constructor argument only.
    grid = {
        'criterion': criterion,
        'splitter': splitter,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'min_weight_fraction_leaf': min_weight_fraction_leaf,
        'max_features': max_features,
        'random_state': random_state,
        'max_leaf_nodes': max_leaf_nodes,
        'presort': presort,
    }
    arg_names = list(grid)

    param = []
    acc = []
    for combo in it.product(*grid.values()):
        dtree = dtr(**dict(zip(arg_names, combo)))
        dtree.fit(X, Y)
        acc.append(dtree.score(x, y))
        # Same list-of-values shape and ordering as before.
        param.append(list(combo))
    _results(file, acc, param)
# Load the saved checkpoints one after another. Each load rebinds `model`,
# so only the estimator from the last file remains bound afterwards.
for checkpoint in ('gbdt_125.model', 'gbdt_132.model',
                   'gbdt_151.model', 'gbdt_170.model'):
    model = joblib.load(checkpoint)

# Override hyper-parameters on the loaded estimator.
for attribute, value in (('learning_rate', 0.005),
                         ('n_estimators', 3000),
                         ('max_leaf_nodes', 60),
                         ('max_depth', 10),
                         ('subsample', 0.16),
                         ('max_features', 0.04)):
    setattr(model, attribute, value)

# Disabled alternative (gradient boosting initialised from a saved model):
#   lr=joblib.load('gbdt_97.model')
#   lr=model
#   model=GradientBoostingRegressor(init=lr,loss='ls',n_estimators=50,
#       learning_rate=0.01,max_depth=10,min_samples_leaf=20,
#       max_features=0.05,subsample=0.2,max_leaf_nodes=60)

# Base learner for the ensemble below.
bm = dtr(splitter='random', max_depth=6, min_samples_leaf=2, max_leaf_nodes=60)

# Disabled alternative (bagging over the same base learner):
#   model=BaggingRegressor(base_estimator=bm,n_estimators=2000,bootstrap=True,
#       bootstrap_features=1,max_samples=0.16,max_features=0.05)

# `model` is rebound here to a fresh AdaBoost ensemble, which is what gets fitted.
model = AdaBoostRegressor(
    base_estimator=bm,
    n_estimators=300,
    learning_rate=0.03,
    loss='square',
)
model.fit(xfit, yfit.flatten())

# Score the sum of the ensemble prediction and Yam with ROC AUC.
probs = 1 * model.predict(xval) + 1 * Yam.flatten()
fpr, tpr, thresholds = roc_curve(yval, probs)
roc_auc = auc(fpr, tpr)
print(roc_auc)

# `probs` is reassigned twice below; the last assignment is the one that sticks.
probs = f_predict1(xval, Yam)
ytest = f_predict1(x0t, ytest0)
probs = 1 * Yam.flatten()
# Generator usage reminders:
#   rng.rand(10)    -> 10 random numbers
#   rng.rand(2, 3)  -> a 2-row, 3-column array of random numbers

# Horizontal-axis data: 80 random values, sorted ascending. rand() draws from
# [0, 1), so the samples are multiplied by 5 to cover [0, 5). The (80, 1)
# shape is used because the tree model only accepts a 2-D feature array.
X = np.sort(5 * rng.rand(80, 1), axis=0)

# Vertical-axis data: the sine of X. Unlike the features, the target must be
# one-dimensional, hence the ravel().
Y = np.sin(X).ravel()

# Add noise to every fifth point, since a perfect sine curve does not occur
# in practice: (0.5 - rand) is multiplied by 3 to amplify the noise.
Y[::5] = Y[::5] + 3 * (0.5 - rng.rand(16))

# Create and train two trees that differ only in their maximum depth.
dtr1 = dtr(max_depth=2)  # maximum depth 2
dtr2 = dtr(max_depth=5)  # maximum depth 5
dtr1.fit(X, Y)
dtr2.fit(X, Y)

# Test data: arange(start, stop, step), reshaped to a single column because
# the estimator interface expects 2-D features.
X_test = np.arange(0.0, 5.0, 0.01).reshape(-1, 1)

# predict() returns the fitted model's output for the test points.
Y1_test = dtr1.predict(X_test)
Y2_test = dtr2.predict(X_test)

# Plot the data.
# Plot argument notes: s=20 marker size, edgecolor="black" marker border,
# c="darkorange" marker colour, label="data" legend entry for the points,
# label="max_depth=2" legend entry for a line, linewidth=2 line width.
plt.figure()  # prepare the canvas
# ----- partition the data ----------------------------------------------------
# Holding out part of the data lets us estimate how the model performs on
# samples it has never seen. If the model were trained on all of the data, it
# would be hard to tell whether overfitting has taken place.
#
# test_size sets the share of the data reserved for the hold-out set
# (about 33% here).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    data['SalePrice'],
    test_size=0.33,
    random_state=42,
)

# ----- decision tree regression ----------------------------------------------
print('fitting a decision tree regression model...')

# max_depth=None leaves the tree depth at its default (unbounded).
DTR_1 = dtr(max_depth=None)

# 10-fold cross validation on the training split, scored by explained variance.
scores_dtr = cross_val_score(
    DTR_1,
    X_train,
    y_train,
    scoring="explained_variance",
    cv=10,
)
print("scores for k=10 fold validation:", scores_dtr)
print("Est. explained variance: %0.2f (+/- %0.2f)" % (scores_dtr.mean(), scores_dtr.std() * 2))

# ----- feature ranking and estimator sweep -----------------------------------
sorted_scores = Feature_Ranking(X_train, y_train)
estimators = list(range(10, 81, 10))

# Evaluate and plot using the top 15 ranked features.
mean_rfrs, std_rfrs_upper, std_rfrs_lower = getModel(
    X_train, y_train, sorted_scores, 15, estimators)
plotResults(mean_rfrs, std_rfrs_upper, std_rfrs_lower, 15, estimators)
def regression(self, metric, folds=10, alphas=None, printt=True, graph=False):
    """Cross-validate a fixed panel of regressors and store a ranked report.

    Every model is scored with ``cross_val_score`` on ``self.Xt_train`` and
    ``self.yt_train`` using a shuffled ``KFold``, and its wall-clock time is
    recorded. The report is stored on ``self.report_performance`` and the
    panel on ``self.models``.

    Args:
        metric: Scoring name forwarded to ``cross_val_score``.
        folds: Number of ``KFold`` splits.
        alphas: Candidate regularisation strengths for the Ridge CV model.
            When omitted or empty, the estimator's own default grid is used.
        printt: When true, print the report with a banner.
        graph: When true, draw a box plot of the per-fold scores and append
            the figure to ``self.graphs_model``.

    Returns:
        None.
    """
    # NOTE(review): scores are negated and then passed through np.sqrt, so a
    # negated squared-error scorer is presumably expected - confirm with callers.
    size = self.graph_width

    # A None default avoids sharing one mutable list object between calls.
    alphas = [] if alphas is None else alphas
    # An empty grid gives the CV search nothing to choose from, so fall back
    # to the estimator's built-in candidates in that case.
    ridge_cv = rcvr(alphas=alphas) if len(alphas) > 0 else rcvr()

    # Significantly different setups of one estimator are listed as separate models.
    models = {
        "Linear regressor": lr(),
        "Lasso regressor": lassor(),
        "Lasso CV regressor": lassocvr(),
        "Ridge regressor": rr(alpha=0, normalize=True),
        "Ridge CV regressor": ridge_cv,
        "Elastic net regressor": enr(),
        "K nearest neighbors regressor K2u": knnr(n_neighbors=2, weights='uniform'),
        "K nearest neighbors regressor K2d": knnr(n_neighbors=2, weights='distance'),
        "K nearest neighbors regressor K5": knnr(n_neighbors=5),
        "K nearest neighbors regressor K10": knnr(n_neighbors=10),
        "SGD regressor": sgdr(max_iter=10000, warm_start=True),
        "Decision tree regressor": dtr(),
        "Decision tree regressor D3": dtr(max_depth=3),
        "Random forest regressor": rfr(),
        "Ada boost regressor": abr(),
        "Gradient boost regressor": gbr(),
        "Support vector regressor RBF": svr(),
        # Keyword form, matching the 'poly' entry; it does not rely on the
        # kernel being accepted as a positional argument.
        "Support vector regressor Linear": svr(kernel='linear'),
        "Support vector regressor Poly": svr(kernel='poly'),
    }
    self.models = models

    kf = KFold(n_splits=folds, shuffle=True)
    results = []
    names = []
    et = []
    for model_name, model in models.items():
        start = time.time()
        cv_scores = -1 * cross_val_score(model, self.Xt_train, self.yt_train,
                                         cv=kf, scoring=metric)
        results.append(cv_scores)
        names.append(model_name)
        et.append(time.time() - start)

    report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
    report['Score (avg)'] = report.Score.apply(lambda x: np.sqrt(x).mean())
    report['Score (std)'] = report.Score.apply(lambda x: np.sqrt(x).std())
    # Coefficient of variation, in percent.
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True)
    report.drop('Score', axis=1, inplace=True)
    report.reset_index(inplace=True, drop=True)
    self.report_performance = report

    if printt:
        print('\n')
        print(self.report_width * '*', '\n*')
        print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        print(self.report_width * '*', '')
        print(report)
        print('\n')

    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Regressor Comparison')
        # Boxes follow insertion order, which matches `names`.
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0, bottom=0.25)
        self.graphs_model.append(fig)
        plt.show()
    return None
# Target column, then an 80/20 train/test split with a fixed seed.
litprop = literature_data['exponent']
X_train, X_test, y_train, y_test = train_test_split(
    litfeat, litprop, test_size=0.2, random_state=4)

# ----- linear regression: train and test -----
linreg = lr(normalize=True)
linreg.fit(X_train, y_train)
linreg_pred = linreg.predict(X_test)
# Despite its name this holds the mean squared error; the root is taken when printing.
linreg_rmse = mean_squared_error(y_test, linreg_pred)
print(f'linreg MAE: {sum(abs(linreg_pred - y_test)) / len(y_test)}')
print(f'linreg RMSE: {np.sqrt(linreg_rmse)}')

# ----- decision tree: train and test -----
dectree = dtr()
dectree.fit(X_train, y_train)
dectree_pred = dectree.predict(X_test)
dectree_rmse = mean_squared_error(y_test, dectree_pred)
print(f'dectree MAE: {sum(abs(dectree_pred - y_test)) / len(y_test)}')
print(f'dectree RMSE: {np.sqrt(dectree_rmse)}')

# ----- random forest: train and test -----
randomforestmodel = rfr()
randomforestmodel.fit(X_train, y_train)
rf_pred = randomforestmodel.predict(X_test)
rf_rmse = mean_squared_error(y_test, rf_pred)
print(f'rf MAE: {sum(abs(rf_pred - y_test)) / len(y_test)}')
print('\n')
print('--- start ---')
print('\n')

# Load the data, then pick the prediction target and the feature columns.
data = get_data(PATH)
X = data[FEATURES]
y = data.CO2

# Split into training and validation parts with a fixed seed.
train_X, val_X, train_y, val_y = tts(X, y, random_state=1)

print('validation MAEs')

# Decision tree
model_dt = dtr(random_state=1)
model_dt.fit(train_X, train_y)
get_mae(model_dt, val_X, val_y)

# Random forest
model_rf = rfr(random_state=1)
model_rf.fit(train_X, train_y)
get_mae(model_rf, val_X, val_y)

# XGBoost: many low-learning-rate rounds, with the validation pair supplied
# as eval_set and early stopping after 10 rounds.
model_xgb = xgb(n_estimators=10000, learning_rate=0.01, random_state=1)
model_xgb.fit(
    train_X,
    train_y,
    eval_set=[(val_X, val_y)],
    early_stopping_rounds=10,
    verbose=False,
)
# Splitting the dataset into the Training set and Test set (disabled)
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling (disabled)
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting the Decision Tree Regression to the dataset
from sklearn.tree import DecisionTreeRegressor as dtr
regressor = dtr(random_state=0)
regressor.fit(x, y)

# Predicting a new result. predict() takes a 2-D array of samples, so the
# single value is wrapped as one sample with one feature.
y_pred = regressor.predict([[6.5]])

# Visualising the Regression results (for higher resolution and smoother curve).
# np.min/np.max reduce x to scalars regardless of its shape, so arange gets
# plain numbers for its bounds.
x_grid = np.arange(np.min(x), np.max(x), 0.01)
x_grid = x_grid.reshape((len(x_grid), 1))
plt.scatter(x, y, color='red')
plt.plot(x_grid, regressor.predict(x_grid), color='blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
# Load the pre-saved feature matrix and target, then split with a fixed seed.
X = np.load('data/X_boston.npy')
y = np.load('data/y_boston.npy')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Panel of regressors to compare.
regressors = [
    lr(),
    bay(),
    rr(alpha=.5, random_state=0),
    l(alpha=0.1, random_state=0),
    ll(),
    knn(),
    ard(),
    rfr(random_state=0, n_estimators=100),
    SVR(gamma='scale', kernel='rbf'),
    rcv(fit_intercept=False),
    en(random_state=0),
    dtr(random_state=0),
    ada(random_state=0),
    gbr(random_state=0),
]

print('unscaled:', br)
for reg in regressors:
    reg.fit(X_train, y_train)
    # The second value returned by get_error is not used; the class name is
    # printed as the label instead.
    rmse, _ = get_error(reg, X_test, y_test)
    name = reg.__class__.__name__
    print(name + '(rmse):', end=' ')
    print(rmse)
print()

print('scaled:', br)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
# Reuse the statistics learned from the training split; re-fitting on the
# test split would scale the two sets differently.
X_test_std = scaler.transform(X_test)
def regression(self, folds=10, printt=True, graph=False):
    """Score a panel of regressors on random ``safra``-based hold-outs.

    For each model, ``folds`` rounds are run. In each round four values are
    sampled from ``range(number of unique safra values)``; rows whose
    ``safra`` is among them form the test set and the remaining rows the
    training set. The mean and standard deviation of ``model.score`` and the
    mean elapsed time are collected per model. The row with the highest
    average score is stored on ``self.reg`` as a list
    ``[model name, avg score, std, elapsed]``.

    Args:
        folds: Number of random hold-out rounds per model.
        printt: When true, print a summary of the best model.
        graph: When true, refit the best model on the split left over from
            the last round and scatter predictions against true values.
            ``self.pred`` and ``self.testy`` are set in this case.
    """
    # NOTE(review): the sampled numbers are positions 0..n-1, compared against
    # the safra values themselves - confirm safra is encoded that way.
    size = self.graph_width
    X = self.X
    y = self.y
    safra_range = list(range(len(X.safra.unique())))

    models = {
        "Linear regressor": lr(),
        "Lasso CV regressor": lassocvr(),
        "Ridge CV regressor": rcvr(),
        "K nearest neighbors regressor K2u": knnr(n_neighbors=2, weights='uniform'),
        "K nearest neighbors regressor K2d": knnr(n_neighbors=2, weights='distance'),
        "K nearest neighbors regressor K5": knnr(n_neighbors=5),
        "K nearest neighbors regressor K10": knnr(n_neighbors=10),
        "Decision tree regressor": dtr(),
        "Decision tree regressor D3": dtr(max_depth=3),
        "Random forest regressor": rfr(),
    }

    report = {"Model": [], "Score (avg)": [], "Score (std)": [], "Elapsed Time(s)": []}
    for model_name in models:
        fold_scores = []
        fold_times = []
        for _ in range(folds):
            held_out = random.sample(safra_range, 4)
            in_test = X.safra.isin(held_out)
            testX = X[in_test]
            testy = y[y.index.isin(testX.index)]
            trainX = X[~in_test]
            trainy = y[y.index.isin(trainX.index)]

            # The timing covers both fitting and scoring.
            started = time.time()
            model = models[model_name].fit(trainX, trainy)
            fold_scores.append(model.score(testX, testy))
            fold_times.append(time.time() - started)

        report["Model"].append(model_name)
        report["Score (avg)"].append(np.mean(fold_scores))
        report["Score (std)"].append(np.std(fold_scores))
        report["Elapsed Time(s)"].append(np.mean(fold_times))

    report = pd.DataFrame.from_dict(report)
    report.sort_values(by='Score (avg)', inplace=True)
    report.reset_index(inplace=True, drop=True)

    # After the ascending sort the last row has the highest average score.
    best = report.tail(1).values.tolist()[0]
    self.reg = best

    if printt:
        print('REGRESSION RESULTS')
        print(' Best regression method: ', best[0])
        print(' Average score(R2): ', best[1])
        print(' Standard Deviation: ', best[2])
        print(' Elapsed Time(s): ', best[3], '\n')

    if graph:
        # trainX/trainy/testX/testy still hold the split from the final round.
        model = models[best[0]].fit(trainX, trainy)
        self.pred = model.predict(testX)
        self.testy = testy
        fig, ax = plt.subplots()
        text = 'R2=' + str(np.round(best[1], 2))
        ax.scatter(testy, self.pred, color='g')
        ax.set_xlabel("True values")
        ax.set_ylabel("Predictions")
        ax.text(
            0.05,
            0.95,
            text,
            transform=ax.transAxes,
            verticalalignment='top',
            bbox={'boxstyle': 'square', 'facecolor': 'none', 'edgecolor': 'black'},
        )
        plt.show()
# Target column and the feature columns used for the model.
y = dt['PassengerId']
dtf = ['Survived', 'Pclass', 'Age', 'SibSp', 'Fare']
x = dt[dtf]

# Quick look at the selected features (only displayed in a notebook cell).
x.describe()
x.head()


# In[ ]:

# Fit a decision tree with a fixed seed.
from sklearn.tree import DecisionTreeRegressor as dtr

dtm = dtr(random_state=1)
dtm.fit(x, y)


# In[ ]:

# Mean absolute error of the predictions on the same rows used for fitting.
from sklearn.metrics import mean_absolute_error as mae

pdp = dtm.predict(x)
mae(y, pdp)


# In[ ]:
# Per-data result values: one independent list per metric.
meth, mse_m, rmse_m, mae_m, mdae_m, evs_m, r2_m = ([] for _ in range(7))

# Parameter values: the first entry of each parameter column.
k = list(param['SVR Kernel'])[0]
md = list(param['DTR Max Depth'])[0]
deg = list(param['PR Degree'])[0]

# Creating models
mlr = lm.LinearRegression()
svr = SVR(kernel=k, epsilon=0.1, C=1)
dt = dtr(max_depth=md)
poly = pf(degree=deg)
pr = lm.LinearRegression()

c = 0
# Repeated K Fold Cross Validation
for tr_i, ts_i in rkf.split(data):
    print(i, c)
    train, test = data.iloc[tr_i], data.iloc[ts_i]

    # Everything except the identifier columns and the target is a feature.
    train_x = train.drop(columns=['Index', 'District', 'Rainfall'])
    train_y = train['Rainfall']
    test_x = test.drop(columns=['Index', 'District', 'Rainfall'])
    test_y = test['Rainfall']

    # Fit the polynomial expansion on the training fold only and reuse it
    # for the test fold instead of re-fitting on test data.
    poly_tr = poly.fit_transform(train_x)
    poly_ts = poly.transform(test_x)

    # Fitting the data in the model
    mlr.fit(train_x, train_y)
scoring='neg_mean_absolute_error') grid_result = gridsearch.fit(X_train, y_train) grid_pred = gridsearch.predict(X_test) grid_rmse = mean_squared_error(y_test, grid_pred) print('Ridge MAE: ' + str(sum(abs(grid_pred - y_test))/(len(y_test)))) print('Ridge RMSE: ' + str(np.sqrt(grid_rmse))) print(grid_result.best_params_) print(abs(grid_result.best_score_)) #-------------------------DECISION TREE GRIDSEARCH---------------------------------------- """ the best values for each parameter came out as: 'max_depth'=12, 'min_samples_leaf'=1, and 'min_samples_split'=2 using Dataset 3 """ gridsearch = GridSearchCV(estimator=dtr(random_state=4), cv=5, param_grid={ 'max_depth':[10,20,30,40,50], 'min_samples_split':[2,3,4,5], 'min_samples_leaf':[1,2,3,4,5] }, scoring='neg_mean_absolute_error') grid_result = gridsearch.fit(X_train, y_train) grid_pred = gridsearch.predict(X_test) grid_rmse = mean_squared_error(y_test, grid_pred) print('Decision Tree MAE: ' + str(sum(abs(grid_pred - y_test))/(len(y_test)))) print('Decision Tree RMSE: ' + str(np.sqrt(grid_rmse))) print(grid_result.best_params_) print(abs(grid_result.best_score_))
# Split the data into training and test sets.
# The splitter returns its outputs in the order of its inputs (assuming `tts`
# is scikit-learn's train_test_split), so the features go first: with
# (y, X) the *_X names would receive the target and the *_y names the features.
train_X, test_X, train_y, test_y = tts(
    X, y, train_size=0.33, test_size=0.33, random_state=42)
print(len(train_X))
print(len(train_y))
print(len(test_X))
print(len(test_y))


# In[ ]:

# Fit the tree on the training split and score it on the test split.
model = dtr()
model.fit(train_X, train_y)
a = model.score(test_X, test_y)
print('Score with model', a)

# NOTE(review): `cs` is presumably cross_val_score, in which case these are
# per-fold scores rather than errors - confirm before relying on the label.
z = cs(model, test_X, test_y)
print('This is error in list', z)


# In[ ]:

# Predict on the test split.
prediction = model.predict(test_X)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Integer-encode feature columns 1, 3, 4 and 5 in place, reusing one encoder
# that is re-fitted for each column.
labelencoder = LabelEncoder()
for column in (1, 3, 4, 5):
    features[:, column] = labelencoder.fit_transform(features[:, column])

# One-hot encode those same columns.
onehotencoder = OneHotEncoder(categorical_features=[1, 3, 4, 5])
features = onehotencoder.fit_transform(features).toarray()

# Encode the label column the same way: integers first, then one-hot.
labels[:, 0] = labelencoder.fit_transform(labels[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
labels = onehotencoder.fit_transform(labels).toarray()

# Fit the tree on the encoded data.
from sklearn.tree import DecisionTreeRegressor as dtr

prog = dtr(random_state=0)
prog.fit(features, labels)

# Predict for a single hand-written sample, reshaped to one row.
a = np.array([0, 1, 1, 0, 0, 0, 1, 0, 1, 10, 4]).reshape(1, -1)
pred = prog.predict(a)

# NOTE(review): the plotting code below builds a one-column grid from
# min/max of `features` and predicts on it; this looks written for a
# single-feature model - confirm it works with the encoded matrix.
features_grid = np.arange(min(features), max(features), 0.01)
features_grid = features_grid.reshape((len(features_grid)), 1)
plt.scatter(features, labels, color='red')
plt.plot(features_grid, prog.predict(features_grid), color='blue')
plt.title('Hire or Not Hire(Decision type Regression)')
plt.xlabel('Year of experience')
plt.ylabel('Hire')
plt.show()
rmse_d = [] mae_d = [] mdae_d = [] evs_d = [] r2_d = [] c = 0 #Repeated K Fold Cross Validation for tr_i, ts_i in rkf.split(data): train, test = data.iloc[tr_i], data.iloc[ts_i] train_x = train.drop(columns=['District', 'Index', 'Rainfall']) train_y = train['Rainfall'] test_x = test.drop(columns=['District', 'Index', 'Rainfall']) test_y = test['Rainfall'] for j in dep: print(i, c, j) dt = dtr(max_depth=j) dt.fit(train_x, train_y) dt_p = dt.predict(test_x) #Error values d.append(j) mse_d.append(mse(test_y, dt_p)) rmse_d.append(rmse(test_y, dt_p)) mae_d.append(mae(test_y, dt_p)) mdae_d.append(mdae(test_y, dt_p)) evs_d.append(evs(test_y, dt_p)) r2_d.append(r2(test_y, dt_p)) c += 1 t = {} t['Depth'] = d t['MSE'] = mse_d t['RMSE'] = rmse_d