def __fit_regressors(self):
    print('Fitting regressors...')
    # One factory per supported regressor; a fresh estimator is built for every
    # cluster. (normalize=True and loss='ls' are kept from the original code;
    # both were removed in recent scikit-learn releases.)
    factories = {
        'knn': lambda: KNeighborsRegressor(n_neighbors=5, weights='distance'),
        'lr': lambda: LinearRegression(normalize=True),
        'adaboost': lambda: AdaBoostRegressor(
            n_estimators=100, learning_rate=0.2, loss='exponential'),
        'gradientboosting': lambda: GradientBoostingRegressor(
            n_estimators=400, learning_rate=0.1, loss='ls',
            max_depth=5, min_samples_split=2),
        'randomforest': lambda: RandomForestRegressor(n_estimators=400),
        'decisiontree': lambda: DecisionTreeRegressor(),
        'xgboost': lambda: xgboost.XGBRegressor(
            n_estimators=900, learning_rate=0.05, max_depth=5),
        # 'pr2'/'pr3' are plain linear models fitted on pre-built polynomial
        # feature matrices; TODO (from the original code): create and fit the
        # poly features in this same function.
        'pr2': lambda: LinearRegression(normalize=True),
        'pr3': lambda: LinearRegression(normalize=True),
    }
    for method in self.cluster_methods:
        clusters = self.__get_cluster_labels(method)
        model = self.models[method]
        for label in clusters:
            # Train a separate model of each requested type for every cluster
            for regressor in self.regressors:
                if regressor not in factories:
                    continue
                entry = model[label].setdefault(regressor, {})
                entry['model'] = factories[regressor]()
                # The polynomial regressors carry their own design matrix
                X_train = (entry['X_train'] if regressor in ('pr2', 'pr3')
                           else model[label]['X_train'])
                entry['model'].fit(X_train, model[label]['Y_train']['price'])
def xgboost(booster='gblinear', use_log=True, scale=True, use_dum=False):
    '''Runs an XGBoost regression.
    --------------------------------
    parameters:
    - booster: one of ['gblinear', 'gbtree', 'dart'], the XGBoost booster.
    - use_log: if True, the target will be log(SalePrice) of houses.
    - scale: if True, features will be standardized.
    - use_dum: if True, categorical features will be dummified; otherwise each
      level is replaced by the mean target value for that level.
    '''
    # preparing the data
    if use_dum:
        data = pd.read_csv('../derivedData/train_cleaned.csv', index_col='Id')
    else:
        data = pd.read_csv('../derivedData/train_NotDum.csv', index_col='Id')
    data['logSalePrice'] = np.log(data['SalePrice'])
    if not use_dum:
        cols_to_enc = data.columns[data.dtypes == 'object']
        for col in cols_to_enc:
            if use_log:
                gp = data.groupby(col)['logSalePrice'].mean()
            else:
                gp = data.groupby(col)['SalePrice'].mean()
            data[col] = data[col].apply(lambda x: gp[x])
    X = data.drop(['SalePrice', 'logSalePrice'], axis=1)
    y = data['logSalePrice'] if use_log else data['SalePrice']
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=42)
    if scale:
        ss = StandardScaler()
        ss.fit(X_train)
        X_train = pd.DataFrame(ss.transform(X_train))
        X_test = pd.DataFrame(ss.transform(X_test))
    if booster == 'gblinear':
        xgb_param = {'alpha': [0],
                     'lambda': np.linspace(0, .2, 200)}
    elif booster == 'gbtree':
        xgb_param = {'max_depth': [2, 3],
                     'min_child_weight': np.linspace(5, 15, 20),
                     'lambda': np.linspace(1, 10, 20),
                     'alpha': [0]}
    elif booster == 'dart':
        xgb_param = {'max_depth': [2],
                     'min_child_weight': np.linspace(10, 15, 6),
                     'lambda': np.linspace(0, 2, 4),
                     'alpha': [0],
                     'sample_type': ['uniform'],
                     'normalize_type': ['tree'],
                     'rate_drop': np.linspace(.5, 1, 8),
                     'skip_drop': np.linspace(.5, 1, 10)}
    xgboost = xgb.XGBRegressor(booster=booster)
    grid_search_xgb = GridSearchCV(xgboost, xgb_param, cv=4)
    grid_search_xgb.fit(X_train, y_train)
    return grid_search_xgb
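# A minimal usage sketch for the helper above, assuming the derivedData CSVs
# referenced in its defaults exist; it grid-searches the linear booster and
# prints the best cross-validated setting found.
search = xgboost(booster='gblinear', use_log=True, scale=True, use_dum=False)
print(search.best_params_)
print(search.best_score_)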
# In[42]:

y_pred = log_model.predict(val_features)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(
    log_model.score(val_features, val_labels)))
print(classification_report(val_labels, y_pred))

# Achieved a score of 0.756 on Kaggle; let's try an XGBoost model now!

# In[43]:

# Building the XGBoost model. XGBRegressor with the 'binary:logistic' objective
# outputs probabilities, which are thresholded further below.
xg_reg = xgb.XGBRegressor(objective='binary:logistic',
                          colsample_bytree=0.2,
                          learning_rate=0.1,
                          max_depth=7,
                          alpha=10,
                          n_estimators=20,
                          scale_pos_weight=20)
xg_reg.fit(train_features, train_labels)

# In[44]:

preds = xg_reg.predict(val_features)

# In[45]:

preds = np.where(preds > 0.6, 1, 0)
print(classification_report(val_labels, preds))

# In[46]:
                                                    test_size=0.3,
                                                    random_state=0)
svc_model_poly = svm.SVC(kernel='poly', degree=5)
svc_model_poly.fit(X1_train, y_train)
predictions_poly = svc_model_poly.predict(X1_test)
print("nbr of features: ", 2200,
      " PCA accuracy with POLY SVM " +
      str(100 * accuracy_score(y_test, predictions_poly)) + '%')

# 7.2 Random Forest
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(X1_train, y_train)
y_pred = rf.predict(X1_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred.round()) * 100)
classification_report(y_test, y_pred.round())

# 7.3 XGBOOST
data_dmatrix = xg.DMatrix(data=X1, label=y)
xg_reg = xg.XGBRegressor(objective='binary:logistic',
                         colsample_bytree=0.3,
                         learning_rate=0.1,
                         max_depth=20,
                         alpha=150,
                         n_estimators=1000)
xg_reg.fit(X1_train, y_train)
preds = xg_reg.predict(X1_test)
print("Accuracy:", metrics.accuracy_score(y_test, preds.round()) * 100)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
                                                    y_wind,
                                                    test_size=0.3,
                                                    random_state=42)

# data transformation (scaling)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# creation of regressor model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                             colsample_bytree=0.3,
                             learning_rate=0.1,
                             max_depth=5,
                             alpha=10,
                             n_estimators=10)

# fitting model
xgb_model.fit(X_train, y_train)

# predicting
y_predicted_w = xgb_model.predict(X_test)

# accuracy determination of the XGBoost regression (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_predicted_w))

pickle.dump(xgb_model, open('model_w.pkl', 'wb'))
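# A small companion sketch (assuming the 'model_w.pkl' file written above):
# the pickled regressor can be reloaded elsewhere, e.g. in serving code, and
# applied to feature rows scaled with the same StandardScaler as at training.
import pickle

with open('model_w.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
print(loaded_model.predict(X_test[:5]))  # X_test is already scaled above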
x_train = df_churn[var_select].apply(pd.to_numeric, errors='coerce')
y_train = df_churn['churn_flag']
x_train, x_eval_xgb, y_train, y_eval_xgb = train_test_split(x_train,
                                                            y_train,
                                                            test_size=0.2,
                                                            random_state=42)

# Split training data again into training and test set
x_train, x_test, y_train, y_test = train_test_split(x_train,
                                                    y_train,
                                                    test_size=0.2,
                                                    random_state=42)

# Specify the kind of model to develop
xgb_reg = xgb.XGBRegressor(objective='binary:logistic',
                           colsample_bytree=0.3,
                           learning_rate=0.1,
                           max_depth=5,
                           n_estimators=350)
xgb_reg.fit(x_train, y_train)
preds = xgb_reg.predict(x_test)

# Check accuracy of predictions
accuracy_xgb = confusion_matrix(y_test.values, np.round(preds))
print(sum(np.diagonal(accuracy_xgb)) / sum(sum(accuracy_xgb)))
print(accuracy_xgb)

# Obtain the most important variables
scores_xgb = xgb_reg.get_booster().get_score(importance_type='gain')
most_imp_xgb = pd.DataFrame({
    'feature': list(scores_xgb.keys()),
def xgb_reg(self, para):
    reg = xgb.XGBRegressor(**para['reg_params'])
    return self.train_reg(reg, para)
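# A hedged sketch of the parameter dictionary this wrapper expects. The exact
# keys beyond 'reg_params' depend on how train_reg is implemented, so the
# 'fit_params' entry below is only an illustrative assumption.
example_para = {
    'reg_params': {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.05},
    # hypothetical extra settings consumed by self.train_reg
    'fit_params': {'eval_metric': 'rmse', 'verbose': False},
}
# obj.xgb_reg(example_para) would then build an XGBRegressor from 'reg_params'
# and delegate the actual training to obj.train_reg.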
X_test = all_data[train.shape[1]:]
y = train.SalePrice

# Import XGBoost
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y)
dtest = xgb.DMatrix(X_test)

params = {"max_depth": 2, "eta": 0.1}
model = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100)
model.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot()

model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)
# the params were tuned using xgb.cv
model_xgb.fit(X_train, y)

xgb_preds = np.expm1(model_xgb.predict(X_test))
lasso_preds = np.expm1(model_lasso.predict(X_test))

predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds})
predictions.plot(x="xgb", y="lasso", kind="scatter")

preds = 0.7 * lasso_preds + 0.3 * xgb_preds
solution = pd.DataFrame({"id": test.Id, "SalePrice": preds})
solution_csv = solution.to_csv("ridge_sol.csv", index=False)
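# The 0.7/0.3 blend above is fixed by hand. A minimal sketch of how one might
# pick the weight on a held-out split instead (X_hold, y_hold are assumed to be
# a validation set in the same log-price space used for training):
import numpy as np
from sklearn.metrics import mean_squared_error

lasso_hold = model_lasso.predict(X_hold)
xgb_hold = model_xgb.predict(X_hold)
best_w, best_rmse = None, np.inf
for w in np.linspace(0, 1, 21):
    blend = w * lasso_hold + (1 - w) * xgb_hold
    rmse = np.sqrt(mean_squared_error(y_hold, blend))
    if rmse < best_rmse:
        best_w, best_rmse = w, rmse
print(best_w, best_rmse)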
def fit(self, train_data):
    # Define the base models
    self.base_models_ = [list() for x in self.base_models]
    # Define the meta model
    self.meta_model_ = clone(self.meta_model)
    shape_ = [train_data[train_data[self.fe_] == d].shape[0] for d in self._slip]
    y_true = np.array([])
    for d in self._slip:
        y_true = np.hstack((y_true,
                            train_data.loc[train_data.day == d, self.target_].values))
    index = []
    for k, sh in enumerate(shape_):
        if k == 0:
            index.append(list(range(sh)))
        else:
            index.append(list(range(index[-1][-1], index[-1][-1] + shape_[k])))
    # Set the feature size used for the meta model
    oof_pred = np.zeros((sum(shape_), len(self.base_models)))
    # Train the base models
    for i, model_name in enumerate(self.base_models):
        for j, date in enumerate(self._slip):
            # Build the training and validation sets
            train = train_data[train_data[self.fe_] < date]
            valid = train_data[train_data[self.fe_] == date]
            X_train = train[self.features].values
            X_eval = valid[self.features].values
            y_train = train[self.target_].values
            y_eval = valid[self.target_].values
            if model_name == 'lgb':
                lgb_train = lgb.Dataset(X_train, y_train)
                lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)
                print("Start training {}_{}".format(i, j))
                model = lgb.train(lgb_params,
                                  lgb_train,
                                  num_boost_round=10000,
                                  valid_sets=[lgb_train, lgb_eval],
                                  valid_names=['train', 'valid'],
                                  early_stopping_rounds=200,
                                  verbose_eval=1000)
                y_pred = model.predict(X_eval)
                print("Finished this training round!")
            if model_name == 'cat':
                cat_train = Pool(X_train, y_train)
                cat_eval = Pool(X_eval, y_eval)
                print("Start training {}_{}".format(i, j))
                model = catboost.train(pool=cat_train,
                                       params=cat_params,
                                       eval_set=cat_eval,
                                       num_boost_round=50000,
                                       verbose_eval=5000,
                                       early_stopping_rounds=200)
                y_pred = model.predict(X_eval)
                print("Finished this training round!")
            if model_name == 'xgb':
                print("Start training {}_{}".format(i, j))
                model = xgb.XGBRegressor(**xgb_params)
                # print(X_train.shape)
                model.fit(X_train, y_train,
                          eval_set=[(X_eval, y_eval)],
                          early_stopping_rounds=400,
                          verbose=1000)
                y_pred = model.predict(X_eval)
                print("Finished this training round!")
            self.base_models_[i].append(model)
            oof_pred[index[j], i] = y_pred
    self.meta_model_.fit(oof_pred, y_true)
    return self
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                     test_size=0.25,
                                                     random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

import xgboost as xgb
regressor = xgb.XGBRegressor(colsample_bytree=0.2,
                             gamma=0.0,
                             learning_rate=0.01,
                             max_depth=4,
                             min_child_weight=1.5,
                             n_estimators=7200,
                             reg_alpha=0.9,
                             reg_lambda=0.6,
                             subsample=0.2,
                             seed=42,
                             silent=1)
regressor.fit(x_train, y_train)
y_pred_xgb = regressor.predict(x_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
# (variable name kept from a KNN template; these are the XGBoost predictions)
cm_KNN = confusion_matrix(y_test, y_pred_xgb)
#final_train_matrix = train.drop(['user_id', 'day_of_week', 'record_date'], axis=1).as_matrix()
final_train_matrix = np.row_stack(
    (train1_matrix, train2_matrix, train3_matrix, train4_matrix, train5_matrix,
     train6_matrix, train7_matrix, train8_matrix, train9_matrix, train10_matrix))
train_X = final_train_matrix[:, :-1]
train_Y = final_train_matrix[:, -1]

print("make test dataset")
final_test_matrix = final_test.drop(['user_id', 'day_of_week', 'record_date'],
                                    axis=1).values
test_matrix_X = final_test_matrix[:, :-1]
test_matrix_Y = final_test_matrix[:, -1]

print("hyper-parameter optimization...................")
xgb_model = xgb.XGBRegressor()
params = {
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.05, 0.1, 0.15],
    'n_estimators': [50, 100, 150, 200],
    'max_delta_step': [1],
    'objective': ['reg:linear', 'reg:gamma', 'reg:tweedie'],
}
# Other candidates left out of the grid:
# 'colsample_bytree':[1], 'colsample_bylevel':[1], 'reg_alpha':[0], 'reg_lambda':[1],
# 'scale_pos_weight':[1], 'base_score':[0.5], 'seed':[0], 'missing':[None],
# 'nthread':[-1], 'gamma':[0], 'min_child_weight':[1], 'subsample':[0.5,0.8,1]
gridsearchcvRegression = GridSearchCV(xgb_model, params, iid=True,
y2 = data.iloc[:, 9:10]
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1,
                                                         test_size=0.3,
                                                         train_size=0.7,
                                                         random_state=0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2,
                                                         test_size=0.3,
                                                         train_size=0.7,
                                                         random_state=0)

# y1 output
regr_1 = xgboost.XGBRegressor(n_estimators=100, max_depth=5)
regr_1.fit(X1_train, y1_train.values.ravel())

# predict
y1_test_pred = regr_1.predict(X1_test)
y1_train_pred = regr_1.predict(X1_train)

error_y1_test = ((abs(y1_test['Y1'].values - y1_test_pred)) * 100) / y1_test['Y1'].values
error_y1_train = ((abs(y1_train['Y1'].values - y1_train_pred)) * 100) / y1_train['Y1'].values
error_y1_test_mean = np.mean(error_y1_test)
error_y1_train_mean = np.mean(error_y1_train)
def build_xgb_regr(features, labels):
    return xgb.XGBRegressor().fit(features, labels)
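# A quick usage sketch for the one-line builder above, on synthetic data
# (the numpy arrays here are an assumption; any array-like features/labels work).
import numpy as np

rng = np.random.default_rng(0)
features = rng.normal(size=(200, 4))
labels = features @ np.array([1.0, -2.0, 0.5, 0.0]) + rng.normal(scale=0.1, size=200)
model = build_xgb_regr(features, labels)
print(model.predict(features[:3]))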
print("Minimum validation MSE:", min_val_error) # 0.002712853325235463 is the same as the model above break # early stopping # XGBoost # not shown in the book if False: # cannot run this code for dll file problem. try: import xgboost print('importing XGBoost') except ImportError as ex: print("Error: the xgboost library is not installed.") xgboost = None if xgboost is not None: xgb_reg = xgboost.XGBRegressor(random_state=42) xgb_reg.fit(X_train, y_train) y_pred = xgb_reg.predict(X_val) val_error = mean_squared_error(y_val, y_pred) print("Validation MSE:", val_error) if xgboost is not None: # not shown in the book xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=2) y_pred = xgb_reg.predict(X_val) val_error = mean_squared_error(y_val, y_pred) print("Validation MSE:", val_error) # Stacking # data set
X_train, X_eval, Y_train, Y_eval = train_test_split(X, Y, test_size=.1,
                                                    random_state=5)
X_test = df_test.loc[:, feature_names].values
X_train = pd.DataFrame(X_train, columns=feature_names)
X_eval = pd.DataFrame(X_eval, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)

#===feature extraction===
date = str(pd.to_datetime(ctime()).date())
bst = xgb.XGBRegressor(max_depth=10, booster='gbtree',
                       learning_rate=.1, n_estimators=5000,
                       subsample=.9, colsample_bytree=.9,
                       reg_lambda=10, silent=False)

if modelFitFlg == 1:
    print(ctime() + '...training model...')
    bst.fit(X=X_train, y=Y_train,
            eval_set=[(X_eval, Y_eval)],
            eval_metric=['rmse'], early_stopping_rounds=10)
    joblib.dump(bst, join(fittedModelDir,
                          'model6_nonCV_{}{}'.format(date, '.pkl')))

#===make prediction for test set===
fittedMdlPath = '/home/arash/MEGA/MEGAsync/Machine Learning/' + \
    # NOTE: this first 'n_estimators' entry is dead; it is overridden by the
    # 'n_estimators' key re-declared at the end of this dict.
    'n_estimators': [1000],
    'early_stopping_rounds': [10],
    'booster': ['gbtree'],
    'verbosity': [1],
    'subsample': list(np.linspace(0.25, 1, 4)),
    #'learning_rate': [0.0001, 0.001, 0.01, 0.1],
    #'eval_set': [[(X_test, y_test)]],
    'gamma': [0, 0.0001, 0.001, 0.01, 0.1],
    'eval_metric': ['rmse'],
    'verbose': [True],
    'silent': [False],
    'min_child_weight': list(np.arange(1, X_test.shape[1], 5)),
    'n_estimators': [10, 100, 200, 300, 1000]
}]

xg_reg = xgb.XGBRegressor()

# Applying Grid Search to find the best model and the best parameters
from hypopt import GridSearch
from sklearn.model_selection import GridSearchCV

grid_search = GridSearch(model=xg_reg, param_grid=parameters)
grid_search = grid_search.fit(X_train, y_train,
                              X_val=X_test, y_val=y_test,
                              scoring='neg_mean_squared_error')
best_parameters = grid_search.get_params()
#best_mse = (-grid_search.best_score_)**(1/2)
#best_parameters = grid_search.best_params_
def xgb_reg(n_estimators=100, max_depth=6, learning_rate=0.05, k=5, train_data_path='../data/training_data.csv', save_model=False, tracking_uri="http://0.0.0.0:5000"): # Log the parameters with mlflow mlflow.log_param("n_estimators", n_estimators) mlflow.log_param("max_depth", max_depth) mlflow.log_param("learning_rate", learning_rate) mlflow.set_tag("k", k) # Set random seed for reproducibility np.random.seed(RANDOM_SEED) random.seed(RANDOM_SEED) # Get data shuffled and split into training and test sets mdr = MiningDataReader(path=train_data_path) (variable_names, X_train, X_test, y_train, y_test) = mdr.get_splitted_data() pipeline = Pipeline(steps=[('scaling', StandardScaler()), ('regression', xgb.XGBRegressor(objective="reg:squarederror", seed=RANDOM_SEED))]) ### TRAINING ### ################ # Generate grid search for hyperparam tuning hyperparams = {} hyperparams['regression__n_estimators'] = np.arange( n_estimators[0], n_estimators[1], n_estimators[2]) hyperparams['regression__max_depth'] = np.arange(max_depth[0], max_depth[1], max_depth[2]) hyperparams['regression__learning_rate'] = learning_rate print("Training started...\n") # Create an instance of Random Forest Regressor and fit the data for the grid parameters using all processors modelCV = GridSearchCV(estimator=pipeline, param_grid=hyperparams, cv=k, scoring='neg_mean_squared_error', n_jobs=-1) with ProgressBar(): modelCV.fit(X_train, y_train) # Iterate over the results storing training error for each hyperparameter combination results = modelCV.cv_results_ param_list, training_err_list, training_dev_list = [], [], [] for i in range(len(results['params'])): param = results['params'][i] score = (-1) * results['mean_test_score'][i] # NEGATIVE MSE std = results['std_test_score'][i] param_list.append(param) training_err_list.append(score) training_dev_list.append(std) best_params = modelCV.best_params_ print(f"\nBest parameter set found for the training set:\n{best_params}") # Store the index of the best combination best_index = param_list.index(best_params) # Get the best values for hyperparams best_n_estimators = best_params['regression__n_estimators'] best_max_depth = best_params['regression__max_depth'] best_learning_rate = best_params['regression__learning_rate'] print("\nTraining finished. 
Evaluating model...\n") ### EVALUATION ### ################## # Criteria is C criteria = 'n_estimators' mlflow.set_tag("criteria", criteria) param_values = hyperparams['regression__n_estimators'] # Predict test data variying criteria param and evaluate the models training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], [] rmse_score, mae_score, r2_score = -1, -1, -1 feature_names, feature_importances = [], [] for param_value in tqdm(param_values): model = Pipeline( steps=[('scaler', StandardScaler()), ('regression', xgb.XGBRegressor(objective="reg:squarederror", n_estimators=param_value, max_depth=best_max_depth, learning_rate=best_learning_rate))]) param = { 'regression__n_estimators': param_value, 'regression__max_depth': best_max_depth, 'regression__learning_rate': best_learning_rate } # Fit model and evaluate results model.fit(X_train, y_train) prediction = model.predict(X_test) index = param_list.index(param) training_err = training_err_list[index] training_dev = training_dev_list[index] (training_mse, test_mse, rmse, mae, r2) = get_test_metrics(training_err, y_test, prediction) # Store metrics training_err_by_criteria.append(training_mse) training_dev_by_criteria.append(training_dev) test_err_list.append(test_mse) # Set aditional metrics for the best combination if index == best_index: rmse_score = rmse mae_score = mae r2_score = r2 # Generate the plots empty_img_folder() plot_errors(criteria, param_values, training_err_by_criteria, training_dev_by_criteria, test_err_list) # Once hyperparameters are selected, train and save the best model if save_model: print( "\nEvaluation finished. Training final model with train + test data with the best hyperparameters..." ) final_model = Pipeline( steps=[('scaler', StandardScaler()), ('regression', xgb.XGBRegressor(objective="reg:squarederror", n_estimators=best_n_estimators, max_depth=best_max_depth, learning_rate=best_learning_rate))]) # Train the best model with all the data (training + test) full_X = np.vstack((X_train, X_test)) full_y = np.concatenate((y_train, y_test)) final_model.fit(full_X, full_y) # Plot importances and final tree ax = xgb.plot_importance(final_model.named_steps['regression']) fig = ax.figure fig.savefig('./img/importances.png', bbox_inches='tight') plt.close(fig) ax = xgb.plot_tree(final_model.named_steps['regression'], rankdir='LR') fig = ax.figure fig.set_size_inches(30, 15) fig.savefig('./img/tree.png', dpi=400, bbox_inches='tight') plt.close(fig) # Log plots and model with mlflow mlflow.log_artifacts('./img') mlflow.sklearn.log_model(final_model, 'model') # Log results with mlflow mlflow.log_metric("train_mse", training_err_list[best_index]) mlflow.log_metric("test_mse", min(test_err_list)) mlflow.log_metric("rmse", rmse_score) mlflow.log_metric("mae", mae_score) mlflow.log_metric("r2", r2_score) mlflow.set_tag("best_params", param_list[best_index]) # Output the results print(f''' ----------------------------------------------------------------------------------------------------------------------- RESULTS ----------------------------------------------------------------------------------------------------------------------- Best params: {param_list[best_index]} Training MSE: {training_err_list[best_index]} Test MSE: {min(test_err_list)} RMSE: {rmse_score} MAE: {mae_score} R2: {r2_score} ----------------------------------------------------------------------------------------------------------------------- ''')
def return_weights_from_xgboost( geodataframe, raster_path, pop_string, codes=[21, 22, 23, 24], n_pixels_option_values=256, tuned_xgb=False, gbm_hyperparam_grid={ "learning_rate": [0.001, 0.01, 0.1], "n_estimators": [200], "subsample": [0.3, 0.5], "max_depth": [4, 5, 6], "num_boosting_rounds": [10, 20], }, force_crs_match=True, na_value=255, ReLU=True, ): """Function that returns the weights of each land type according to NLCD types/codes given by Extreme Gradient Boost model (XGBoost) Parameters ---------- geodataframe : a geopandas geoDataFrame used to build regression raster_path : the path to the associated raster image. pop_string : the name of the variable on geodataframe that the regression shall be conducted codes : an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD). The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity). n_pixels_option_values : number of options of the pixel values of rasterior. Default is 256. tuned_xgb : bool. Default is False. If True the XGBoost model will be tuned making a grid search using gbm_hyperparam_grid dictionary a picking the best model in terms of mean squared error with some pre-defined number of cross-validation. Otherwise, the XGBoost model is fitted with default values of xgboost.train function from xgboost Python library. gbm_hyperparam_grid : a dictionary that represent the grid for the grid search of XGBoost. force_crs_match : bool. Default is True. Wheter the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file. It is recommended to let this argument as True. na_value : int. Default is 255. The number which is considered to be 'Not a Number' (NaN) in the raster pixel values. ReLU : bool. Default is True. Wheter the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types. Notes ----- 1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function. 1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function. 2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256. 3) The returning weights represent the average of the Shapley's values from each feature. """ try: import xgboost as xgb import shap except ImportError as e: raise ImportError("xgboost and shap are required to perform this.") _check_presence_of_crs(geodataframe) if na_value in codes: raise ValueError("codes should not assume the na_value value.") profiled_df = fast_append_profile_in_gdf( geodataframe[["geometry", pop_string]], raster_path, force_crs_match ) # Use only two columns to build the weights (this avoids error, if the original dataset has already types appended on it). 
# If the list is unsorted, the codes will be sorted to guarantee that the position of the weights will match codes.sort() str_codes = [str(i) for i in codes] feature_names = ["Type_" + s for s in str_codes] y = profiled_df[pop_string] X = profiled_df[feature_names] if tuned_xgb == False: # Create the DMatrix xgb_dmatrix = xgb.DMatrix(X, y) # Create the parameter dictionary params = {"objective": "reg:linear"} # Train the model xg_reg = xgb.train(params=params, dtrain=xgb_dmatrix) if tuned_xgb == True: try: from sklearn.model_selection import GridSearchCV except ImportError as e: raise ImportError("sklearn is required to perform this.") gbm = xgb.XGBRegressor() grid_mse = GridSearchCV( estimator=gbm, param_grid=gbm_hyperparam_grid, scoring="neg_mean_squared_error", cv=4, # 4-fold crossvalidation verbose=3, # Prints the grid search profile n_jobs=-1, ) # Process the GridSearch in parallel all cores availables # Fit the grid to the data grid_mse.fit(X, y) best_params = grid_mse.best_params_ best_params["objective"] = "reg:linear" # Create the DMatrix xgb_dmatrix = xgb.DMatrix(X, y) # Train the model from the best parameters of the grid search xg_reg = xgb.train(params=best_params, dtrain=xgb_dmatrix) # Build explainer and fit Shapley's values (https://github.com/slundberg/shap) explainer = shap.TreeExplainer(xg_reg, feature_dependence="independent") shap_values = explainer.shap_values(X) weights_from_xgb = shap_values.mean( axis=0) # This is already sorted by pixel Type weights = np.zeros(n_pixels_option_values) weights[codes] = list(weights_from_xgb) # Convert to list a dict_values if ReLU: weights = np.where(weights < 0, 0, weights) return weights
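# A hedged usage sketch for return_weights_from_xgboost; the file paths and the
# 'total_pop' column name below are purely illustrative assumptions.
import geopandas as gpd

tracts = gpd.read_file('tracts.shp')          # hypothetical polygon layer
weights = return_weights_from_xgboost(
    geodataframe=tracts,
    raster_path='nlcd_2016.tif',              # hypothetical NLCD raster
    pop_string='total_pop',                   # hypothetical population column
    tuned_xgb=False,
)
# weights is a length-256 array; only the entries at the requested NLCD codes
# (21-24 by default) are non-zero after the ReLU clipping.
print(weights[[21, 22, 23, 24]])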
from ay_hw_4._global import ROOT_PATH, CRIME
from ay_hw_4.util_data import load_data, train_test_split_by_size

if __name__ == "__main__":
    warnings.simplefilter(action='ignore', category=FutureWarning)
    X_data, y_data = load_data(ROOT_PATH + CRIME, skip_first_column=5,
                               y_column_index=-1, needImpute=True)
    X_train, X_test, y_train, y_test = train_test_split_by_size(
        X_data, y_data, train_size=1495, random_state=2333)
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_test = xgb.DMatrix(X_test, label=y_test)
    xgb_clf = xgb.XGBRegressor(n_estimators=100, max_depth=6,
                               objective="reg:squarederror", silent=False)
    parameters = {'reg_alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.1]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters,
                               cv=10, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print("Best parameters alpha :", grid_search.best_params_)
    xgb.plot_tree(grid_search.best_estimator_, num_trees=1)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(24, 6)
    matplotlib.pyplot.show()
def regression_fit_and_predict( X_train, y_train, X_val, y_val, X_test, num_prev_frames = CONFIG.NUM_PREV_FRAMES ): """ fit regression models """ if CONFIG.REGRESSION_MODEL == 'LR': model = linear_model.LinearRegression() model.fit(X_train,y_train) elif CONFIG.REGRESSION_MODEL == 'LR_L1': alphas = get_alphas( 50, min_pow = -4.2, max_pow = 0.8 ) min_mse = 1000 best_model = [] for i in range(len(alphas)): model = linear_model.Lasso(alpha=alphas[i], max_iter=1e5, tol=1e-3) model.fit(X_train,y_train) y_val_pred = np.clip( model.predict(X_val), 0, 1 ) tmp = 1.0/len(y_val) * np.sum( (y_val-y_val_pred)**2 ) if tmp < min_mse: min_mse = tmp best_model = model model = best_model elif CONFIG.REGRESSION_MODEL == 'LR_L2': alphas = get_alphas( 50, min_pow = -4.2, max_pow = 0.8 ) min_mse = 1000 best_model = [] for i in range(len(alphas)): model = linear_model.Ridge(alpha=alphas[i], max_iter=1000, tol=1e-3) model.fit(X_train,y_train) y_val_pred = np.clip( model.predict(X_val), 0, 1 ) tmp = 1.0/len(y_val) * np.sum( (y_val-y_val_pred)**2 ) if tmp < min_mse: min_mse = tmp best_model = model model = best_model elif CONFIG.REGRESSION_MODEL == 'GB': if False: choosen_metrics = 25 print('number of metrics for gradient boosting:', choosen_metrics) coefs_model = np.zeros((X_train.shape[1])) num_metrics = int(X_train.shape[1]/(num_prev_frames+1)) coef_metrics = np.zeros((num_metrics)) for k in range(5): model = xgb.XGBRegressor(max_depth=5, colsample_bytree=0.5, n_estimators=100, reg_alpha=0.4, reg_lambda=0.4) model.fit( X_train, y_train ) importance_metrics = abs(np.array(model.feature_importances_)) for l in range(num_prev_frames+1): coef_metrics += importance_metrics[num_metrics*l:num_metrics*(l+1)] index_coefs = np.argsort(coef_metrics)[0:choosen_metrics] X_train_new = np.zeros((X_train.shape[0], choosen_metrics*(num_prev_frames+1))) X_val_new = np.zeros((X_val.shape[0], choosen_metrics*(num_prev_frames+1))) X_test_new = np.zeros((X_test.shape[0], choosen_metrics*(num_prev_frames+1))) counter = 0 for k in range(num_metrics): if k in index_coefs: for l in range(num_prev_frames+1): X_train_new[:,counter] = X_train[:,num_metrics*l+k] X_val_new[:,counter] = X_val[:,num_metrics*l+k] X_test_new[:,counter] = X_test[:,num_metrics*l+k] counter += 1 model = xgb.XGBRegressor(max_depth=5, colsample_bytree=0.5, n_estimators=100, reg_alpha=0.4, reg_lambda=0.4) model.fit( X_train_new, y_train ) importance_metrics = np.array(model.feature_importances_) counter = 0 for k in range(num_metrics): if k in index_coefs: for l in range(num_prev_frames+1): coefs_model[num_metrics*l+k] = importance_metrics[counter] counter += 1 X_train = X_train_new.copy() X_val = X_val_new.copy() X_test = X_test_new.copy() else: model = xgb.XGBRegressor(max_depth=5, colsample_bytree=0.5, n_estimators=100, reg_alpha=0.4, reg_lambda=0.4) model.fit( X_train, y_train ) elif CONFIG.REGRESSION_MODEL == 'NN_L1': num_metrics = int(X_train.shape[1]/(num_prev_frames+1)) # (components, num_prev_frames+1, number of metrics) X_train = X_train.reshape(X_train.shape[0], num_prev_frames+1, num_metrics ) X_val = X_val.reshape(X_val.shape[0], num_prev_frames+1, num_metrics ) X_test = X_test.reshape(X_test.shape[0], num_prev_frames+1, num_metrics ) print('X_train and X_val shape', X_train.shape, X_val.shape) input_shape = (X_train.shape[1], X_train.shape[2]) inp = Input(input_shape) weight=1e-4 dropout=0.25 y = inp y = Conv1D(filters=16, kernel_size=(5,), padding='same', strides=1, kernel_regularizer=regularizers.l1(weight), activation='relu')(inp) y = Flatten()(y) y = 
Dense( 50, kernel_regularizer=regularizers.l1(weight), activation='relu' )(y) y = Dense( 1 )(y) model = Model(inputs=inp,outputs=y) model.summary() model.compile(optimizer='adam', loss='mean_squared_error', metrics=[]) model.fit(X_train, y_train, epochs=200, validation_data=(X_val,y_val), batch_size=128) elif CONFIG.REGRESSION_MODEL == 'NN_L2': num_metrics = int(X_train.shape[1]/(num_prev_frames+1)) # (components, num_prev_frames+1, number of metrics) X_train = X_train.reshape(X_train.shape[0], num_prev_frames+1, num_metrics ) X_val = X_val.reshape(X_val.shape[0], num_prev_frames+1, num_metrics ) X_test = X_test.reshape(X_test.shape[0], num_prev_frames+1, num_metrics ) print('X_train and X_val shape', X_train.shape, X_val.shape) input_shape = (X_train.shape[1], X_train.shape[2]) inp = Input(input_shape) wdecay=1e-3 dropout=0.25 y = inp y = Conv1D(filters=16, kernel_size=(5,), padding='same', strides=1, kernel_regularizer=regularizers.l2(wdecay), activation='relu')(inp) y = Flatten()(y) y = Dense( 50, kernel_regularizer=regularizers.l2(wdecay), activation='relu' )(y) y = Dense( 1 )(y) model = Model(inputs=inp,outputs=y) model.summary() model.compile(optimizer='adam', loss='mean_squared_error', metrics=[]) model.fit(X_train, y_train, epochs=200, validation_data=(X_val,y_val), batch_size=128) print(model.summary()) y_train_pred = np.clip( model.predict(X_train), 0, 1 ) y_val_pred = np.clip( model.predict(X_val), 0, 1 ) y_test_R_pred = np.clip( model.predict(X_test), 0, 1 ) return y_train_pred, y_val_pred, y_test_R_pred, model
        tree.DecisionTreeClassifier(**TREE_PARAMS),
        utils.train_model_classification,
    ),
    (
        "regression",
        "random_forest",
        ensemble.RandomForestRegressor(**FOREST_PARAMS),
        utils.train_model_regression,
    ),
    (
        "classification",
        "random_forest",
        ensemble.RandomForestClassifier(**FOREST_PARAMS),
        utils.train_model_classification,
    ),
    (
        "regression",
        "xgboost",
        xgboost.XGBRegressor(**XGBOOST_PARAMS),
        utils.train_model_regression,
    ),
    (
        "classification",
        "xgboost",
        xgboost.XGBClassifier(**XGBOOST_PARAMS),
        utils.train_model_classification,
    ),
    (
        "regression",
        "lightgbm",
        lightgbm.LGBMRegressor(**LIGHT_GBM_PARAMS),
        utils.train_model_regression,
    ),
    (
        "classification",
        "lightgbm",
        lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS),
def search(self, feature, label, vaild_data=None, sample_weight=None, metrics=mean_squared_error, loss='reg:squarederror', scoring=0.5, cv=5, cv_num=3, metrics_min=True, speedy=True, speedy_param=(20000, 0.3), gpu_id=-1, save_model_dir=None, save_model_name='xgb'): """XGBRegressor model params search use GridSearch method. Args: feature: pandas dataframe, model's feature. label: pandas series, model's label. vaild_data: A list of (X, y, sample_weight) tuple pairs to use as validation sets, for which metrics will be computed. sample_weight: pd.Series or np.array, sample weight, shape is (n,). metrics: model metrics function, default is `la.metircs.mean_squared_error`. loss: XGBRegressor param 'objective'. scoring: metrics error opt base line value. cv: cross validation fold. cv_num: if use speedy method, minimum cross validation fold. metrics_min: metrics value whether the smaller the better. speedy: whether use speedy method. speedy_param: if use speedy method, test_size will be set, test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2). gpu_id: int, use gpu device ordinal, -1 is not use gpu. save_model_dir: str, save model folder. save_model_name: str, save model name prefix, "`xgb`_model.json" and "`xgb`_params.json". Returns: a best XGBRegressor model params dict. Raises: params error. """ import warnings warnings.filterwarnings("ignore") import xgboost as xgb assert xgb.__version__ >= __xgboost_version__, f'xgboost version should be >={__xgboost_version__}.' logger = Logger(name='xgb') logger.info(f"api is deprecated and will be removed in 1.5.0") logger.info(f"please use la.param_search.GridSearch") if speedy: test_size = 1 - round( min(speedy_param[0], feature.shape[0] * speedy_param[1]) / feature.shape[0], 2) tree_method = ['gpu_hist'] if gpu_id > -1 else [ 'auto', 'exact', 'approx', 'hist' ] n_job = int(np.ceil(cpu_count() * 0.8)) self.HyperParameter.Choice('n_jobs', [n_job]) self.HyperParameter.Choice('objective', [loss]) self.HyperParameter.Choice('tree_method', tree_method) self.HyperParameter.Choice('gpu_id', [gpu_id]) if vaild_data is not None: cv_score_list = [] logger.info(f"Start XGBRegressor hyperparameter grid search.") nums = self.HyperParameter.cardinality() for i in range(1, nums + 1): self.HyperParameter.update(self.best_params) model = xgb.XGBRegressor(**self.HyperParameter.params) score = [] if speedy: for _ in range(cv_num): index_list = train_test_split(feature, test_size=test_size, shuffle=True, seed=np.random.choice( range(100), 1)[0]) weight = None if sample_weight is None else sample_weight[ index_list[0]] model.fit(feature.loc[index_list[0]], label[index_list[0]], sample_weight=weight) cv_pred = pd.Series(model.predict( feature.loc[index_list[1]]), index=label[index_list[1]].index) if sample_weight is None: score.append(metrics(label[index_list[1]], cv_pred)) else: score.append( metrics(label[index_list[1]], cv_pred, sample_weight=sample_weight)) else: index_list = kfold(feature, n_splits=cv, shuffle=True, seed=np.random.choice(range(100), 1)[0]) for n, index in enumerate(index_list): weight = None if sample_weight is None else sample_weight[ index[0]] model.fit(feature.loc[index[0]], label[index[0]], sample_weight=weight) cv_pred = pd.Series(model.predict(feature.loc[index[1]]), index=label[index[1]].index) if sample_weight is None: score.append(metrics(label[index[1]], cv_pred)) else: score.append( metrics(label[index[1]], cv_pred, sample_weight=sample_weight)) cv_score = np.mean(score) if vaild_data is not None: 
cv_score_list.append(cv_score) if metrics_min: cv_score_list.sort() if cv_score_list[int( len(cv_score_list) * 0.2)] >= cv_score: cv_pred = pd.Series(model.predict(vaild_data[0]), index=vaild_data[1].index) if len(vaild_data) == 2: cv_score = metrics(vaild_data[1], cv_pred) else: cv_score = metrics(vaild_data[1], cv_pred, sample_weight=vaild_data[2]) else: logger.info( f"Grid search progress: {i/nums*100:.1f}%, best score: {scoring:.4f}", enter=False if i < nums else True) continue else: cv_score_list.sort(reverse=1) if cv_score_list[int( len(cv_score_list) * 0.2)] <= cv_score: cv_pred = pd.Series(model.predict(vaild_data[0]), index=vaild_data[1].index) cv_score = metrics(vaild_data[1], cv_pred) else: logger.info( f"Grid search progress: {i/nums*100:.1f}%, best score: {scoring:.4f}", enter=False if i < nums else True) continue if metrics_min: if cv_score < scoring: scoring = cv_score self.best_params = self.HyperParameter.params.copy() self.best_params_history[i] = { 'score': scoring, 'best_params': self.best_params.copy() } if save_model_dir is not None: model.save_model( os.path.join(save_model_dir, f"{save_model_name}_model.json")) with open( os.path.join(save_model_dir, f"{save_model_name}_params.json"), 'w') as f: json.dump(best_params, f) else: if cv_score > scoring: scoring = cv_score self.best_params = self.HyperParameter.params.copy() self.best_params_history[i] = { 'score': scoring, 'best_params': self.best_params.copy() } if save_model_dir is not None: model.save_model( os.path.join(save_model_dir, f"{save_model_name}_model.json")) with open( os.path.join(save_model_dir, f"{save_model_name}_params.json"), 'w') as f: json.dump(best_params, f) logger.info( f"Grid search progress: {i/nums*100:.1f}%, best score: {scoring:.4f}", enter=False if i < nums else True) logger.info(f"XGBRegressor grid search best score: {scoring:.4f}", close=True, time_mode=1) return self.best_params
#### ALL INPUTS FEATURES
# # # - - - XGB - - - # # #
model = 'xgb3'
best_parameters3 = {
    'colsample_bytree': 0.85,
    'learning_rate': 0.02,
    'max_depth': 10,
    'min_child_weight': 3,
    'n_estimators': 700,
    'nthread': 4,
    'objective': 'reg:linear',
    'silent': 1,
    'subsample': 0.85
}
regression = xgb.XGBRegressor(**best_parameters3)
X_train = df.drop(['calories_per_ha'], axis=1)
y_train = df['calories_per_ha']
regression.fit(X_train, y_train)
save_model('../ipbes_invest_crop_yield_project/output/Models/' + model + '.sav')

y_predicted = regression.predict(X_validation)
R2_validation = sklearn.metrics.r2_score(y_validation, y_predicted)
validation_R2 = validation_R2.append(
    {
        'Model': model,
        'Validation_R2': R2_validation
    },
    ignore_index=True)
from sklearn.feature_selection import SelectFromModel
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score

dataset = load_boston()
x = dataset.data
y = dataset.target
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.01)
model.fit(x_train, y_train,
          verbose=True,
          eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)

result = model.evals_result()
print(result)

y_pred = model.predict(x_test)
score = r2_score(y_test, y_pred)
print(score)
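# A short follow-up sketch (assuming the fit with early_stopping_rounds above
# succeeded): inspect where boosting stopped and plot the two RMSE curves that
# evals_result() returned for the train and test eval sets.
import matplotlib.pyplot as plt

print("best iteration:", model.best_iteration)
train_rmse = result['validation_0']['rmse']
test_rmse = result['validation_1']['rmse']
plt.plot(train_rmse, label='train rmse')
plt.plot(test_rmse, label='test rmse')
plt.xlabel('boosting round')
plt.legend()
plt.show()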
lrs = [
    linear_model.BayesianRidge(),
    linear_model.ARDRegression(),
    linear_model.ElasticNet(),
    linear_model.HuberRegressor(),
    linear_model.Lars(),
    linear_model.LinearRegression(),
    linear_model.LogisticRegression(),  # classifier, kept from the original list
    linear_model.PassiveAggressiveRegressor(),
    # Only available in older scikit-learn; removed in scikit-learn >= 0.21
    linear_model.RandomizedLogisticRegression(),
    linear_model.RANSACRegressor(),
    linear_model.Ridge(),
    linear_model.SGDRegressor(),
    linear_model.TheilSenRegressor(),
    xgb.XGBRegressor(learning_rate=0.1, reg_alpha=1),
    xgb.XGBRegressor(learning_rate=0.2, reg_alpha=1),
    xgb.XGBRegressor(learning_rate=0.2),
    ensemble.AdaBoostRegressor(),
    ensemble.BaggingRegressor(),
    ensemble.ExtraTreesRegressor(n_estimators=100),
    ensemble.GradientBoostingRegressor(),
    ensemble.RandomForestRegressor(n_estimators=100)
]
best_lr = None
lr = xgb.XGBRegressor(learning_rate=0.1, max_depth=2, reg_alpha=1)  # linear_model.BayesianRidge()
cv = model_selection.cross_val_score(lr, v1, v.y, cv=10,
gbm_gridsearch.fit(x_train_values, y_train_values)
gbm_best_model_predictions = gbm_gridsearch.best_estimator_.predict(
    x_test_values)
generate_submission_file(
    gbm_best_model_predictions, test_data["Id"],
    "../results/" + user + "_Gradient_Boosted_Machines_GridSearchCV.csv")

#####################################################################
##                           XGBoost                               ##
#####################################################################

#####################################################################
###                   Weak Learner is a Tree                      ###
#####################################################################
xgb_model = xgb.XGBRegressor()
xgb_model.fit(x_train_values, y_train_values)
xgb_model_predictions = xgb_model.predict(x_test_values)
generate_submission_file(xgb_model_predictions, test_data["Id"],
                         "../results/" + user + "_XGBoost_Basic.csv")

param_grid = {
    "max_depth": [2, 4, 6],
    "n_estimators": np.linspace(100, 500, 5, dtype=np.int64)
}
xgb_grid_search = GridSearchCV(xgb.XGBRegressor(objective="reg:linear"),
                               param_grid)
xgb_grid_search.fit(x_train_values, y_train_values)
xgb_model_predictions = xgb_grid_search.predict(x_test_values)
generate_submission_file(xgb_model_predictions, test_data["Id"],
                         "../results/" + user + "_XGBoost_GridSearchCV.csv")
class HousePrices(object): seq2 = pd.Series(np.arange(2)) #Static class models. lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1)) ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)) model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3, min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213, silent=1, random_state=7, nthread=-1) GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=5) model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319, feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6, min_sum_hessian_in_leaf=11) KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) #Constructor def __init__(self, trainData, testData): self.trainData = trainData self.testData = testData def dataImport(self): self.train = pd.read_csv(self.trainData) self.test = pd.read_csv(self.testData) self.train_Id = self.train['Id'] self.test_Id = self.test['Id'] self.train.drop("Id", axis=1, inplace=True) self.test.drop("Id", axis=1, inplace=True) def display(self): print(len(self.train.columns)) fig, ax = plt.subplots() ax.scatter(x=self.train['GrLivArea'], y=self.train['SalePrice']) plt.ylabel('SalePrice', fontsize=13) plt.xlabel('GrLivArea', fontsize=13) #plt.show() # corrmat = self.train.corr() # f, ax = plt.subplots(figsize=(12, 9)) # sns.heatmap(self.corrmat, vmax=.8, square=True); plt.show() # sns.distplot(self.train['SalePrice'] , fit=norm); # # Get the fitted parameters used by the function # (mu, sigma) = norm.fit(self.train['SalePrice']) # print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma)) # #Now plot the distribution # plt.legend(['Normal dist. 
($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],loc='best') # plt.ylabel('Frequency') # plt.title('SalePrice distribution') # #Get also the QQ-plot # fig = plt.figure() # res = stats.probplot(self.train['SalePrice'], plot=plt) # plt.show() # f, ax = plt.subplots(figsize=(15, 12)) # plt.xticks(rotation='90') # sns.barplot(x=self.all_data_na.index, y=self.all_data_na) # plt.xlabel('Features', fontsize=15) # plt.ylabel('Percent of missing values', fontsize=15) # plt.title('Percent missing data by feature', fontsize=15) #plt.show() def removeOutliers(self): self.train = self.train.drop( self.train[(self.train['GrLivArea'] > 4000) & (self.train['SalePrice'] < 300000)].index) def preProcess(self): self.removeOutliers() self.train['SalePrice'] = np.log1p(self.train['SalePrice']) self.ntrain = self.train.shape[0] self.ntest = self.test.shape[0] self.y_train = self.train.SalePrice.values self.all_data = pd.concat( (self.train, self.test)).reset_index(drop=True) self.all_data.drop(['SalePrice'], axis=1, inplace=True) print("all_data size is : {}".format(self.all_data.shape)) self.all_data_na = (self.all_data.isnull().sum() / len(self.all_data)) * 100 self.all_data_na = self.all_data_na.drop( self.all_data_na[self.all_data_na == 0].index).sort_values( ascending=False)[:30] self.missing_data = pd.DataFrame({'Missing Ratio': self.all_data_na}) self.preprocessCategoricalColumns() self.preProcessNumericalColumns() def preprocessCategoricalColumns(self): #Converting PoolQC column to categorical and then using a probability distribution to fill the None values. print("Total Number of values ", self.all_data['PoolQC'].shape[0]) print("Number of Null Values", self.all_data['PoolQC'].isna().sum()) # # PoolQC # # #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. # (1) Filling NaN with None values and make the column categorical self.all_data["PoolQC"] = self.all_data.PoolQC.fillna("None") self.all_data['PoolQC'] = pd.Categorical(self.all_data.PoolQC) # (2) Finding probabilities of each occurance print("Before filling :") print(self.all_data['PoolQC'].value_counts()) self.poolQC_probabilities = [ 0.98, 0.006666667, 0.006666667, 0.006666667 ] self.poolQC_Values = ['None', 'Gd', 'Fa', 'Ex'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['PoolQC'] == 'None'].index # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 65] = np.random.choice(self.poolQC_Values, len(self.indices), p=self.poolQC_probabilities) print("After filling :") print(self.all_data.PoolQC.value_counts()) ############################################################################################ # # MiscFeature # # #Number of Missing values in MiscFeature self.all_data.MiscFeature.isna().sum( ) # 1404 Null values in this column #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. 
# (1) Filling NaN with None values and make the column categorical self.all_data["MiscFeature"] = self.all_data['MiscFeature'].fillna( "None") self.all_data['MiscFeature'] = pd.Categorical( self.all_data['MiscFeature']) self.all_data.MiscFeature = self.all_data.MiscFeature.astype( 'category') # print("Before Filling :") # print(self.all_data['MiscFeature'].value_counts()) # (2) Finding probabilities of each occurance print(self.all_data['MiscFeature'].value_counts()) self.MiscFeature_probabilities = [ 0.962962963, 0.033607682, 0.001371742, 0.001371742, 0.000685871 ] self.MiscFeature_Values = ['None', 'Shed', 'Othr', 'Gar2', 'TenC'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['MiscFeature'] == 'None'].index #Find the column index so as to use 'iloc' . 56 is the col np.argwhere(self.all_data.columns == 'MiscFeature') # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 56] = np.random.choice( self.MiscFeature_Values, len(self.indices), p=self.MiscFeature_probabilities) # print("After filling") # print(self.all_data["MiscFeature"].value_counts()) ############################################################################################ # # Alley # # #Number of Missing values in Alley self.all_data['Alley'].isna().sum() # 1367 Null values in this column #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. # (1) Filling NaN with None values and make the column categorical self.all_data["Alley"] = self.all_data['Alley'].fillna("None") self.all_data['Alley'] = pd.Categorical(self.all_data['Alley']) # (2) Finding probabilities of each occurance print("Before filling :") print(self.all_data['Alley'].value_counts()) # Count of 'None' : 1367 # Count of 'Grvl' : 50 # Count of 'Pave' : 41 self.Alley_probabilities = [0.937585734, 0.034293553, 0.028120713] self.Alleyy_Values = ['None', 'Grvl', 'Pave'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['Alley'] == 'None'].index #Find the column index so as to use 'iloc' . 3 is the col np.argwhere(self.all_data.columns == 'Alley') # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 3] = np.random.choice(self.Alleyy_Values, len(self.indices), p=self.Alley_probabilities) print("gg") self.all_data['Alley'].value_counts() print("After filling :") print(self.all_data['Alley'].value_counts()) ########################################################################################### # # Fence # # #Number of Missing values in Alley self.all_data['Fence'].isna().sum() # 1177 Null values in this column #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. 
# (1) Filling NaN with None values and make the column categorical self.all_data["Fence"] = self.all_data['Fence'].fillna("None") self.all_data['Fence'] = pd.Categorical(self.all_data['Fence']) # (2) Finding probabilities of each occurance print("Before filling :") print(self.all_data['Fence'].value_counts()) # Count of 'None' : 1177 # Count of 'MnPrv' : 157 # Count of 'GdPrv' : 59 # Count of 'GdWo' : 54 # Count of 'MnWw' : 11 self.Fence_probabilities = [ 0.807270233, 0.107681756, 0.040466392, 0.037037037, 0.007544582 ] self.Fence_Values = ['None', 'MnPrv', 'GdPrv', 'GdWo', 'MnWw'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['Fence'] == 'None'].index #Find the column index so as to use 'iloc' . 25 is the col np.argwhere(self.all_data.columns == 'Fence') # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 25] = np.random.choice(self.Fence_Values, len(self.indices), p=self.Fence_probabilities) print("After filling :") print(self.all_data['Fence'].value_counts()) ######################################################################################### # # FirePlaceQu # # #Number of Missing values in FireplaceQu self.all_data['FireplaceQu'].isna().sum( ) # 690 Null values in this column #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. # (1) Filling NaN with None values and make the column categorical self.all_data["FireplaceQu"] = self.all_data['FireplaceQu'].fillna( "None") self.all_data['FireplaceQu'] = pd.Categorical( self.all_data['FireplaceQu']) # (2) Finding probabilities of each occurance print("Before filling :") print(self.all_data['FireplaceQu'].value_counts()) # Count of 'None' : 690 # Count of 'Gd' : 378 # Count of 'TA' : 313 # Count of 'Fa' : 33 # Count of 'Ex' : 24 # Count of 'Po' : 20 self.FireplaceQu_probabilities = [ 0.473251029, 0.259259259, 0.214677641, 0.022633745, 0.016460905, 0.013717421 ] self.FireplaceQu_Values = ['None', 'Gd', 'TA', 'Fa', 'Ex', 'Po'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['FireplaceQu'] == 'None'].index #Find the column index so as to use 'iloc' . 26 is the col np.argwhere(self.all_data.columns == 'FireplaceQu') # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 26] = np.random.choice( self.FireplaceQu_Values, len(self.indices), p=self.FireplaceQu_probabilities) print("After filling :") print(self.all_data['FireplaceQu'].value_counts()) ########################################################################################### # # LotFrontage # # ''' Assuming houses belonging to the same Neighborhood will have similar LotFrontage, we groupby Neighborhood and then take mean for each locality. 
Then we substitute the missing values of a particular Neighborhood with the mean of that Neighborhood ''' self.lotFrontage_df = self.all_data[['Neighborhood', 'LotFrontage']].copy() self.groupby_Neighborhood = self.lotFrontage_df.groupby('Neighborhood') self.indices = self.all_data[self.all_data['LotFrontage'].isna()].index self.mean_Neighborhood = self.groupby_Neighborhood.mean() self.mean_Neighborhood.head() for i in self.indices: self.locality = self.all_data.iloc[i, 59] self.value = self.mean_Neighborhood.get_value( self.locality, 'LotFrontage') self.all_data.iloc[i, 49] = self.value ########################################################################################### # # # (6)GarageYrBlt (7) GarageArea (8) GarageCar # # (9)GarageType (10) GarageFinish (11) GarageQual (12)GarageCond for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'): self.all_data[col] = self.all_data[col].fillna(0) self.all_data['GarageType'] = self.all_data['GarageType'].fillna( 'None') self.all_data['GarageFinish'] = self.all_data['GarageFinish'].fillna( 'None') self.all_data['GarageQual'] = self.all_data['GarageQual'].fillna( 'None') self.all_data['GarageCond'] = self.all_data['GarageCond'].fillna( 'None') for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'): self.all_data[col] = self.all_data[col].fillna(0) for col in ('BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual'): self.all_data[col] = self.all_data[col].fillna('None') ############################################################################################# # # # Electrical , Exterior1st,Exterior2nd,SaleType,KitchenQual # # #Electrical has only 1 Null value , hence replacing by most frequently occuring value i.e. mode of the column self.all_data['Electrical'] = self.all_data['Electrical'].fillna( self.all_data['Electrical'].mode()[0]) #Similarly for Exterior1st, Exterior2nd,SaleType and KitchenQual self.all_data['Exterior1st'] = self.all_data['Exterior1st'].fillna( self.all_data['Exterior1st'].mode()[0]) self.all_data['Exterior2nd'] = self.all_data['Exterior2nd'].fillna( self.all_data['Exterior2nd'].mode()[0]) self.all_data['KitchenQual'] = self.all_data['KitchenQual'].fillna( self.all_data['KitchenQual'].mode()[0]) self.all_data['SaleType'] = self.all_data['SaleType'].fillna( self.all_data['SaleType'].mode()[0]) ############################################################################################## # # # # 'MasVnrArea','MasVnrType' and other columns # # self.indices = self.all_data[self.all_data['MasVnrArea'] == 0].index self.all_data['MasVnrArea'] = self.all_data['MasVnrArea'].fillna(0) self.all_data['MasVnrType'] = self.all_data['MasVnrType'].fillna( 'None') self.all_data = self.all_data.drop(['Utilities'], axis=1) self.all_data["Functional"] = self.all_data["Functional"].fillna("Typ") self.all_data['MSSubClass'] = self.all_data['MSSubClass'].fillna( "None") ############################################################################################## # Hence no remaining Columns with missing values. # MSSubClass is categorical as only a certain set of numbers are appearing. Hence converting it to categorical # OverallCond is categorical as only a certain set of numbers are appearing. 
Hence converting it to categorical self.all_data['MSSubClass'].unique() #array([ 20, 180, 60, 80, 50, 75, 30, 70, 90, 120, 45, 190, 85, 160, 40]) self.all_data['MSSubClass'] = self.all_data['MSSubClass'].apply(str) self.all_data['OverallCond'].unique() #array([6, 5, 7, 8, 3, 4, 9, 2, 1]) self.all_data['OverallCond'] = self.all_data['OverallCond'].apply(str) #Unlike Yrbuilt , YrSold is taking only a set of numbers converting it to categorical. self.all_data['YrSold'].unique() #array([2008, 2006, 2010, 2007, 2009]) self.all_data['YrSold'] = self.all_data['YrSold'].astype(str) #Similarly for MonthSold ie MoSold self.all_data['MoSold'].unique() #array([ 5, 6, 3, 4, 12, 7, 8, 11, 1, 10, 2, 9]) self.all_data['MoSold'] = self.all_data['MoSold'].astype(str) # Linear regression works only on columns with numeric values , Using labelEncoder to convert # the categorical colums to a numeric values #Set of columns which have categorical values: self.columns = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold') for column in self.columns: self.lbl = LabelEncoder() self.lbl.fit(list(self.all_data[column].values)) self.all_data[column] = self.lbl.transform( list(self.all_data[column].values)) # skewness = skewness[abs(skewness) > 0.75] # print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0])) # from scipy.special import boxcox1p # self.skewed_features = skewness.index # lam = 0.15 # for feat in self.skewed_features: # #all_data[feat] += 1 # self.all_data[feat] = boxcox1p(self.all_data[feat], self.lam) # This will map the labels of categorical data to 0,1,2,3 etc. self.all_data = pd.get_dummies(self.all_data) def preProcessNumericalColumns(self): #These features are positively correlated with the salePrice hence creating new features by #taking 3 polynomials square, cube and square root # Taking the top 10 correlated valuse. # OverallQual 0.817315 # GrLivArea 0.715624 # GarageCars 0.687771 # GarageArea 0.662332 # TotalBsmtSF 0.637558 # 1stFlrSF 0.608198 # FullBath 0.582020 # YearBuilt 0.572574 # As total square feet is important. 
        self.all_data['TotalSF'] = (self.all_data['TotalBsmtSF'] +
                                    self.all_data['1stFlrSF'] +
                                    self.all_data['2ndFlrSF'])

        # Square (-s2), cube (-s3) and square root (-Sq) of each feature.
        for col in ('OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
                    'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt',
                    'TotalSF'):
            self.all_data[col + '-s2'] = self.all_data[col]**2
            self.all_data[col + '-s3'] = self.all_data[col]**3
            self.all_data[col + '-Sq'] = np.sqrt(self.all_data[col])

        self.train = self.all_data[:1020]
        self.test = self.all_data[1020:]
        self.all_data.to_csv('./all.csv')

    # Validation function
    def rmsle_cv(self, model):
        # Pass the KFold object itself to cross_val_score; the original call
        # to get_n_splits() reduced it to a plain integer and silently threw
        # away the shuffling and random_state.
        self.kf = KFold(n_splits=5, shuffle=True, random_state=42)
        self.rmse = np.sqrt(-cross_val_score(model, self.train.values,
                                             self.y_train,
                                             scoring="neg_mean_squared_error",
                                             cv=self.kf))
        return self.rmse

    # Lasso. Best alpha: 0.0005 / 91% accuracy
    def lasso_model(self):
        self.lasso_m = Lasso()
        self.alpha = [0.0005, 0.0003, 0.0007]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.lasso_m, self.param_grid,
                                        scoring="r2", cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" % (self.grid_result.best_score_,
                                     self.grid_result.best_params_))
        self.lasso = self.grid_search.best_estimator_
        # self.lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
        # self.score = self.rmsle_cv(self.lasso)
        # print("\nLasso score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))
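    # The commented-out scoring lines in the model methods above and below all
    # repeat the same pattern; a small helper like this avoids that. The method
    # is a sketch added here (report_cv is a hypothetical name, not part of the
    # original class).
    def report_cv(self, name, model):
        score = self.rmsle_cv(model)
        print("{} score: {:.4f} ({:.4f})".format(name, score.mean(),
                                                 score.std()))
        return score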
    # ElasticNet. Best alpha: 0.001 / 91% accuracy
    def elasticNet(self):
        self.enet_m = ElasticNet()
        self.alpha = [0.0005, 0.0007, 0.001]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.enet_m, self.param_grid,
                                        scoring="r2", cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" % (self.grid_result.best_score_,
                                     self.grid_result.best_params_))
        self.enet_m = self.grid_search.best_estimator_
        # self.ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        # self.score = self.rmsle_cv(self.ENet)
        # print("ElasticNet score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    # Kernel Ridge regression. Best alpha: 0.0005 / 79% accuracy
    def kernelRegression(self):
        self.krr_m = KernelRidge()
        self.alpha = [0.0005, 0.0007, 0.001, 0.0006, 0.0001]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.krr_m, self.param_grid,
                                        scoring="r2", cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" % (self.grid_result.best_score_,
                                     self.grid_result.best_params_))
        self.krr_m = self.grid_search.best_estimator_
        # self.KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
        # self.score = self.rmsle_cv(self.KRR)
        # print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    # Gradient boosting. Best alpha: 0.00065 / 89% accuracy
    # (for GradientBoostingRegressor, alpha is the huber/quantile loss
    # parameter, not a regularization strength)
    def gradientBoosting(self):
        self.gboost_m = GradientBoostingRegressor()
        self.alpha = [0.00068, 0.00065, 0.00066]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.gboost_m, self.param_grid,
                                        scoring="r2", cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" % (self.grid_result.best_score_,
                                     self.grid_result.best_params_))
        # Keep the best gradient boosting estimator (the original code
        # overwrote self.krr_m here by mistake).
        self.gboost_m = self.grid_search.best_estimator_
        # self.GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=5)
        # self.score = self.rmsle_cv(self.GBoost)
        # print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    # XGBRegressor
    def xgbRegressor(self):
        # Uncommented so the method actually has a model to score; the original
        # referenced HousePrices.model_xgb, which was never defined.
        self.model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                                          learning_rate=0.05, max_depth=3,
                                          min_child_weight=1.7817,
                                          n_estimators=2200, reg_alpha=0.4640,
                                          reg_lambda=0.8571, subsample=0.5213,
                                          random_state=7, nthread=-1)
        self.score = self.rmsle_cv(self.model_xgb)
        print("Xgboost score: {:.4f} ({:.4f})\n".format(self.score.mean(),
                                                        self.score.std()))
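    # For GradientBoostingRegressor, `alpha` only affects the huber and
    # quantile losses, so the grid in gradientBoosting() tunes very little. A
    # sketch of a more conventional grid over tree parameters (the method name
    # and values are illustrative, not tuned results from the original):
    def gradientBoostingWideGrid(self):
        param_grid = {
            'n_estimators': [1000, 3000],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 4],
        }
        grid_search = GridSearchCV(GradientBoostingRegressor(loss='huber'),
                                   param_grid, scoring="r2", cv=5)
        grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" % (grid_search.best_score_,
                                     grid_search.best_params_))
        return grid_search.best_estimator_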
    # LGBMRegressor
    def lgbRegressor(self):
        # Uncommented for the same reason as model_xgb above.
        self.model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5,
                                           learning_rate=0.05, n_estimators=720,
                                           max_bin=55, bagging_fraction=0.8,
                                           bagging_freq=5,
                                           feature_fraction=0.2319,
                                           feature_fraction_seed=9,
                                           bagging_seed=9, min_data_in_leaf=6,
                                           min_sum_hessian_in_leaf=11)
        self.score = self.rmsle_cv(self.model_lgb)
        print("LgbRegressor score: {:.4f} ({:.4f})\n".format(self.score.mean(),
                                                             self.score.std()))

    def rmsle(self, y, y_pred):
        return np.sqrt(mean_squared_error(y, y_pred))

    def stackingModels(self):
        # Lasso
        self.lasso_stacking = make_pipeline(
            RobustScaler(), Lasso(alpha=0.0005, random_state=1))
        # ElasticNet
        self.ENet_stacking = make_pipeline(
            RobustScaler(),
            ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        # Kernel Ridge regression
        self.KRR_stacking = KernelRidge(alpha=0.6, kernel='polynomial',
                                        degree=2, coef0=2.5)
        # Gradient boosting
        self.GBoost_stacking = GradientBoostingRegressor(
            n_estimators=3000, learning_rate=0.05, max_depth=4,
            max_features='sqrt', min_samples_leaf=15, min_samples_split=10,
            loss='huber', random_state=5)
        # LightGBM
        self.lgb_stacking = lgb.LGBMRegressor(
            objective='regression', num_leaves=5, learning_rate=0.05,
            n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5,
            feature_fraction=0.2319, feature_fraction_seed=9, bagging_seed=9,
            min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

        # Stacking: ENet, GBoost and KRR as base models, Lasso as meta-model.
        self.stacked_averaged_models = StackingAveragedModels(
            base_models=(self.ENet_stacking, self.GBoost_stacking,
                         self.KRR_stacking),
            meta_model=self.lasso_stacking)
        self.score = self.rmsle_cv(self.stacked_averaged_models)
        print("Stacking Averaged models score: {:.4f} ({:.4f})".format(
            self.score.mean(), self.score.std()))

        self.stacked_averaged_models.fit(self.train.values, self.y_train)
        self.stacked_train_pred = self.stacked_averaged_models.predict(
            self.train.values)
        self.stacked_pred = np.expm1(
            self.stacked_averaged_models.predict(self.test.values))
        print("RMSE of the stacked model on the training set:")
        print(self.rmsle(self.y_train, self.stacked_train_pred))
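# StackingAveragedModels is referenced above but its definition is not part of
# this excerpt. A minimal sketch of such a class, following the usual
# out-of-fold stacking pattern (an assumption about the missing definition,
# not the original code):
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.model_selection import KFold
import numpy as np


class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        # Fit clones of each base model per fold and collect their out-of-fold
        # predictions as the training features for the meta-model.
        X, y = np.asarray(X), np.asarray(y)
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        out_of_fold = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_idx, holdout_idx in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_idx], y[train_idx])
                out_of_fold[holdout_idx, i] = instance.predict(X[holdout_idx])
        self.meta_model_.fit(out_of_fold, y)
        return self

    def predict(self, X):
        # Average each base model's fold-wise predictions, then let the
        # meta-model combine the averaged base predictions.
        meta_features = np.column_stack([
            np.column_stack([m.predict(X) for m in models]).mean(axis=1)
            for models in self.base_models_])
        return self.meta_model_.predict(meta_features)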
from sklearn.svm import LinearSVC

df = load()
df = df.loc[:70, :]
# print(df)
train_X, test_X, train_Y, test_Y = train_test_split(
    df[['ptt', 'vally_ptt', 'rr1', 'rr2', 'sum1', 'up1', 'down1', 'sum2',
        'up2', 'down2']], df['high_pluse'])

# XGBoost
predictor = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=1,
    # gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=27)

# Linear regression
# predictor = linear_model.LinearRegression()
# Support vector machine: kernel can be 'linear', 'poly' (polynomial) or
# 'rbf' (radial basis function)
# predictor = SVR(kernel='rbf')

predictor.fit(train_X, train_Y)
y = predictor.predict(test_X)
print('predict Y: ', list(y))
print('real Y: ', list(test_Y))
loss = mean_absolute_error(test_Y, y)
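# The commented-out alternatives above (linear regression and SVR) can be
# compared against XGBoost on the same split. A small sketch, not part of the
# original script; it reuses train_X/test_X/train_Y/test_Y from above and the
# model settings are illustrative only.
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

candidates = {
    'xgboost': xgb.XGBRegressor(learning_rate=0.1, n_estimators=1000,
                                max_depth=8, subsample=0.8,
                                colsample_bytree=0.8, random_state=27),
    'linear': LinearRegression(),
    'svr_rbf': SVR(kernel='rbf'),
}
for name, candidate in candidates.items():
    candidate.fit(train_X, train_Y)
    print(name, 'MAE:', mean_absolute_error(test_Y, candidate.predict(test_X)))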
model = xgb.cv(params, dtrain, num_boost_round=1000,
               early_stopping_rounds=100, nfold=5, metrics='rmse')
model.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot()
print(model['test-rmse-mean'].min())
print(model['train-rmse-mean'].min())

# The params were tuned using xgb.cv above.
model_xgb = xgb.XGBRegressor(n_estimators=410,
                             learning_rate=0.08,
                             max_depth=2,
                             min_child_weight=3,
                             gamma=0,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             objective='reg:squarederror',  # 'reg:linear' is deprecated
                             nthread=4,
                             scale_pos_weight=1,
                             seed=27)
model_xgb.fit(train_x, train_y)

# The target was log-transformed, so invert the transform with expm1.
xgb_preds = np.expm1(model_xgb.predict(test_x))
lasso_preds = np.expm1(model_lasso.predict(test_x))

# Scatter the two models' predictions against each other (the original fed the
# blended predictions into the "xgb" column, which defeats the comparison).
predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds})
predictions.plot(x="xgb", y="lasso", kind="scatter")

# Final prediction: weighted average of the two models.
preds = 0.7 * lasso_preds + 0.3 * xgb_preds
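# With early_stopping_rounds, xgb.cv stops adding rows once the test metric
# stops improving, so the length of the returned frame gives a data-driven
# choice of n_estimators instead of the hard-coded 410. A sketch reusing the
# `model` DataFrame, params and data from above (an addition, not part of the
# original tuning code):
best_rounds = model.shape[0]
print('Boosting rounds suggested by xgb.cv:', best_rounds)
model_xgb_cv = xgb.XGBRegressor(n_estimators=best_rounds, learning_rate=0.08,
                                max_depth=2, min_child_weight=3, subsample=0.8,
                                colsample_bytree=0.8,
                                objective='reg:squarederror', seed=27)
model_xgb_cv.fit(train_x, train_y)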
    # Eggholder test function with additive Gaussian noise; param_dict maps
    # parameter names to arrays of sampled values.
    X = np.array([param_dict[params] for params in param_dict]).reshape(2, -1)
    return (-(X[1] + 47) * np.sin(np.sqrt(np.abs(X[0] / 2.0 + (X[1] + 47))))
            - X[0] * np.sin(np.sqrt(np.abs(X[0] - (X[1] + 47))))
            + np.random.normal(0, 0.2, len(X[0])))


if __name__ == '__main__':
    file_id = time.time()
    step = 30
    n = 200
    eval_func = eggholder_function
    input_domain = [-212, 212]
    xgb = xgboost.XGBRegressor(verbosity=0)
    init_df = None

    explorer = Explorer(
        {
            'param1': RandomFloat(input_domain[0], input_domain[1]),
            'param2': RandomFloat(input_domain[0], input_domain[1]),
        },
        path="data/out_%d.csv" % file_id)

    for i in range(0, n, step):
        init_df = explorer.explore(step, eval_func, init_n=5)
        X, y = init_df.iloc[:, :-1].values, init_df.iloc[:, -1].values
        print("Number of data points : %d" % X.shape[0])
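        # The XGBRegressor created above is otherwise unused in this excerpt; a
        # sketch (an assumption about intent, not the original code) of fitting
        # it as a surrogate model over the points explored so far:
        xgb.fit(X, y)
        in_sample_rmse = np.sqrt(np.mean((xgb.predict(X) - y) ** 2))
        print("Surrogate in-sample RMSE: %.3f" % in_sample_rmse)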