def test_regression(self):
    """Fit NGBoost on the Boston housing data and check the held-out MSE.

    The data is split 80/20 with a fixed seed, a Normal/MLE NGBoost model
    with natural gradients is trained on the larger part, and the test
    passes when the mean squared error on the smaller part is at most 8.0.
    """
    # `return_X_y` is passed by keyword: newer scikit-learn releases accept
    # it only as a keyword argument.
    data, target = load_boston(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    ngb = NGBoost(
        Base=default_tree_learner,
        Dist=Normal,
        Score=MLE,
        natural_gradient=True,
        verbose=False,
    )
    ngb.fit(x_train, y_train)

    preds = ngb.predict(x_test)
    score = mean_squared_error(y_test, preds)
    assert score <= 8.0
def fold_run(self, src_dir : str, X_train : pd.DataFrame, y_train, X_test : pd.DataFrame,
             n_folds : int, col : str, parameters = None, categorical_features = None):
    """
    Train one model per cross-validation fold and collect its predictions.

    The model family is picked by the first truthy flag among
    self.train_gbm (LightGBM), self.train_xg (XGBoost), self.train_cat
    (CatBoost) and self.train_ng (NGBoost). Predictions are clipped to be
    non-negative and R2 / RMSLE are printed for every fold.

    # Arguments:
        : src_dir - str - main dir for saving models, history and fold runs;
                    created when it does not exist
        : X_train, y_train - training dataset with labels; a DataFrame or
                    Series target is converted to a NumPy array so folds are
                    selected by position
        : X_test - test dataset, or None to skip every test-time step
        : n_folds - number of folds to split the dataset into
        : col - when self.stratify is truthy, the column of X_train holding
                    binary or multi-class labels to stratify on, because
                    StratifiedKFold does not support continuous values
        : parameters - run parameters of the tree-based model (see the
                    LightGBM, XGBoost and CatBoost APIs); indexed or unpacked
                    by the LightGBM, XGBoost and CatBoost branches, so it
                    must be provided for them
        : categorical_features - list - categorical features in the dataset,
                    forwarded to the LightGBM datasets
    # Returns:
        valid_predictions, test_predictions - arrays with one column per
        fold; rows of valid_predictions outside a fold's validation split
        stay zero. test_predictions is None when X_test is None.
    """
    def _non_negative(values):
        # Clip predictions at zero (no upper bound).
        return np.clip(values, a_min=0, a_max=None)

    def _fold_scores(y_true, y_pred):
        # R2 and RMSLE of one fold, with NaNs replaced by zeros first.
        y_true = np.nan_to_num(y_true)
        y_pred = np.nan_to_num(y_pred)
        return r2_score(y_true, y_pred), np.sqrt(mean_squared_log_error(y_true, y_pred))

    # Work on a plain array so that the fold indices produced by the splitter
    # are applied positionally rather than as pandas labels.
    if isinstance(y_train, (pd.DataFrame, pd.Series)):
        y_train = y_train.values

    if src_dir and not os.path.isdir(src_dir):
        print(f"Making dir:{src_dir}")
        os.makedirs(src_dir)

    try:
        print("X_train_shape",X_train.shape)
        if X_test is not None:
            print("X_test_shape",X_test.shape)
    except ValueError:
        print("Shape does not fit")

    if self.stratify:
        print(f"Make {n_folds} stratified folds")
        kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.seed)
        split_target = X_train[col]
    elif self.time_series:
        # TimeSeriesSplit has no random_state parameter.
        kf = TimeSeriesSplit(n_splits=n_folds)
        split_target = y_train
    else:
        # Unshuffled KFold does not use a seed, and scikit-learn rejects
        # random_state unless shuffle=True; the resulting folds are unchanged.
        kf = KFold(n_splits=n_folds)
        split_target = y_train

    valid_predictions = np.zeros((X_train.shape[0], n_folds))
    print("Vaild_predict",valid_predictions.shape[0])

    # Stays None when there is no test set, so the final return is always valid.
    test_predictions = None
    if X_test is not None:
        test_predictions = np.zeros((X_test.shape[0], n_folds))
        print("Test_predict",test_predictions.shape[0])

    # Manual counter: after the loop it holds the number of completed folds,
    # which is used in the json file name and passed to output_submission.
    i = 0
    for train_index, val_index in kf.split(X_train, split_target):
        if self.train_gbm:
            print("Train LightGBM")
            train_X = X_train.iloc[train_index]
            val_X = X_train.iloc[val_index]
            train_y = y_train[train_index]
            val_y = y_train[val_index]

            lgb_train = lgb.Dataset(train_X, train_y, categorical_feature=categorical_features)
            lgb_val = lgb.Dataset(val_X, val_y, categorical_feature=categorical_features,
                                  reference=lgb_train)
            gbm = lgb.train(params=parameters,
                            train_set=lgb_train,
                            num_boost_round=parameters['num_boost_round'],
                            valid_sets=[lgb_train, lgb_val],
                            early_stopping_rounds=parameters['early_stopping_rounds'],
                            evals_result=self.history,
                            verbose_eval=parameters['verbose_eval'],
                            feval=self.eval_metric)

            valid_predictions[val_index, i] = _non_negative(
                gbm.predict(val_X, num_iteration=gbm.best_iteration))
            # val_y is already restricted to this fold; indexing it again with
            # val_index would address rows that do not exist in the slice.
            r2, log_error = _fold_scores(val_y, valid_predictions[val_index, i])
            print(f"R2 Score for current validation set:{r2}")
            print(f"RMSLE for current val set:{log_error}")

            if self.save_model:
                print("Saving model")
                gbm.save_model(f'{src_dir}/fold_{i}_{self.name}_eval_history.txt')
            if self.save_history:
                print("Saving Hisotry")
                pd.to_pickle(self.history, f'{src_dir}/fold_{i}_{self.name}_pickle_eval_history.pkl')
            if self.test_predict and test_predictions is not None:
                test_predictions[:, i] = _non_negative(self.predict_test(X_test, i, src_dir))
            if self.importance:
                self.visualize_importance(i, src_dir)
            if self.show_metric_results:
                self.show_results(i)

        elif self.train_xg:
            print("Train XGBooost")
            train_X = np.nan_to_num(X_train.iloc[train_index])
            val_X = np.nan_to_num(X_train.iloc[val_index])
            train_y = np.nan_to_num(y_train[train_index])
            val_y = np.nan_to_num(y_train[val_index])

            feature_names = list(X_train.columns)
            xg_train = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)
            xg_val = xgb.DMatrix(val_X, label=val_y, feature_names=feature_names)
            eval_list = [(xg_train, 'train'), (xg_val, 'val')]
            xgboost_train = xgb.train(parameters, xg_train, evals=eval_list,
                                      evals_result=self.history,
                                      num_boost_round=parameters['boost_round'],
                                      early_stopping_rounds=parameters['early_stopping'],
                                      verbose_eval=parameters['verbose_eval'])

            valid_predictions[val_index, i] = _non_negative(
                xgboost_train.predict(xg_val, ntree_limit=xgboost_train.best_ntree_limit))
            r2, log_error = _fold_scores(val_y, valid_predictions[val_index, i])
            print(f"R2 Score for current validation set:{r2}")
            print(f"RMSLE for current val set:{log_error}")

            if self.save_model:
                print("Saving model")
                xgboost_train.save_model(f'{src_dir}/fold_{i}_{self.name}_eval_history.txt')
            if self.save_history:
                print("Saving Hisotry")
                pd.to_pickle(self.history, f'{src_dir}/fold_{i}_{self.name}_pickle_eval_history.pkl')
            if self.test_predict and test_predictions is not None:
                test_predictions[:, i] = _non_negative(
                    self.predict_test(X_test, i, src_dir, xgboost_train))
            if self.importance:
                self.visualize_importance(i, src_dir)
            if self.show_metric_results:
                self.show_results(i)

        elif self.train_cat:
            print("Training CatBoost")
            train_X = np.array(X_train.iloc[train_index], dtype=np.float32)
            val_X = np.array(X_train.iloc[val_index], dtype=np.float32)
            train_y = np.array(y_train[train_index], dtype=np.float32)
            val_y = np.array(y_train[val_index], dtype=np.float32)

            cat_train = catboost.Pool(train_X, label=train_y)
            cat_test = catboost.Pool(val_X, label=val_y)
            self.cat = catboost.CatBoostRegressor(**parameters).fit(
                cat_train, use_best_model=True, eval_set=cat_test, verbose_eval=True)
            self.history = self.cat.get_evals_result()

            valid_predictions[val_index, i] = _non_negative(self.cat.predict(cat_test))
            r2, log_error = _fold_scores(val_y, valid_predictions[val_index, i])
            print(f"R2 Score for current validation set:{r2}")
            print(f"RMSLE for current val set:{log_error}")

            if self.save_model:
                print("Saving model")
                self.cat.save_model(f'{src_dir}/fold_{i}_{self.name}_eval_history', format='json')
            if self.test_predict and test_predictions is not None:
                test_predictions[:, i] = _non_negative(self.predict_test(X_test, i, src_dir))

        elif self.train_ng:
            print("Train NGBooost")
            train_X = np.nan_to_num(X_train.iloc[train_index])
            val_X = np.nan_to_num(X_train.iloc[val_index])
            train_y = np.nan_to_num(y_train[train_index])
            val_y = np.nan_to_num(y_train[val_index])

            ng = NGBoost(Dist=Normal, Score=MLE,
                         Base=default_tree_learner, natural_gradient=True,
                         n_estimators=150, learning_rate=0.01, verbose=True,
                         verbose_eval=50).fit(train_X, train_y)

            valid_predictions[val_index, i] = _non_negative(ng.predict(val_X))
            rmse = np.sqrt(mean_squared_error(np.nan_to_num(val_y),
                                              np.nan_to_num(valid_predictions[val_index, i])))
            r2, log_error = _fold_scores(val_y, valid_predictions[val_index, i])
            print(f"RMSE for current fold:{rmse}")
            print(f"R2 Score for current fold:{r2}")
            print(f"RMSLE for current val set:{log_error}")

            # This branch predicts on the test set regardless of
            # self.test_predict. The test features get the same NaN
            # replacement as the data the model was fitted on.
            if test_predictions is not None:
                test_predictions[:, i] = _non_negative(ng.predict(np.nan_to_num(X_test)))

        i += 1

    # NOTE(review): the original formatting of this file was lost; the two
    # steps below are assumed to run once, after all folds - confirm.
    if self.jsonize:
        print("Saving model parameters to json")
        if not os.path.isdir('parameters'):
            print("Making Dir: parameters")
            os.makedirs('parameters')
        model_dict = {"model":f"{src_dir}_{i}_folds",
                      "parameters":parameters}
        with open(f"./parameters/{src_dir}_{i}_fold.json",'w+') as model_param:
            json.dump(model_dict, model_param)

    if self.prepare_submission and test_predictions is not None:
        # NOTE(review): self is also passed explicitly, as in the original
        # call; verify against the signature of output_submission.
        self.output_submission(self, test_predictions, i)

    return valid_predictions, test_predictions
'number_of_particles_in_this_jet', 'jet_px', 'jet_py', 'jet_pz', 'jet_energy', 'jet_mass' ]] # 随机切分数据集 X_train, X_test, Y_train, Y_test = train_test_split(features.values, train_data.label.values, test_size=0.2) ngb = NGBoost(Base=default_tree_learner, Dist=Normal, Score=MLE(), natural_gradient=True, verbose=False) # 拟合 ngb.fit(X_train, Y_train) # 预测 Y_preds = ngb.predict(X_test) test_data = pd.read_csv("jet_simple_data/simple_test_R04_jet.csv") features = test_data[[ 'number_of_particles_in_this_jet', 'jet_px', 'jet_py', 'jet_pz', 'jet_energy', 'jet_mass' ]] Y_test_data = ngb.predict(features) with open("submmission.csv", "") as f: f.write("id,label\n") for jet_id, label in zip(test_data['jet_id'], Y_test_data): f.write(jet_id + "," + label + "\n") Y_dists = ngb.pred_dist(X_test) # 检验均方误差 test Mean Squared Error test_MSE = mean_squared_error(Y_preds, Y_test) print('Test MSE', test_MSE) # 检验负对数似然test Negative Log Likelihood