def test_classification(self):
    """Smoke-test NGBoost classification on breast-cancer data (AUC >= 0.95)."""
    # Fix: return_X_y must be passed by keyword — positional use was
    # deprecated in scikit-learn 0.23 and later removed.
    data, target = load_breast_cancer(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42)
    ngb = NGBoost(Base=default_tree_learner, Dist=Bernoulli, Score=MLE,
                  verbose=False)
    ngb.fit(x_train, y_train)
    preds = ngb.pred_dist(x_test)
    # preds.prob is the predicted Bernoulli success probability per row.
    score = roc_auc_score(y_test, preds.prob)
    assert score >= 0.95
def test_regression(self):
    """Smoke-test NGBoost regression on the Boston housing data (MSE <= 8.0)."""
    # Fix: return_X_y must be passed by keyword — positional use was
    # deprecated in scikit-learn 0.23 and later removed.
    # NOTE(review): load_boston itself was removed in scikit-learn 1.2;
    # this test requires an older sklearn or a replacement dataset.
    data, target = load_boston(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42)
    ngb = NGBoost(Base=default_tree_learner, Dist=Normal, Score=MLE,
                  natural_gradient=True, verbose=False)
    ngb.fit(x_train, y_train)
    preds = ngb.predict(x_test)
    score = mean_squared_error(y_test, preds)
    assert score <= 8.0
def ngb_impute(estimator, X, Y):
    """Impute censored survival times with an NGBoost model.

    Fits NGBoost on (X, Y), then for censored rows (Y['Event'] == 0)
    replaces Y['Time'] with exp() of the model's predicted location.

    Arguments:
        estimator: NGBoost distribution class (passed as Dist).
        X: feature matrix.
        Y: structured array / frame with 'Event' and 'Time' fields.

    Returns:
        A copy of Y with censored 'Time' entries imputed.
    """
    base_name_to_learner = {
        "tree": default_tree_learner,
        "linear": default_linear_learner,
    }
    # LEARNER is a module-level setting selecting the base learner.
    ngb = NGBoost(Dist=estimator, n_estimators=200, learning_rate=.05,
                  natural_gradient=True, verbose=False, minibatch_frac=1.0,
                  Base=base_name_to_learner[LEARNER], Score=MLE)
    train = ngb.fit(X, Y)
    Y_imputed = np.copy(Y)
    cens_mask = (Y['Event'] == 0)
    pred_dists = train.pred_dist(X[cens_mask])
    # dist.loc is 2-D for some distributions and 1-D for others; take the
    # first column when present (EAFP).
    try:
        outputs = pred_dists.loc[:, 0]
    except IndexError:
        outputs = pred_dists.loc
    # Location is modelled on the log-time scale, so exponentiate back —
    # presumably estimator is a Log* distribution; TODO confirm at call site.
    Y_imputed['Time'][cens_mask] = np.exp(outputs)
    return Y_imputed
folds.append( (train_index, test_index) ) #breakpoint() for itr, (train_index, test_index) in enumerate(folds): X_trainall, X_test = X[train_index], X[test_index] y_trainall, y_test = y[train_index], y[test_index] X_train, X_val, y_train, y_val = train_test_split(X_trainall, y_trainall, test_size=0.2) y_true += list(y_test.flatten()) ngb = NGBoost(Base=base_name_to_learner[args.base], Dist=eval(args.distn), Score=score_name_to_score[args.score](64), n_estimators=args.n_est, learning_rate=args.lr, natural_gradient=args.natural, minibatch_frac=args.minibatch_frac, verbose=args.verbose) train_loss, val_loss = ngb.fit(X_train, y_train) #, X_val, y_val) y_preds = ngb.staged_predict(X_val) y_forecasts = ngb.staged_pred_dist(X_val) val_rmse = [mean_squared_error(y_pred, y_val) for y_pred in y_preds] val_nll = [-y_forecast.logpdf(y_val.flatten()).mean() for y_forecast in y_forecasts] best_itr = np.argmin(val_rmse) + 1 best_itr = np.argmin(val_nll) + 1 full_retrain = True if full_retrain:
"""Minimal NGBoost classification demo on the breast-cancer dataset."""
from ngboost.ngboost import NGBoost
from ngboost.distns import Bernoulli
from ngboost.learners import default_tree_learner
from ngboost.scores import MLE

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    # Fix: return_X_y must be passed by keyword — positional use was
    # deprecated in scikit-learn 0.23 and later removed.
    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    ngb = NGBoost(Base=default_tree_learner, Dist=Bernoulli, Score=MLE(),
                  verbose=True)
    ngb.fit(X_train, Y_train)
    preds = ngb.pred_dist(X_test)
    print("ROC:", roc_auc_score(Y_test, preds.prob))
# Simulate right-censored data and fit a LogNormal NGBoost survival model.
m, n = 1000, 5
X = np.random.randn(m, n) / np.sqrt(n)
Y = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1))
# Independent censoring time; args.eps shifts it to control the censor rate.
T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1)) + args.eps
C = (T < Y).astype(int)  # 1 where the outcome is censored (T occurs first)
print(X.shape, Y.shape, C.shape)
print(f"Censorship: {np.mean(C):.2f}")
X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
    X, Y, T, C, test_size=0.2)
ngb = NGBoost(Dist=LogNormal,
              n_estimators=args.n_estimators,
              learning_rate=args.lr,
              natural_gradient=False,
              Base=default_linear_learner,
              Score=MLE())
# Fit on exp(min(Y, T)) so times are positive for the LogNormal;
# Y_join presumably packs (time, censor-flag) — TODO confirm against its def.
train_losses = ngb.fit(X_tr, Y_join(np.exp(np.minimum(Y_tr, T_tr)), C_tr))
preds = ngb.pred_dist(X_te)
# log() maps predicted times back to the simulation's original scale.
print(f"R2: {r2_score(Y_te, np.log(preds.mean()))}")
plt.hist(preds.mean(), range=(-5, 5), bins=30, alpha=0.5, label="Pred")
plt.hist(Y_te, range=(-5, 5), bins=30, alpha=0.5, label="True")
plt.legend()
plt.show()
# since we simulated the data we fully observe all outcomes
pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
plot_calibration_curve(pctles, observed)
# CLI options: distribution, natural-gradient toggle, scoring rule.
argparser.add_argument("--distn", type=str, default="Normal")
argparser.add_argument("--natural", action="store_true")
argparser.add_argument("--score", type=str, default="CRPS")
args = argparser.parse_args()
np.random.seed(123)  # reproducible simulation
m, n = 1200, 50
noise = np.random.randn(*(m, 1))
beta1 = np.random.randn(n, 1)
X = np.random.randn(m, n) / np.sqrt(n)
# Linear signal plus homoskedastic noise scaled by --noise-lvl.
Y = X @ beta1 + args.noise_lvl * noise
print(X.shape, Y.shape)
# First 1000 rows train, remaining 200 test.
X_train, X_test = X[:1000, :], X[1000:, ]
Y_train, Y_test = Y[:1000], Y[1000:]
# NOTE(review): eval(args.score) trusts the CLI string — fine for a demo,
# unsafe on untrusted input.
ngb = NGBoost(n_estimators=400, learning_rate=args.lr,
              Dist=Normal,
              Base=default_linear_learner,
              natural_gradient=args.natural,
              minibatch_frac=1.0,
              Score=eval(args.score)(),
              verbose=True,
              verbose_eval=10)
losses = ngb.fit(X_train, Y_train)
forecast = ngb.pred_dist(X_test)
# forecast.loc is the predicted Normal mean per test row.
print("R2:", r2_score(Y_test, forecast.loc))
# Heteroskedastic simulation fit with CRPS; inspects the learned scales.
m, n = 1000, 10
noise = sp.stats.laplace.rvs(size=(m, 1))
beta1 = np.random.randn(n, 1)
beta2 = np.random.randn(n, 1)
X = np.random.randn(m, n) / np.sqrt(n)
# Y = X @ beta + 0.5 * noise
# Noise scale varies with X through beta2 (heteroskedastic).
Y = X @ beta1 + 0.5 * np.sqrt(np.exp(X @ beta2)) * noise
print(X.shape, Y.shape)
axis = np.linspace(0.0, 2, 200)  # support for the scale-density plot
plt.figure(figsize=(8, 3))
ngb = NGBoost(n_estimators=100, learning_rate=1.0,
              Dist=Normal,
              Base=default_linear_learner,
              natural_gradient=True,
              minibatch_frac=1.0,
              Score=CRPS())
ngb.fit(X, Y)
# In-sample predictive distributions; summarize the fitted scales.
preds = ngb.pred_dist(X)
print(preds.scale.mean())
print(preds.scale.std())
pctles, observed, slope, intercept = calibration_regression(preds, Y)
plt.subplot(1, 2, 1)
plot_pit_histogram(pctles, observed, label="CRPS", linestyle="--")
plt.subplot(1, 2, 2)
# NOTE(review): this call is truncated in the visible source.
plt.plot(axis, gaussian_kde(preds.scale)(axis), linestyle="--",
# Simulate right-censored data and fit a Laplace NGBoost survival model.
m, n = 1000, 5
X = np.random.randn(m, n) / np.sqrt(n)
Y = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1))
# Independent censoring time; args.eps shifts it to control the censor rate.
T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1)) + args.eps
C = (T < Y).astype(int)  # 1 where the outcome is censored (T occurs first)
print(X.shape, Y.shape, C.shape)
print(f"Censorship: {np.mean(C):.2f}")
X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
    X, Y, T, C, test_size=0.2)
ngb = NGBoost(Dist=Laplace,
              n_estimators=args.n_estimators,
              learning_rate=args.lr,
              natural_gradient=False,
              Base=default_linear_learner,
              Score=MLE_SURV())
# Target columns: observed time min(Y, T), then the censor flag.
train_losses = ngb.fit(X_tr, np.c_[np.minimum(Y_tr, T_tr), C_tr])
preds = ngb.pred_dist(X_te)
print(f"R2: {r2_score(Y_te, preds.loc)}")
plt.hist(preds.loc, range=(-5, 5), bins=30, alpha=0.5, label="Pred")
plt.hist(Y_te, range=(-5, 5), bins=30, alpha=0.5, label="True")
plt.legend()
plt.show()
# since we simulated the data we fully observe all outcomes
pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
plot_calibration_curve(pctles, observed)
# Simulate right-censored data and fit an Exponential NGBoost survival model.
m, n = 1000, 5
X = np.random.randn(m, n) / np.sqrt(n)
Y = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1))
# Independent censoring time; args.eps shifts it to control the censor rate.
T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1)) + args.eps
C = (T < Y).astype(int)  # 1 where the outcome is censored (T occurs first)
print(X.shape, Y.shape, C.shape)
print(f"Censorship: {np.mean(C):.2f}")
X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
    X, Y, T, C, test_size=0.2)
ngb = NGBoost(Dist=Exponential,
              n_estimators=args.n_estimators,
              learning_rate=args.lr,
              natural_gradient=True,
              Base=default_linear_learner,
              Score=MLE,
              verbose=True,
              verbose_eval=1)
# Fit on exp(min(Y, T)) so times are positive for the Exponential;
# Y_join presumably packs (time, censor-flag) — TODO confirm against its def.
train_losses = ngb.fit(X_tr, Y_join(np.exp(np.minimum(Y_tr, T_tr)), C_tr))
preds = ngb.pred_dist(X_te)
# log() maps predicted times back to the simulation's original scale.
print(f"R2: {r2_score(Y_te, np.log(preds.mean()))}")
plt.hist(preds.mean(), range=(0, 10), bins=30, alpha=0.5, label="Pred")
plt.hist(np.exp(Y_te), range=(0, 10), bins=30, alpha=0.5, label="True")
plt.legend()
plt.show()
# since we simulated the data we fully observe all outcomes
pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
def fold_run(self, src_dir: str, X_train: pd.DataFrame, y_train,
             X_test: pd.DataFrame, n_folds: int, col: str,
             parameters=None, categorical_features=None):
    """
    Run K-fold cross-validated training for the configured booster
    (LightGBM / XGBoost / CatBoost / NGBoost, per self.train_* flags).

    # Arguments:
        src_dir - str - main dir for saving model, history and fold runs
        X_train, y_train - training dataset with labels
        X_test - test dataset (may be None; test predictions then skipped)
        n_folds - number of folds to split dataset
        col - if self.stratify is True, a column of binary/multilabel
              classes (StratifiedKFold does not support continuous values)
        parameters - run parameters for the tree-based model; see the
              LightGBM / XGBoost / CatBoost API docs
        categorical_features - list of categorical features, needed for
              LightGBM and CatBoost

    # Returns:
        valid_predictions, test_predictions - out-of-fold and test
        predictions (test_predictions is None when X_test is None)
    """
    if isinstance(y_train, pd.DataFrame):
        # NOTE(review): after this conversion y_train is an ndarray, so the
        # DataFrame path in _select_y below is effectively dead.
        y_train = y_train.values
    if src_dir and not os.path.isdir(src_dir):
        print(f"Making dir:{src_dir}")
        os.makedirs(src_dir)
    try:
        print("X_train_shape", X_train.shape)
        if X_test is not None:
            print("X_test_shape", X_test.shape)
    except ValueError:
        print("Shape does not fit")

    if self.stratify:
        print(f"Make {n_folds} stratified folds")
        kf = StratifiedKFold(n_splits=n_folds, shuffle=True,
                             random_state=self.seed)
    elif self.time_series:
        # Fix: TimeSeriesSplit has no random_state parameter (its splits
        # are ordered by construction) — passing one raises TypeError.
        kf = TimeSeriesSplit(n_splits=n_folds)
    else:
        # Fix: random_state is only honoured (and, in recent scikit-learn,
        # only allowed) when shuffle=True.
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=self.seed)

    valid_predictions = np.zeros((X_train.shape[0], n_folds))
    print("Valid_predict", valid_predictions.shape[0])
    # Fix: initialize so the return statement cannot hit an unbound name
    # when X_test is None.
    test_predictions = None
    if X_test is not None:
        test_predictions = np.zeros((X_test.shape[0], n_folds))
        print("Test_predict", test_predictions.shape[0])

    def _select_y(train_index, val_index, cast=None):
        # Slice the labels for this fold, with an optional cleanup cast.
        if isinstance(y_train, pd.DataFrame):
            tr, va = y_train.iloc[train_index], y_train.iloc[val_index]
        else:
            tr, va = y_train[train_index], y_train[val_index]
        if cast is not None:
            tr, va = cast(tr), cast(va)
        return tr, va

    def _report_metrics(val_y, preds):
        # R2 / RMSLE on the current validation fold.
        # Fix: val_y is already the fold slice; the original indexed it
        # again with val_index (val_y[val_index]), scoring the wrong rows.
        r2 = r2_score(np.nan_to_num(val_y), np.nan_to_num(preds))
        log_error = np.sqrt(mean_squared_log_error(np.nan_to_num(val_y),
                                                   np.nan_to_num(preds)))
        print(f"R2 Score for current validation set:{r2}")
        print(f"RMSLE for current val set:{log_error}")

    def _to_f32(a):
        # CatBoost branch feeds plain float32 arrays.
        return np.array(a, dtype=np.float32)

    i = 0
    split = (kf.split(X_train, y_train) if self.stratify is False
             else kf.split(X_train, X_train[col]))
    for train_index, val_index in split:
        if self.train_gbm:
            print("Train LightGBM")
            train_X = X_train.iloc[train_index]
            val_X = X_train.iloc[val_index]
            train_y, val_y = _select_y(train_index, val_index)
            lgb_train = lgb.Dataset(train_X, train_y,
                                    categorical_feature=categorical_features)
            lgb_val = lgb.Dataset(val_X, val_y,
                                  categorical_feature=categorical_features,
                                  reference=lgb_train)
            gbm = lgb.train(params=parameters,
                            train_set=lgb_train,
                            num_boost_round=parameters['num_boost_round'],
                            valid_sets=[lgb_train, lgb_val],
                            early_stopping_rounds=parameters['early_stopping_rounds'],
                            evals_result=self.history,
                            verbose_eval=parameters['verbose_eval'],
                            feval=self.eval_metric)
            valid_predictions[val_index, i] = gbm.predict(
                val_X, num_iteration=gbm.best_iteration)
            # Clamp negative predictions to zero (target is non-negative).
            valid_predictions[val_index, i] = np.clip(
                valid_predictions[val_index, i], a_min=0, a_max=None)
            _report_metrics(val_y, valid_predictions[val_index, i])
            if self.save_model:
                print("Saving model")
                gbm.save_model(f'{src_dir}/fold_{i}_{self.name}_eval_history.txt')
            if self.save_history:
                print("Saving History")
                pd.to_pickle(self.history,
                             f'{src_dir}/fold_{i}_{self.name}_pickle_eval_history.pkl')
            if self.test_predict:
                test_predictions[:, i] = self.predict_test(X_test, i, src_dir)
                test_predictions[:, i] = np.clip(test_predictions[:, i],
                                                 a_min=0, a_max=None)
            if self.importance:
                self.visualize_importance(i, src_dir)
            if self.show_metric_results:
                self.show_results(i)
        elif self.train_xg:
            print("Train XGBooost")
            train_X = np.nan_to_num(X_train.iloc[train_index])
            val_X = np.nan_to_num(X_train.iloc[val_index])
            train_y, val_y = _select_y(train_index, val_index,
                                       cast=np.nan_to_num)
            xg_train = xgb.DMatrix(train_X, label=train_y,
                                   feature_names=X_train.columns)
            xg_val = xgb.DMatrix(val_X, label=val_y,
                                 feature_names=X_train.columns)
            eval_list = [(xg_train, 'train'), (xg_val, 'val')]
            xgboost_train = xgb.train(parameters, xg_train, evals=eval_list,
                                      evals_result=self.history,
                                      num_boost_round=parameters['boost_round'],
                                      early_stopping_rounds=parameters['early_stopping'],
                                      verbose_eval=parameters['verbose_eval'])
            valid_predictions[val_index, i] = xgboost_train.predict(
                xg_val, ntree_limit=xgboost_train.best_ntree_limit)
            valid_predictions[val_index, i] = np.clip(
                valid_predictions[val_index, i], a_min=0, a_max=None)
            _report_metrics(val_y, valid_predictions[val_index, i])
            if self.save_model:
                print("Saving model")
                xgboost_train.save_model(
                    f'{src_dir}/fold_{i}_{self.name}_eval_history.txt')
            if self.save_history:
                print("Saving History")
                pd.to_pickle(self.history,
                             f'{src_dir}/fold_{i}_{self.name}_pickle_eval_history.pkl')
            if self.test_predict:
                test_predictions[:, i] = self.predict_test(X_test, i, src_dir,
                                                           xgboost_train)
                test_predictions[:, i] = np.clip(test_predictions[:, i],
                                                 a_min=0, a_max=None)
            if self.importance:
                self.visualize_importance(i, src_dir)
            if self.show_metric_results:
                self.show_results(i)
        elif self.train_cat:
            print("Training CatBoost")
            train_X = np.array(X_train.iloc[train_index], dtype=np.float32)
            val_X = np.array(X_train.iloc[val_index], dtype=np.float32)
            train_y, val_y = _select_y(train_index, val_index, cast=_to_f32)
            cat_train = catboost.Pool(train_X, label=train_y)
            cat_test = catboost.Pool(val_X, label=val_y)
            self.cat = catboost.CatBoostRegressor(**parameters).fit(
                cat_train, use_best_model=True, eval_set=cat_test,
                verbose_eval=True)
            self.history = self.cat.get_evals_result()
            valid_predictions[val_index, i] = self.cat.predict(cat_test)
            valid_predictions[val_index, i] = np.clip(
                valid_predictions[val_index, i], a_min=0, a_max=None)
            _report_metrics(val_y, valid_predictions[val_index, i])
            if self.save_model:
                print("Saving model")
                self.cat.save_model(
                    f'{src_dir}/fold_{i}_{self.name}_eval_history',
                    format='json')
            if self.test_predict:
                test_predictions[:, i] = self.predict_test(X_test, i, src_dir)
                test_predictions[:, i] = np.clip(test_predictions[:, i],
                                                 a_min=0, a_max=None)
        elif self.train_ng:
            print("Train NGBooost")
            train_X = np.nan_to_num(X_train.iloc[train_index])
            val_X = np.nan_to_num(X_train.iloc[val_index])
            train_y, val_y = _select_y(train_index, val_index,
                                       cast=np.nan_to_num)
            ng = NGBoost(Dist=Normal, Score=MLE,
                         Base=default_tree_learner, natural_gradient=True,
                         n_estimators=150, learning_rate=0.01, verbose=True,
                         verbose_eval=50).fit(train_X, train_y)
            valid_predictions[val_index, i] = ng.predict(val_X)
            valid_predictions[val_index, i] = np.clip(
                valid_predictions[val_index, i], a_min=0, a_max=None)
            # Fix: score against the fold labels directly (no re-indexing).
            rmse = np.sqrt(mean_squared_error(
                np.nan_to_num(val_y),
                np.nan_to_num(valid_predictions[val_index, i])))
            r2 = r2_score(np.nan_to_num(val_y),
                          np.nan_to_num(valid_predictions[val_index, i]))
            log_error = np.sqrt(mean_squared_log_error(
                np.nan_to_num(val_y),
                np.nan_to_num(valid_predictions[val_index, i])))
            print(f"RMSE for current fold:{rmse}")
            print(f"R2 Score for current fold:{r2}")
            print(f"RMSLE for current val set:{log_error}")
            test_predictions[:, i] = np.clip(ng.predict(X_test),
                                             a_min=0, a_max=None)
        i += 1  # next fold column

    if self.jsonize:
        print("Saving model parameters to json")
        if not os.path.isdir('parameters'):
            print("Making Dir: parameters")
            os.makedirs('parameters')
        model_dict = {"model": f"{src_dir}_{i}_folds",
                      "parameters": parameters}
        with open(f"./parameters/{src_dir}_{i}_fold.json", 'w+') as model_param:
            json.dump(model_dict, model_param)
    if self.prepare_submission:
        # Fix: output_submission is a bound method — the original passed
        # self explicitly as an extra first argument.
        self.output_submission(test_predictions, i)
    return valid_predictions, test_predictions
class CatBoostRegressor(iterations=None, learning_rate=None, depth=None, l2_leaf_reg=None, model_size_reg=None, rsm=None, loss_function='RMSE', border_count=None, feature_border_type=None, per_float_feature_quantization=None, input_borders=None, output_borders=None, fold_permutation_block=None, od_pval=None, od_wait=None, od_type=None, nan_mode=None, counter_calc_method=None, leaf_estimation_iterations=None, leaf_estimation_method=None, thread_count=None, random_seed=None, use_best_model=None, best_model_min_trees=None, verbose=None, silent=None, logging_level=None, metric_period=None, ctr_leaf_count_limit=None, store_all_simple_ctr=None, max_ctr_complexity=None, has_time=None, allow_const_label=None, one_hot_max_size=None, random_strength=None, name=None, ignored_features=None, train_dir=None, custom_metric=None, eval_metric=None, bagging_temperature=None, save_snapshot=None, snapshot_file=None, snapshot_interval=None, fold_len_multiplier=None, used_ram_limit=None, gpu_ram_part=None, pinned_memory_size=None, allow_writing_files=None, final_ctr_computation_mode=None, approx_on_full_history=None, boosting_type=None, simple_ctr=None, combinations_ctr=None, per_feature_ctr=None, ctr_target_border_count=None, task_type=None, device_config=None, devices=None, bootstrap_type=None, subsample=None, sampling_unit=None, dev_score_calc_obj_block_size=None, max_depth=None, n_estimators=None, num_boost_round=None, num_trees=None, colsample_bylevel=None, random_state=None, reg_lambda=None, objective=None, eta=None, max_bin=None, gpu_cat_features_storage=None, data_partition=None, metadata=None, early_stopping_rounds=None, cat_features=None, grow_policy=None, min_data_in_leaf=None, min_child_samples=None, max_leaves=None, num_leaves=None, score_function=None, leaf_estimation_backtracking=None, ctr_history_unit=None, monotone_constraints=None) ##------NGBoost---------- # import packages import pandas as pd from ngboost.ngboost import NGBoost from ngboost.learners import 
default_tree_learner from ngboost.distns import Normal import ngboost.scores from MLE import lightgbm as lgb import xgboost as xgb from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error from math import sqrt # read the dataset df = pd.read_csv('~/train.csv') # feature engineering tr, te = Nanashi_solution(df) # NGBoost ngb = NGBoost(Base=default_tree_learner, Dist=Normal, Score=MLE(), natural_gradient=True,verbose=False) ngboost = ngb.fit(np.asarray(tr.drop(['SalePrice'],1)), np.asarray(tr.SalePrice)) y_pred_ngb = pd.DataFrame(ngb.predict(te.drop(['SalePrice'],1))) # LightGBM ltr = lgb.Dataset(tr.drop(['SalePrice'],1),label=tr['SalePrice']) param = { 'bagging_freq': 5, 'bagging_fraction': 0.6, 'bagging_seed': 123, 'boost_from_average':'false', 'boost': 'gbdt', 'feature_fraction': 0.3, 'learning_rate': .01, 'max_depth': 3, 'metric':'rmse', 'min_data_in_leaf': 128, 'min_sum_hessian_in_leaf': 8, 'num_leaves': 128, 'num_threads': 8, 'tree_learner': 'serial', 'objective': 'regression', 'verbosity': -1, 'random_state':123, 'max_bin': 8, 'early_stopping_round':100 } lgbm = lgb.train(param,ltr,num_boost_round=10000,valid_sets= [(ltr)],verbose_eval=1000) y_pred_lgb = lgbm.predict(te.drop(['SalePrice'],1)) y_pred_lgb = np.where(y_pred>=.25,1,0) # XGBoost params = { 'max_depth': 4, 'eta': 0.01, 'objective':'reg:squarederror', 'eval_metric': ['rmse'], 'booster':'gbtree', 'verbosity':0, 'sample_type':'weighted', 'max_delta_step':4, 'subsample':.5, 'min_child_weight':100, 'early_stopping_round':50 } dtr, dte = xgb.DMatrix(tr.drop(['SalePrice'],1),label=tr.SalePrice), xgb.DMatrix(te.drop(['SalePrice'],1),label=te.SalePrice) num_round = 5000 xgbst = xgb.train(params,dtr,num_round,verbose_eval=500) y_pred_xgb = xgbst.predict(dte) # Check the results print('RMSE: NGBoost', round(sqrt(mean_squared_error(X_val.SalePrice,y_pred_ngb)),4)) print('RMSE: LGBM', round(sqrt(mean_squared_error(X_val.SalePrice,y_pred_lgbm)),4)) print('RMSE: 
XGBoost', round(sqrt(mean_squared_error(X_val.SalePrice,y_pred_xgb)),4)) # see the probability distributions by visualising Y_dists = ngb.pred_dist(X_val.drop(['SalePrice'],1)) y_range = np.linspace(min(X_val.SalePrice), max(X_val.SalePrice), 200) dist_values = Y_dists.pdf(y_range).transpose() # plot index 0 and 114 idx = 114 plt.plot(y_range,dist_values[idx]) plt.title(f"idx: {idx}") plt.tight_layout() plt.show() '''
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Load the high-energy particle-collision jet dataset.
train_data = pd.read_csv("jet_simple_data/simple_train_R04_jet.csv")
# Select the feature columns.
features = train_data[[
    'number_of_particles_in_this_jet', 'jet_px', 'jet_py', 'jet_pz',
    'jet_energy', 'jet_mass'
]]
# Random train/test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    features.values, train_data.label.values, test_size=0.2)
ngb = NGBoost(Base=default_tree_learner, Dist=Normal, Score=MLE(),
              natural_gradient=True, verbose=False)
# Fit.
ngb.fit(X_train, Y_train)
# Predict.
Y_preds = ngb.predict(X_test)
test_data = pd.read_csv("jet_simple_data/simple_test_R04_jet.csv")
features = test_data[[
    'number_of_particles_in_this_jet', 'jet_px', 'jet_py', 'jet_pz',
    'jet_energy', 'jet_mass'
]]
Y_test_data = ngb.predict(features)
# NOTE(review): mode "" is invalid — open() will raise ValueError; this
# should be "w".  "submmission.csv" also looks like a typo for
# "submission.csv".  Left as-is: the loop body is truncated in the visible
# source, so the block cannot be safely rewritten here.
with open("submmission.csv", "") as f:
    f.write("id,label\n")
    for jet_id, label in zip(test_data['jet_id'], Y_test_data):
# E presumably holds event indicators, so 1 - E is the censor rate — TODO
# confirm against the loader above.
print('== Dataset=%s X.shape=%s Censorship=%.4f' %
      (args.dataset, str(X.shape), np.mean(1 - E)))
# Repeat the experiment args.reps times with fresh random splits.
for itr in range(args.reps):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
                                                      test_size=0.2)
    ngb = NGBoost(Dist=eval(args.distn),
                  n_estimators=args.n_est,
                  learning_rate=args.lr,
                  natural_gradient=args.natural,
                  verbose=args.verbose,
                  minibatch_frac=1.0,
                  Base=base_name_to_learner[args.base],
                  Score=eval(args.score)())
    train_losses = ngb.fit(X_train, Y_train) #, X_val, Y_val)
    forecast = ngb.pred_dist(X_test)
    train_forecast = ngb.pred_dist(X_train)
    # Concordance uses negated mean time: higher risk = shorter survival.
    print('NGB score: %.4f (val), %.4f (train)' % (
        concordance_index_censored(Y_test['Event'], Y_test['Time'],
                                   -forecast.mean())[0],
        concordance_index_censored(Y_train['Event'], Y_train['Time'],
                                   -train_forecast.mean())[0]))
    #logger.tick(forecast, Y_test)
##
print("Models")
# --- MLP quantile baseline, wall-clock timed ---
start = datetime.now().timestamp()
qreg = MLPQuantile()
qreg.fit(X_train_std, y_train)
preds = qreg.predict(X_test_std)
end = datetime.now().timestamp()
# Targets were log1p-transformed upstream, presumably — the expm1-style
# inverse here suggests so; TODO confirm against the preprocessing.
results = evaluate((np.exp(preds)-1), (np.exp(y_test)-1).values)
results["duration"] = end-start
save_result([horizon, "MLP", results, 1], f"unit_{horizon}", folder)
# --- NGBoost, same protocol ---
start = datetime.now().timestamp()
ngb = NGBoost(Base=default_tree_learner, Dist=Normal, Score=MLE(),
              natural_gradient=True, verbose=True, n_estimators=1500)
ngb.fit(X_train_std, y_train.values)
Y_dists = ngb.pred_dist(X_test_std)
# Collect the 1st..99th predictive percentiles, one column per percentile.
a = pd.DataFrame()
for i in np.arange(1, 100):
    a[i] = Y_dists.ppf(i/100)
preds = a.values
end = datetime.now().timestamp()
results = evaluate((np.exp(preds)-1), (np.exp(y_test)-1).values)
results["duration"] = end-start
save_result([horizon, "NGBOOST", results, 1], f"unit_{horizon}", folder)
# Fit a Log* distribution with CRPS on positive simulated targets.
m, n = 1000, 50
# Noise distribution is selectable from the CLI.
if args.noise_dist == "Normal":
    noise = np.random.randn(*(m, 1))
elif args.noise_dist == "Laplace":
    noise = sp.stats.laplace.rvs(size=(m, 1))
beta = np.random.randn(n, 1)
X = np.random.randn(m, n) / np.sqrt(n)
Y = np.exp(X @ beta + 0.5 * noise)  # exp() keeps targets positive
print(X.shape, Y.shape)
dist = eval("Log" + args.dist)  # e.g. "Normal" -> LogNormal
ngb = NGBoost(n_estimators=50, learning_rate=0.5,
              Dist=dist,
              Base=default_linear_learner,
              natural_gradient=False,
              minibatch_frac=1.0,
              Score=CRPS())
losses = ngb.fit(X, Y)
preds = ngb.pred_dist(X)
# preds.loc is on the log scale; exponentiate to compare with Y.
print(f"R2: {r2_score(Y, np.exp(preds.loc)):.4f}")
pctles, observed, slope, intercept = calibration_regression(preds, Y)
plt.figure(figsize=(8, 3))
plt.subplot(1, 2, 1)
plot_pit_histogram(pctles, observed)
plt.title("Original scale")
argparser.add_argument("--noise-dist", type=str, default="Normal")
args = argparser.parse_args()
m, n = 1000, 50
# Noise distribution is selectable from the CLI.
if args.noise_dist == "Normal":
    noise = np.random.randn(*(m, 1))
elif args.noise_dist == "Laplace":
    noise = sp.stats.laplace.rvs(size=(m, 1))
beta = np.random.randn(n, 1)
X = np.random.randn(m, n) / np.sqrt(n)
Y = X @ beta + 0.5 * noise + 20  # +20 shifts the target mean off zero
print(X.shape, Y.shape)
# NOTE(review): eval(args.dist) trusts the CLI string — fine for a demo,
# unsafe on untrusted input.
ngb = NGBoost(n_estimators=100, learning_rate=1.,
              Dist=eval(args.dist),
              Base=default_linear_learner,
              natural_gradient=True,
              minibatch_frac=1.0,
              Score=MLE())
ngb.fit(X, Y)
preds = ngb.pred_dist(X)  # in-sample predictive distributions
print(f"R2: {r2_score(Y, preds.loc):.4f}")
pctles, observed, slope, intercept = calibration_regression(preds, Y)
print(observed)
plt.figure(figsize = (8, 3))
plt.subplot(1, 2, 1)
plot_calibration_curve(pctles, observed)
plt.subplot(1, 2, 2)
plot_pit_histogram(pctles, observed)
plt.tight_layout()
argparser.add_argument("--n-estimators", type=int, default=301)
argparser.add_argument("--lr", type=float, default=0.03)
argparser.add_argument("--minibatch-frac", type=float, default=0.1)
argparser.add_argument("--natural", action="store_true")
args = argparser.parse_args()
x_tr, y_tr, _ = gen_data(n=50)
# Degree-1 polynomial features: a bias column plus the raw feature(s).
poly_transform = PolynomialFeatures(1)
x_tr = poly_transform.fit_transform(x_tr)
ngb = NGBoost(
    Base=default_tree_learner,
    Dist=Normal,
    Score=MLE,
    n_estimators=args.n_estimators,
    learning_rate=args.lr,
    natural_gradient=args.natural,
    minibatch_frac=args.minibatch_frac,
    verbose=True,
)
ngb.fit(x_tr, y_tr)
# Test beyond the training range (bound=1.3) to probe extrapolation —
# presumably gen_data's bound widens the input interval; TODO confirm.
x_te, y_te, _ = gen_data(n=1000, bound=1.3)
x_te = poly_transform.transform(x_te)
preds = ngb.pred_dist(x_te)
pctles, obs, _, _ = calibration_regression(preds, y_te)
# Per-stage predictive distributions; the last equals the full model.
all_preds = ngb.staged_pred_dist(x_te)
preds = all_preds[-1]
argparser.add_argument("--noise-lvl", type=float, default=0.25)
argparser.add_argument("--distn", type=str, default="Normal")
argparser.add_argument("--natural", action="store_true")
argparser.add_argument("--score", type=str, default="CRPS")
args = argparser.parse_args()
m, n = 1200, 50
noise = np.random.randn(*(m, 1))
beta1 = np.random.randn(n, 1)
beta2 = np.random.randn(n, 1)
X = np.random.randn(m, n) / np.sqrt(n)
# Heteroskedastic noise: scale varies with X through beta2.
Y = X @ beta1 + args.noise_lvl * np.sqrt(np.exp(X @ beta2)) * noise
print(X.shape, Y.shape)
# First 1000 rows train, remaining 200 test.
X_train, X_test = X[:1000, :], X[1000:, ]
Y_train, Y_test = Y[:1000], Y[1000:]
ngb = NGBoost(n_estimators=150, learning_rate=args.lr,
              Dist=Laplace,
              Base=default_linear_learner,
              natural_gradient=args.natural,
              minibatch_frac=1.0,
              Score=eval(args.score)())
losses = ngb.fit(X_train, Y_train)
forecast = ngb.pred_dist(X_test)
# Record test-set metrics via the experiment logger.
logger = RegressionLogger(args)
logger.tick(forecast, Y_test)
logger.save()