def test_classification(self):
    """Smoke test: an NGBoost Bernoulli classifier reaches AUC >= 0.95
    on the sklearn breast-cancer dataset (fixed split, random_state=42).
    """
    # Fix: return_X_y must be passed by keyword — the positional form was
    # deprecated in scikit-learn 0.23 and later removed.
    data, target = load_breast_cancer(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    ngb = NGBoost(Base=default_tree_learner, Dist=Bernoulli, Score=MLE, verbose=False)
    ngb.fit(x_train, y_train)
    preds = ngb.pred_dist(x_test)
    score = roc_auc_score(y_test, preds.prob)
    assert score >= 0.95
def test_regression(self):
    """Smoke test: an NGBoost Normal regressor reaches MSE <= 8.0
    on the Boston housing dataset (fixed split, random_state=42).
    """
    # Fix: return_X_y must be passed by keyword (positional form deprecated
    # in scikit-learn 0.23). NOTE(review): load_boston was removed entirely
    # in scikit-learn 1.2 — this test only runs on older versions.
    data, target = load_boston(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    ngb = NGBoost(
        Base=default_tree_learner,
        Dist=Normal,
        Score=MLE,
        natural_gradient=True,
        verbose=False,
    )
    ngb.fit(x_train, y_train)
    preds = ngb.predict(x_test)
    score = mean_squared_error(y_test, preds)
    assert score <= 8.0
def ngb_impute(estimator, X, Y):
    """Impute censored survival times with NGBoost point predictions.

    Fits an NGBoost model of distribution class `estimator` on (X, Y), then
    for every censored row (Y['Event'] == 0) replaces Y['Time'] with
    exp(predicted location) from the fitted predictive distribution.

    Parameters:
        estimator: NGBoost distribution class (passed as `Dist`).
        X: feature matrix.
        Y: structured array with 'Event' and 'Time' fields.

    Returns a copy of Y with imputed times; the input array is not mutated.
    Relies on the module-level LEARNER name ("tree" or "linear") to choose
    the base learner.
    """
    base_name_to_learner = {
        "tree": default_tree_learner,
        "linear": default_linear_learner,
    }
    ngb = NGBoost(
        Dist=estimator,
        n_estimators=200,
        learning_rate=0.05,
        natural_gradient=True,
        verbose=False,
        minibatch_frac=1.0,
        Base=base_name_to_learner[LEARNER],
        Score=MLE,
    )
    train = ngb.fit(X, Y)
    Y_imputed = np.copy(Y)
    cens_mask = (Y['Event'] == 0)
    pred_dists = train.pred_dist(X[cens_mask])
    # Some distributions expose a 2-D `loc` (one column per parameter set);
    # fall back to the raw array when it is 1-D and slicing raises.
    try:
        outputs = pred_dists.loc[:, 0]
    except IndexError:
        outputs = pred_dists.loc
    # Predictions are on the log scale; exponentiate back to time units.
    Y_imputed['Time'][cens_mask] = np.exp(outputs)
    return Y_imputed
# Degree-1 polynomial features: prepends a bias column, so column 1 below is
# the raw input feature (used for plotting).
poly_transform = PolynomialFeatures(1)
x_tr = poly_transform.fit_transform(x_tr)
ngb = NGBoost(
    Base=default_tree_learner,
    Dist=Normal,
    Score=MLE,
    n_estimators=args.n_estimators,
    learning_rate=args.lr,
    natural_gradient=args.natural,
    minibatch_frac=args.minibatch_frac,
    verbose=True,
)
ngb.fit(x_tr, y_tr)
# Test data spans a wider range than training (bound=1.3) to show
# extrapolation behaviour.
x_te, y_te, _ = gen_data(n=1000, bound=1.3)
x_te = poly_transform.transform(x_te)
preds = ngb.pred_dist(x_te)
pctles, obs, _, _ = calibration_regression(preds, y_te)
# staged_pred_dist yields one distribution per boosting stage; the last one
# corresponds to the fully-fit model, so this `preds` matches pred_dist above.
all_preds = ngb.staged_pred_dist(x_te)
preds = all_preds[-1]
plt.figure(figsize=(6, 3))
plt.scatter(x_tr[:, 1], y_tr, color="black", marker=".", alpha=0.5)
plt.plot(
    x_te[:, 1],
    preds.loc,
    color="black",
# Hold out 20% of the training pool for validation-based model selection.
X_train, X_val, y_train, y_val = train_test_split(X_trainall,
                                                  y_trainall,
                                                  test_size=0.2)
y_true += list(y_test.flatten())
# NOTE(review): eval() on CLI strings picks the distribution/score class —
# acceptable for a research script, unsafe on untrusted input.
ngb = NGBoost(Base=base_name_to_learner[args.base],
              Dist=eval(args.distn),
              Score=score_name_to_score[args.score](64),
              n_estimators=args.n_est,
              learning_rate=args.lr,
              natural_gradient=args.natural,
              minibatch_frac=args.minibatch_frac,
              verbose=args.verbose)
train_loss, val_loss = ngb.fit(X_train, y_train)  #, X_val, y_val)
# Per-stage predictions on the validation set, scored two ways.
y_preds = ngb.staged_predict(X_val)
y_forecasts = ngb.staged_pred_dist(X_val)
val_rmse = [mean_squared_error(y_pred, y_val) for y_pred in y_preds]
val_nll = [-y_forecast.logpdf(y_val.flatten()).mean() for y_forecast in y_forecasts]
# NOTE(review): the RMSE-based best_itr is dead code — it is immediately
# overwritten, so only the NLL criterion actually selects the iteration.
best_itr = np.argmin(val_rmse) + 1
best_itr = np.argmin(val_nll) + 1
full_retrain = True
if full_retrain:
    ngb = NGBoost(Base=base_name_to_learner[args.base],
                  Dist=eval(args.distn),
                  Score=score_name_to_score[args.score](64),
                  n_estimators=args.n_est,
                  learning_rate=args.lr,
"""Minimal NGBoost classification demo on the sklearn breast-cancer dataset."""
from ngboost.ngboost import NGBoost
from ngboost.distns import Bernoulli
from ngboost.learners import default_tree_learner
from ngboost.scores import MLE
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":
    # Fix: return_X_y must be passed by keyword — the positional form was
    # deprecated in scikit-learn 0.23 and later removed.
    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    ngb = NGBoost(Base=default_tree_learner, Dist=Bernoulli, Score=MLE(), verbose=True)
    ngb.fit(X_train, Y_train)
    preds = ngb.pred_dist(X_test)
    # preds.prob is the predicted P(Y=1) for each test row.
    print("ROC:", roc_auc_score(Y_test, preds.prob))
# Censoring times: linear in X plus Gaussian noise, shifted by args.eps to
# control the overall censoring rate.
T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1)) + args.eps
C = (T < Y).astype(int)  # 1 = censored before the event
print(X.shape, Y.shape, C.shape)
print(f"Censorship: {np.mean(C):.2f}")
X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
    X, Y, T, C, test_size=0.2)
ngb = NGBoost(Dist=LogNormal,
              n_estimators=args.n_estimators,
              learning_rate=args.lr,
              natural_gradient=False,
              Base=default_linear_learner,
              Score=MLE())
# Observed time is min(event, censoring); exp() moves it to the positive
# scale that LogNormal models, and C marks which rows are censored.
train_losses = ngb.fit(X_tr, Y_join(np.exp(np.minimum(Y_tr, T_tr)), C_tr))
preds = ngb.pred_dist(X_te)
# log() undoes the exp() above so R2 is computed on the original scale.
print(f"R2: {r2_score(Y_te, np.log(preds.mean()))}")
plt.hist(preds.mean(), range=(-5, 5), bins=30, alpha=0.5, label="Pred")
plt.hist(Y_te, range=(-5, 5), bins=30, alpha=0.5, label="True")
plt.legend()
plt.show()
# since we simulated the data we fully observe all outcomes
pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
plot_calibration_curve(pctles, observed)
plt.show()
pctles, observed, slope, intercept = calibration_time_to_event(
argparser.add_argument("--distn", type=str, default="Normal")
argparser.add_argument("--natural", action="store_true")
argparser.add_argument("--score", type=str, default="CRPS")
args = argparser.parse_args()

np.random.seed(123)
# Simulated linear-Gaussian data: m rows, n features, noise level from CLI.
m, n = 1200, 50
noise = np.random.randn(*(m, 1))
beta1 = np.random.randn(n, 1)
X = np.random.randn(m, n) / np.sqrt(n)
Y = X @ beta1 + args.noise_lvl * noise
print(X.shape, Y.shape)
# Fixed 1000/200 train/test split; rows are i.i.d. draws so no shuffling is
# needed before splitting.
X_train, X_test = X[:1000, :], X[1000:, ]
Y_train, Y_test = Y[:1000], Y[1000:]
# NOTE(review): eval() on the CLI score name — demo script only.
ngb = NGBoost(n_estimators=400,
              learning_rate=args.lr,
              Dist=Normal,
              Base=default_linear_learner,
              natural_gradient=args.natural,
              minibatch_frac=1.0,
              Score=eval(args.score)(),
              verbose=True,
              verbose_eval=10)
losses = ngb.fit(X_train, Y_train)
forecast = ngb.pred_dist(X_test)
# forecast.loc is the predicted mean of the Normal predictive distribution.
print("R2:", r2_score(Y_test, forecast.loc))
X = np.random.randn(m, n) / np.sqrt(n) # Y = X @ beta + 0.5 * noise Y = X @ beta1 + 0.5 * np.sqrt(np.exp(X @ beta2)) * noise print(X.shape, Y.shape) axis = np.linspace(0.0, 2, 200) plt.figure(figsize=(8, 3)) ngb = NGBoost(n_estimators=100, learning_rate=1.0, Dist=Normal, Base=default_linear_learner, natural_gradient=True, minibatch_frac=1.0, Score=CRPS()) ngb.fit(X, Y) preds = ngb.pred_dist(X) print(preds.scale.mean()) print(preds.scale.std()) pctles, observed, slope, intercept = calibration_regression(preds, Y) plt.subplot(1, 2, 1) plot_pit_histogram(pctles, observed, label="CRPS", linestyle="--") plt.subplot(1, 2, 2) plt.plot(axis, gaussian_kde(preds.scale)(axis), linestyle="--", color="black", label="CRPS") ngb = NGBoost(n_estimators=100,
    y_trainall,
    test_size=0.2)
y_true += list(y_test.flatten())
# NOTE(review): eval() on the CLI distribution name — research script only.
ngb = NGBoost(Base=base_name_to_learner[args.base],
              Dist=eval(args.distn),
              Score=score_name_to_score[args.score],
              n_estimators=args.n_est,
              learning_rate=args.lr,
              natural_gradient=args.natural,
              minibatch_frac=args.minibatch_frac,
              verbose=args.verbose)
#train_loss, val_loss
ngb.fit(X_train, y_train)  #, X_val, y_val)
# Per-stage predictions on the validation set, scored by RMSE and NLL.
y_preds = ngb.staged_predict(X_val)
y_forecasts = ngb.staged_pred_dist(X_val)
val_rmse = [mean_squared_error(y_pred, y_val) for y_pred in y_preds]
val_nll = [
    -y_forecast.logpdf(y_val.flatten()).mean() for y_forecast in y_forecasts
]
# NOTE(review): the RMSE-based best_itr is dead code — it is immediately
# overwritten, so only the NLL criterion selects the iteration count.
best_itr = np.argmin(val_rmse) + 1
best_itr = np.argmin(val_nll) + 1
full_retrain = True
if full_retrain:
    ngb = NGBoost(Base=base_name_to_learner[args.base],
                  Dist=eval(args.distn),
x_tr, y_tr, _ = gen_data(n=50)
# Degree-1 polynomial features: prepends a bias column, so column 1 below is
# the raw input feature (used for plotting).
poly_transform = PolynomialFeatures(1)
x_tr = poly_transform.fit_transform(x_tr)
ngb = NGBoost(Base=default_tree_learner,
              Dist=Normal,
              Score=MLE(),
              n_estimators=args.n_estimators,
              learning_rate=args.lr,
              natural_gradient=args.natural,
              minibatch_frac=args.minibatch_frac,
              verbose=True)
train_loss, val_loss = ngb.fit(x_tr, y_tr)
# Test data spans a wider range than training (bound=1.3) to show
# extrapolation behaviour.
x_te, y_te, _ = gen_data(n=1000, bound=1.3)
x_te = poly_transform.transform(x_te)
preds = ngb.pred_dist(x_te)
pctles, obs, _, _ = calibration_regression(preds, y_te)
# The last staged distribution equals the fully-fit pred_dist above.
all_preds = ngb.staged_pred_dist(x_te)
preds = all_preds[-1]
plt.figure(figsize=(6, 3))
plt.scatter(x_tr[:, 1], y_tr, color="black", marker=".", alpha=0.5)
plt.plot(x_te[:, 1],
         preds.loc,
         color="black",
         linestyle="-",
    Y,
    test_size=0.2)
# Second split: carve a validation set out of the training portion.
X_train, X_val, Y_train, Y_val = train_test_split(X_train,
                                                  Y_train,
                                                  test_size=0.2)
# NOTE(review): eval() on CLI strings picks the distribution/score class —
# research script only.
ngb = NGBoost(Dist=eval(args.distn),
              n_estimators=args.n_est,
              learning_rate=args.lr,
              natural_gradient=args.natural,
              verbose=args.verbose,
              minibatch_frac=1.0,
              Base=base_name_to_learner[args.base],
              Score=eval(args.score)())
train_losses = ngb.fit(X_train, Y_train)  #, X_val, Y_val)
forecast = ngb.pred_dist(X_test)
train_forecast = ngb.pred_dist(X_train)
# concordance_index_censored expects a risk score (higher = riskier), so the
# predicted mean survival time is negated.
print('NGB score: %.4f (val), %.4f (train)' %
      (concordance_index_censored(Y_test['Event'], Y_test['Time'],
                                  -forecast.mean())[0],
       concordance_index_censored(Y_train['Event'], Y_train['Time'],
                                  -train_forecast.mean())[0]))
#logger.tick(forecast, Y_test)

##
## sksurv
##
gbsa = GBSA(n_estimators=args.n_est,
            learning_rate=args.lr,
            subsample=args.minibatch_frac,
# --- MLP quantile-regression baseline --------------------------------------
start = datetime.now().timestamp()
qreg = MLPQuantile()
qreg.fit(X_train_std, y_train)
preds = qreg.predict(X_test_std)
end = datetime.now().timestamp()
# exp(x) - 1 inverts a log1p transform before scoring.
results = evaluate(np.exp(preds) - 1, (np.exp(y_test) - 1).values)
results["duration"] = end - start
save_result([horizon, "MLP", results, 1], f"unit_{horizon}", folder)

# --- NGBoost ---------------------------------------------------------------
start = datetime.now().timestamp()
ngb = NGBoost(
    Base=default_tree_learner,
    Dist=Normal,
    Score=MLE(),
    natural_gradient=True,
    verbose=True,
    n_estimators=1500,
)
ngb.fit(X_train_std, y_train.values)
Y_dists = ngb.pred_dist(X_test_std)
# One column per percentile 1..99 of the predictive distribution.
quantile_table = pd.DataFrame(
    {pct: Y_dists.ppf(pct / 100) for pct in np.arange(1, 100)}
)
preds = quantile_table.values
end = datetime.now().timestamp()
results = evaluate(np.exp(preds) - 1, (np.exp(y_test) - 1).values)
results["duration"] = end - start
save_result([horizon, "NGBOOST", results, 1], f"unit_{horizon}", folder)
# Laplace noise on the log scale, so Y = exp(linear + noise) is log-Laplace.
noise = sp.stats.laplace.rvs(size=(m, 1))
beta = np.random.randn(n, 1)
X = np.random.randn(m, n) / np.sqrt(n)
Y = np.exp(X @ beta + 0.5 * noise)
print(X.shape, Y.shape)

# First fit: the "Log" variant of the chosen distribution on the original
# (exponentiated) scale. NOTE(review): eval() of a CLI arg — demo script only.
dist = eval("Log" + args.dist)
ngb = NGBoost(n_estimators=50,
              learning_rate=0.5,
              Dist=dist,
              Base=default_linear_learner,
              natural_gradient=False,
              minibatch_frac=1.0,
              Score=CRPS())
losses = ngb.fit(X, Y)
preds = ngb.pred_dist(X)
# exp(loc) maps the fitted log-scale location back to the original scale.
print(f"R2: {r2_score(Y, np.exp(preds.loc)):.4f}")
pctles, observed, slope, intercept = calibration_regression(preds, Y)
plt.figure(figsize=(8, 3))
plt.subplot(1, 2, 1)
plot_pit_histogram(pctles, observed)
plt.title("Original scale")

# Second fit: log-transform Y and use the plain distribution for comparison.
Y = np.log(Y)
dist = eval(args.dist)
ngb = NGBoost(n_estimators=50,
# Censoring times: linear in X plus Gaussian noise, shifted by args.eps to
# control the overall censoring rate.
T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1)) + args.eps
C = (T < Y).astype(int)  # 1 = censored before the event
print(X.shape, Y.shape, C.shape)
print(f"Censorship: {np.mean(C):.2f}")
X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
    X, Y, T, C, test_size=0.2)
ngb = NGBoost(Dist=Laplace,
              n_estimators=args.n_estimators,
              learning_rate=args.lr,
              natural_gradient=False,
              Base=default_linear_learner,
              Score=MLE_SURV())
# Survival target: column 0 = observed time min(event, censor),
# column 1 = censoring indicator.
train_losses = ngb.fit(X_tr, np.c_[np.minimum(Y_tr, T_tr), C_tr])
preds = ngb.pred_dist(X_te)
print(f"R2: {r2_score(Y_te, preds.loc)}")
plt.hist(preds.loc, range=(-5, 5), bins=30, alpha=0.5, label="Pred")
plt.hist(Y_te, range=(-5, 5), bins=30, alpha=0.5, label="True")
plt.legend()
plt.show()
# since we simulated the data we fully observe all outcomes
pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
plot_calibration_curve(pctles, observed)
print(f"== Mean SD: {preds.scale.mean()}")
plt.show()