import numpy
import pytest
from sklearn.metrics import mean_squared_error
from sksurv.ensemble import GradientBoostingSurvivalAnalysis


def test_squared_loss(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)

    model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100, max_depth=3, random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    time_predicted = model.predict(whas500_data.x)
    time_true = whas500_data.y["lenfol"]
    event_true = whas500_data.y["fstat"]

    rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
    assert round(abs(rmse_all - 580.23345259002951), 7) == 0

    rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true], time_predicted[event_true]))
    assert round(abs(rmse_uncensored - 383.10639243317951), 7) == 0

    cindex = model.score(whas500_data.x, whas500_data.y)
    assert round(abs(cindex - 0.9021810004), 7) == 0

    # survival and cumulative hazard functions are only defined for the
    # partial-likelihood ("coxph") loss, so both calls must fail here
    with pytest.raises(ValueError, match="`fit` must be called with the loss option set to 'coxph'"):
        model.predict_survival_function(whas500_data.x)

    with pytest.raises(ValueError, match="`fit` must be called with the loss option set to 'coxph'"):
        model.predict_cumulative_hazard_function(whas500_data.x)
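# For contrast with the ValueError asserted above: with the default
# loss="coxph", both prediction functions are available. A minimal sketch
# reusing the fixture data (an illustration, not part of the test suite):
def sketch_coxph_predictions(whas500_data):
    coxph_model = GradientBoostingSurvivalAnalysis(loss="coxph", n_estimators=100, random_state=0)
    coxph_model.fit(whas500_data.x, whas500_data.y)
    surv_fns = coxph_model.predict_survival_function(whas500_data.x[:2])
    chf_fns = coxph_model.predict_cumulative_hazard_function(whas500_data.x[:2])
    return surv_fns, chf_fns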
import numpy as np
from sksurv.ensemble import GradientBoostingSurvivalAnalysis


def fit_and_score_features(X, y):
    """Fit one univariate model per feature and return its concordance index."""
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = GradientBoostingSurvivalAnalysis(verbose=True, n_estimators=500)
    for j in range(n_features):
        Xj = X[:, j:j + 1]  # single-column slice keeps the 2-D shape fit() expects
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores
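# A minimal usage sketch for fit_and_score_features, assuming sksurv's
# bundled WHAS500 dataset (illustration only; 500 estimators per feature
# makes this slow on larger data):
from sksurv.datasets import load_whas500
from sksurv.preprocessing import OneHotEncoder

X_demo, y_demo = load_whas500()
X_demo = OneHotEncoder().fit_transform(X_demo)  # expand categorical columns
per_feature_cindex = fit_and_score_features(X_demo.values, y_demo)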
def test_ipcwls_loss(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)

    model = GradientBoostingSurvivalAnalysis(loss="ipcwls", n_estimators=100, max_depth=3, random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    time_predicted = model.predict(whas500_data.x)
    time_true = whas500_data.y["lenfol"]
    event_true = whas500_data.y["fstat"]

    rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
    assert round(abs(rmse_all - 590.5441693629117), 7) == 0

    rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true], time_predicted[event_true]))
    assert round(abs(rmse_uncensored - 392.97741487479743), 7) == 0

    cindex = model.score(whas500_data.x, whas500_data.y)
    assert round(abs(cindex - 0.8979161399), 7) == 0
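# Background for loss="ipcwls": least squares reweighted by the inverse
# probability of censoring. A hedged sketch of how such weights can be
# obtained via sksurv's public API (illustration only; not asserting this
# is the estimator's internal code path):
from sksurv.datasets import load_whas500
from sksurv.nonparametric import CensoringDistributionEstimator

_, y_demo = load_whas500()
ipcw = CensoringDistributionEstimator().fit(y_demo).predict_ipcw(y_demo)
# censored records receive weight 0; events are weighted by 1 / G(t_i)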
X_train, X_val, Y_train, Y_val, E_train, E_val = train_test_split(
    X_train, Y_train, E_train, test_size=0.2
)

ngb = NGBSurvival(
    Dist=eval(args.distn),
    n_estimators=args.n_est,
    learning_rate=args.lr,
    natural_gradient=args.natural,
    verbose=args.verbose,
    minibatch_frac=1.0,
    Base=base_name_to_learner[args.base],
    Score=eval(args.score),
)
train_losses = ngb.fit(X_train, Y_train, E_train)
forecast = ngb.pred_dist(X_test)
train_forecast = ngb.pred_dist(X_train)
# note: the "(val)" figures below are actually computed on the test split
print('NGB score: %.4f (val), %.4f (train)' % (
    concordance_index_censored(E_test.astype(bool), Y_test, -forecast.mean())[0],
    concordance_index_censored(E_train.astype(bool), Y_train, -train_forecast.mean())[0],
))

##
## sksurv
##
gbsa = GBSA(
    n_estimators=args.n_est,
    learning_rate=args.lr,
    subsample=args.minibatch_frac,
    verbose=args.verbose,
)
gbsa.fit(X_train, Y_join(Y_train, E_train))
print('GBSA score: %.4f (val), %.4f (train)' % (
    gbsa.score(X_test, Y_join(Y_test, E_test)),
    gbsa.score(X_train, Y_join(Y_train, E_train)),
))
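# `Y_join` is called above but not defined in this snippet. A plausible
# sketch, given that sksurv estimators expect a structured array of
# (event indicator, time) pairs; the field names are an assumption here:
import numpy as np

def Y_join(T, E):
    y = np.empty(len(T), dtype=[('Event', np.bool_), ('Time', np.float64)])
    y['Event'] = np.asarray(E, dtype=bool)
    y['Time'] = np.asarray(T, dtype=float)
    return y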
# (this snippet begins mid-call: the constructor opening is not included,
# but `ngb` is an NGBoost survival model receiving the keyword arguments below)
    n_estimators=args.n_est,
    learning_rate=args.lr,
    natural_gradient=args.natural,
    verbose=args.verbose,
    minibatch_frac=1.0,
    Base=base_name_to_learner[args.base],
    Score=eval(args.score)(),
)
train_losses = ngb.fit(X_train, Y_train)  # , X_val, Y_val)
forecast = ngb.pred_dist(X_test)
train_forecast = ngb.pred_dist(X_train)
print('NGB score: %.4f (val), %.4f (train)' % (
    concordance_index_censored(Y_test['Event'], Y_test['Time'], -forecast.mean())[0],
    concordance_index_censored(Y_train['Event'], Y_train['Time'], -train_forecast.mean())[0],
))
# logger.tick(forecast, Y_test)

##
## sksurv
##
gbsa = GBSA(
    n_estimators=args.n_est,
    learning_rate=args.lr,
    subsample=args.minibatch_frac,
    verbose=args.verbose,
)
gbsa.fit(X_train, Y_train)
print('GBSA score: %.4f (val), %.4f (train)' % (
    gbsa.score(X_test, Y_test),
    gbsa.score(X_train, Y_train),
))
# logger.save()
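# The structured arrays indexed with 'Event' / 'Time' above can be built
# with sksurv's helper; field names are passed explicitly to match this script:
from sksurv.util import Surv
import numpy as np

y_example = Surv.from_arrays(
    event=np.array([True, False, True]),
    time=np.array([10.0, 25.0, 7.0]),
    name_event='Event',
    name_time='Time',
)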
df2['T'] = T
X, y = get_x_y(df2, ['E', 'T'], pos_label=True)

# every column except the numeric age is treated as categorical before encoding
for c in X.columns.values:
    if c != 'AGE AT DOC':
        X[c] = X[c].astype('category')

data_x_numeric = OneHotEncoder().fit_transform(X)

#%%
estimator = GradientBoostingSurvivalAnalysis(verbose=True, n_estimators=500)
estimator.fit(data_x_numeric, y)
print(estimator.score(data_x_numeric, y))
print()

scores = fit_and_score_features(data_x_numeric.values, y)
print(pd.Series(scores, index=data_x_numeric.columns).sort_values(ascending=False))

with open('GradientRegressor.pkl', 'wb') as f:
    pickle.dump(estimator, f)

#%%
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

pipe = Pipeline([('encode', OneHotEncoder()),