Example #1
        ngb = NGBoost(Dist=eval(args.distn),
                      n_estimators=args.n_est,
                      learning_rate=args.lr,
                      natural_gradient=args.natural,
                      verbose=args.verbose,
                      minibatch_frac=1.0,
                      Base=base_name_to_learner[args.base],
                      Score=eval(args.score)())

        train_losses = ngb.fit(X_train, Y_train)  #, X_val, Y_val)
        forecast = ngb.pred_dist(X_test)
        train_forecast = ngb.pred_dist(X_train)
        print('NGB score: %.4f (val), %.4f (train)' %
              (concordance_index_censored(Y_test['Event'], Y_test['Time'],
                                          -forecast.mean())[0],
               concordance_index_censored(Y_train['Event'], Y_train['Time'],
                                          -train_forecast.mean())[0]))
        #logger.tick(forecast, Y_test)

        ##
        ## sksurv
        ##
        gbsa = GBSA(n_estimators=args.n_est,
                    learning_rate=args.lr,
                    subsample=args.minibatch_frac,
                    verbose=args.verbose)
        gbsa.fit(X_train, Y_train)
        print('GBSA score: %.4f (val), %.4f (train)' %
              (gbsa.score(X_test, Y_test), gbsa.score(X_train, Y_train)))

    #logger.save()
Example #2
    def test_max_features(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        model = GradientBoostingSurvivalAnalysis(n_estimators=10,
                                                 max_features="auto",
                                                 max_depth=3,
                                                 random_state=0)
        model.fit(whas500_data.x, whas500_data.y)

        assert model.max_features_ == whas500_data.x.shape[1]

        model.set_params(max_features="sqrt")
        model.fit(whas500_data.x, whas500_data.y)
        assert round(
            abs(model.max_features_ -
                int(numpy.sqrt(whas500_data.x.shape[1]))), 7) == 0

        model.set_params(max_features="log2")
        model.fit(whas500_data.x, whas500_data.y)
        assert round(
            abs(model.max_features_ -
                int(numpy.log2(whas500_data.x.shape[1]))), 7) == 0

        model.set_params(max_features=0.25)
        model.fit(whas500_data.x, whas500_data.y)
        assert round(
            abs(model.max_features_ - int(0.25 * whas500_data.x.shape[1])),
            7) == 0

        model.set_params(max_features=5)
        model.fit(whas500_data.x, whas500_data.y)
        assert round(abs(model.max_features_ - 5), 7) == 0

        model.set_params(max_features=-1)
        with pytest.raises(ValueError,
                           match=r"max_features must be in \(0, n_features\]"):
            model.fit(whas500_data.x, whas500_data.y)

        model.set_params(max_features=-1.125)
        with pytest.raises(ValueError,
                           match=r"max_features must be in \(0, 1.0\]"):
            model.fit(whas500_data.x, whas500_data.y)

        model.set_params(max_features="fail_me")
        with pytest.raises(ValueError,
                           match="Invalid value for max_features: 'fail_me'. "
                           "Allowed string values are 'auto', 'sqrt' "
                           "or 'log2'"):
            model.fit(whas500_data.x, whas500_data.y)
Example #3
    E = df['LapseIndicator'].apply(lambda x: True if x == 1 else False)

    df2['E'] = E
    df2['T'] = T

    X, y = get_x_y(df2, ['E', 'T'], pos_label=True)

    for c in X.columns.values:
        if c != 'AGE AT DOC':
            X[c] = X[c].astype('category')

    data_x_numeric = OneHotEncoder().fit_transform(X)
    #%%

    estimator = GradientBoostingSurvivalAnalysis(verbose=True,
                                                 n_estimators=500)
    estimator.fit(data_x_numeric, y)

    print(estimator.score(data_x_numeric, y))
    print()

    scores = fit_and_score_features(data_x_numeric.values, y)
    print(
        pd.Series(scores,
                  index=data_x_numeric.columns).sort_values(ascending=False))

    pickle.dump(estimator, open('GradientRegressor.pkl', 'wb'))

    #%%

    from sklearn.feature_selection import SelectKBest
Example #4
    def test_squared_loss_staged_predict(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        # Test whether staged decision function eventually gives
        # the same prediction.
        model = GradientBoostingSurvivalAnalysis(loss="squared",
                                                 n_estimators=100,
                                                 max_depth=3,
                                                 random_state=0)
        model.fit(whas500_data.x, whas500_data.y)

        y_pred = model.predict(whas500_data.x)

        # test if prediction for last stage equals ``predict``
        for y in model.staged_predict(whas500_data.x):
            assert y.shape == y_pred.shape

        assert_array_equal(y_pred, y)

        model.set_params(dropout_rate=0.03)
        model.fit(whas500_data.x, whas500_data.y)

        y_pred = model.predict(whas500_data.x)

        # test if prediction for last stage equals ``predict``
        for y in model.staged_predict(whas500_data.x):
            assert y.shape == y_pred.shape

        assert_array_equal(y_pred, y)
Example #5
    def test_squared_loss_staged_predict(self):
        # Test whether staged decision function eventually gives
        # the same prediction.
        model = GradientBoostingSurvivalAnalysis(loss="squared",
                                                 n_estimators=100,
                                                 max_depth=3,
                                                 random_state=0)
        model.fit(self.x, self.y)

        y_pred = model.predict(self.x)

        # test if prediction for last stage equals ``predict``
        for y in model.staged_predict(self.x):
            self.assertTupleEqual(y.shape, y_pred.shape)

        assert_array_equal(y_pred, y)

        model.set_params(dropout_rate=0.03)
        model.fit(self.x, self.y)

        y_pred = model.predict(self.x)

        # test if prediction for last stage equals ``predict``
        for y in model.staged_predict(self.x):
            self.assertTupleEqual(y.shape, y_pred.shape)

        assert_array_equal(y_pred, y)
Example #6
    def test_fit_verbose(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        model = GradientBoostingSurvivalAnalysis(n_estimators=10, verbose=1, random_state=0)
        model.fit(whas500_data.x, whas500_data.y)
Example #7
    def test_fit_verbose(self):
        model = GradientBoostingSurvivalAnalysis(n_estimators=10,
                                                 verbose=1,
                                                 random_state=0)
        model.fit(self.x, self.y)
Example #8
    def test_max_features(self):
        model = GradientBoostingSurvivalAnalysis(n_estimators=10,
                                                 max_features="auto",
                                                 max_depth=3,
                                                 random_state=0)
        model.fit(self.x, self.y)

        self.assertEqual(model.max_features_, self.x.shape[1])

        model.set_params(max_features="sqrt")
        model.fit(self.x, self.y)
        self.assertAlmostEqual(model.max_features_,
                               int(numpy.sqrt(self.x.shape[1])))

        model.set_params(max_features="log2")
        model.fit(self.x, self.y)
        self.assertAlmostEqual(model.max_features_,
                               int(numpy.log2(self.x.shape[1])))

        model.set_params(max_features=0.25)
        model.fit(self.x, self.y)
        self.assertAlmostEqual(model.max_features_,
                               int(0.25 * self.x.shape[1]))

        model.set_params(max_features=5)
        model.fit(self.x, self.y)
        self.assertAlmostEqual(model.max_features_, 5)

        model.set_params(max_features=-1)
        self.assertRaisesRegex(ValueError,
                               r"max_features must be in \(0, n_features\]",
                               model.fit, self.x, self.y)

        model.set_params(max_features=-1.125)
        self.assertRaisesRegex(ValueError,
                               r"max_features must be in \(0, 1.0\]",
                               model.fit, self.x, self.y)

        model.set_params(max_features="fail_me")
        self.assertRaisesRegex(
            ValueError, "Invalid value for max_features: 'fail_me'. "
            "Allowed string values are 'auto', 'sqrt' "
            "or 'log2'", model.fit, self.x, self.y)
Example #9
        ngb = NGBoost(Dist=eval(args.distn),
                      n_estimators=args.n_est,
                      learning_rate=args.lr,
                      natural_gradient=args.natural,
                      verbose=args.verbose,
                      minibatch_frac=1.0,
                      Base=base_name_to_learner[args.base],
                      Score=eval(args.score))

        train_losses = ngb.fit(X_train, Y_train) #, X_val, Y_val)
        forecast = ngb.pred_dist(X_test)
        train_forecast = ngb.pred_dist(X_train)
        print('NGB score: %.4f (val), %.4f (train)' % (concordance_index_censored(Y_test['Event'], Y_test['Time'], -forecast.mean())[0],
                                                       concordance_index_censored(Y_train['Event'], Y_train['Time'], -train_forecast.mean())[0]
        ))
        #logger.tick(forecast, Y_test)

        ##
        ## sksurv
        ##
        gbsa = GBSA(n_estimators=args.n_est,
                    learning_rate=args.lr,
                    subsample=args.minibatch_frac,
                    verbose=args.verbose)
        gbsa.fit(X_train, Y_train)
        print('GBSA score: %.4f (val), %.4f (train)' % (gbsa.score(X_test, Y_test),
                                                        gbsa.score(X_train, Y_train)))


    #logger.save()
Example #10
            verbose=args.verbose,
            minibatch_frac=1.0,
            Base=base_name_to_learner[args.base],
            Score=eval(args.score),
        )

        train_losses = ngb.fit(X_train, Y_train, E_train)
        forecast = ngb.pred_dist(X_test)
        train_forecast = ngb.pred_dist(X_train)
        print("NGB score: %.4f (val), %.4f (train)" % (
            concordance_index_censored(E_test.astype(bool), Y_test,
                                       -forecast.mean())[0],
            concordance_index_censored(E_train.astype(bool), Y_train,
                                       -train_forecast.mean())[0],
        ))

        ##
        ## sksurv
        ##
        gbsa = GBSA(
            n_estimators=args.n_est,
            learning_rate=args.lr,
            subsample=args.minibatch_frac,
            verbose=args.verbose,
        )
        gbsa.fit(X_train, Y_join(Y_train, E_train))
        print("GBSA score: %.4f (val), %.4f (train)" % (
            gbsa.score(X_test, Y_join(Y_test, E_test)),
            gbsa.score(X_train, Y_join(Y_train, E_train)),
        ))
Example #11
def RandomGridSearchRFC_Fixed(X, Y, splits, model, survival):
    """
    This function looks for the best set of parameters for the chosen method.
    Input:
        X: training set
        Y: labels of training set
        splits: cross-validation splits, used to make sure the parameters are stable
        model: one of 'svm', 'cart', 'rf', 'xgboost', 'lr', 'cox', 'survSVM', 'gb'
        survival: if True, score with the concordance-index (CI) scorer instead of 'roc_auc'
    Output:
        clf.best_params_: dictionary with the best parameters, e.g. param_svm['kernel']
        clf: the fitted GridSearchCV object
    """

    start_svm = time.time()

    if model == 'svm':
        clf = svm.SVC()

        tuned_parameters = {
        'C': ([0.01, 1, 10]),
         'kernel': (['rbf', 'linear']),
        # 'kernel': (['linear', 'rbf', 'sigmoid']),
        # 'degree': ([1,3,5,10]),
        # 'decision_function_shape' : (['ovo', 'ovr']),
        # 'cache_size': ([500,1000,1500,2000]),
        'shrinking': ([False, True]),
        # 'probability': ([False, True])
        }
    
    if model == 'cart':
        clf = tree.DecisionTreeClassifier()

        tuned_parameters = {
        'criterion': (['gini', 'entropy']),
        'max_depth': ([10,20]),
        'min_samples_split': ([2,3,5]),
        'min_samples_leaf': ([2,3,5]),
        }

    if model == 'rf':
        clf = ensemble.RandomForestClassifier()
 
        tuned_parameters = {
        'n_estimators': ([200,500,1000]),
        # 'max_features': (['auto', 'sqrt', 'log2',1,4,8]),
        'max_depth':    ([10,20]),
        # 'criterion':    (['gini', 'entropy']),
        'min_samples_split':  [2,3,5],
        'min_samples_leaf':   [2,3,5],
        }
        
    if model == 'xgboost':
        clf = XGBClassifier()

        tuned_parameters = {
        'booster': (['gbtree']),
        'max_depth':   ([5,10,20]),
        'reg_lambda': ([0,1]),
        'reg_alpha': ([0,1]),
        'subsample': ([0.5,1])
        }

    if model == 'lr':
        clf = linear_model.LogisticRegression()

        tuned_parameters = {
        'solver': (['liblinear', 'sag', 'saga'])
        }

    if model == 'cox':
       
        clf =  CoxnetSurvivalAnalysis()
        tuned_parameters = {
        'n_alphas': ([50,100,200]),
        'l1_ratio': ([0.1,0.5,1]),

        }

    if model == 'survSVM':
        clf = FastSurvivalSVM()
        
        tuned_parameters = {
        'alpha': ([0.5,1]),
        'rank_ratio': ([0.5,1]),
        'max_iter': ([20,40,80]),
        'optimizer': (['rbtree', 'avltree']),
        }

    if model == 'gb':
        clf = GradientBoostingSurvivalAnalysis()
       
        tuned_parameters = {
        'learning_rate': ([0.1, 0.3]),
        'n_estimators': ([100,200,400]),
        'max_depth': ([3,6,12])        
        }

    
    if survival:
        # survival labels are structured arrays; stratify folds on the event indicator
        scoring = make_scorer(CI, greater_is_better=True)
        y_for_cv = np.array([t[0] for t in Y])
        cv = list(StratifiedKFold(n_splits=2).split(X, y_for_cv))  # x-validation

    else:
        cv = list(StratifiedKFold(n_splits=2).split(X, Y))  # x-validation
        scoring = 'roc_auc'

    print('  ...performing x-validation')

    clf = GridSearchCV(clf, tuned_parameters, scoring=scoring, cv=cv, verbose=10)
    # clf = BayesSearchCV(clf, tuned_parameters, n_iter=50, cv=splits,
    #                 optimizer_kwargs=dict(acq_func='LCB', base_estimator='RF'))

    clf.fit(X, Y)

    end_svm = time.time()
    print("Total time to process: ",end_svm - start_svm)
  
    return(clf.best_params_,clf)
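
A minimal usage sketch for the function above, assuming it is importable from the surrounding module and that the CI scoring function it relies on is defined there; the WHAS500 loader and OneHotEncoder from scikit-survival are illustrative choices, not part of the original script:

# Hedged usage sketch: assumes RandomGridSearchRFC_Fixed and its CI scorer are
# available in the current module; the dataset is illustrative only.
from sksurv.datasets import load_whas500
from sksurv.preprocessing import OneHotEncoder

X, Y = load_whas500()                 # Y is a structured array (event, time)
X = OneHotEncoder().fit_transform(X)  # expand categorical columns

# 'gb' selects GradientBoostingSurvivalAnalysis; survival=True switches to the CI scorer.
# The `splits` argument is only consumed by the commented-out BayesSearchCV branch.
best_params, search = RandomGridSearchRFC_Fixed(X, Y, splits=2,
                                                model='gb', survival=True)
print(best_params)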