Example #1
    def test_squared_loss_staged_predict(self):
        # Test whether staged decision function eventually gives
        # the same prediction.
        model = GradientBoostingSurvivalAnalysis(loss="squared",
                                                 n_estimators=100,
                                                 max_depth=3,
                                                 random_state=0)
        model.fit(self.x, self.y)

        y_pred = model.predict(self.x)

        # test if prediction for last stage equals ``predict``
        for y in model.staged_predict(self.x):
            self.assertTupleEqual(y.shape, y_pred.shape)

        assert_array_equal(y_pred, y)

        model.set_params(dropout_rate=0.03)
        model.fit(self.x, self.y)

        y_pred = model.predict(self.x)

        # test if prediction for last stage equals ``predict``
        for y in model.staged_predict(self.x):
            self.assertTupleEqual(y.shape, y_pred.shape)

        assert_array_equal(y_pred, y)
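
Note: `staged_predict` yields the ensemble's prediction after each boosting iteration, so the array from the final iteration must coincide with `predict`; that is the invariant checked above, both with and without dropout. Below is a minimal self-contained sketch of the same idea on synthetic survival data invented here for illustration (only the estimator and the structured layout of `y` come from scikit-survival):

import numpy as np
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 5))
# scikit-survival expects a structured array: a boolean event indicator
# followed by the observed time.
y = np.empty(200, dtype=[("event", "?"), ("time", "f8")])
y["event"] = rng.binomial(1, 0.7, size=200).astype(bool)
y["time"] = rng.exponential(scale=np.exp(X[:, 0]))

model = GradientBoostingSurvivalAnalysis(n_estimators=25, random_state=0)
model.fit(X, y)

for pred in model.staged_predict(X):
    pass  # `pred` holds the risk scores after each successive stage
assert np.array_equal(pred, model.predict(X))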
Example #2
    def test_squared_loss_staged_predict(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        # Test whether staged decision function eventually gives
        # the same prediction.
        model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100, max_depth=3, random_state=0)
        model.fit(whas500_data.x, whas500_data.y)

        y_pred = model.predict(whas500_data.x)

        # test if prediction for last stage equals ``predict``
        for y in model.staged_predict(whas500_data.x):
            assert y.shape == y_pred.shape

        assert_array_equal(y_pred, y)

        model.set_params(dropout_rate=0.03)
        model.fit(whas500_data.x, whas500_data.y)

        y_pred = model.predict(whas500_data.x)

        # test if prediction for last stage equals ``predict``
        for y in model.staged_predict(whas500_data.x):
            assert y.shape == y_pred.shape

        assert_array_equal(y_pred, y)
Example #3
    def test_fit(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        model = GradientBoostingSurvivalAnalysis(n_estimators=100,
                                                 max_depth=3,
                                                 min_samples_split=10,
                                                 random_state=0)
        model.fit(whas500_data.x, whas500_data.y)

        assert model.max_features_ == 14
        assert not hasattr(model, "oob_improvement_")

        p = model.predict(whas500_data.x)

        assert_cindex_almost_equal(whas500_data.y['fstat'],
                                   whas500_data.y['lenfol'], p,
                                   (0.86272605091218779, 64826, 10309, 14, 14))

        assert (100, ) == model.train_score_.shape

        with pytest.raises(
                ValueError,
                match="Number of features of the model must match the input. "
                "Model n_features is 14 and input n_features is 2 "):
            model.predict(whas500_data.x[:, :2])
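
The five-element tuples passed to `assert_cindex_almost_equal` mirror the return value of `sksurv.metrics.concordance_index_censored`: the concordance index followed by the counts of concordant, discordant, risk-tied, and time-tied pairs. The unittest-style variants below (Examples #7 and #8) do the same comparison inline; the helper presumably reduces to a sketch like this (the function body here is an assumption, not the project's actual code):

import numpy
from numpy.testing import assert_array_almost_equal
from sksurv.metrics import concordance_index_censored

def assert_cindex_almost_equal(event, time, estimate, expected):
    # Returns (cindex, concordant, discordant, tied_risk, tied_time).
    result = concordance_index_censored(event, time, estimate)
    assert_array_almost_equal(numpy.array(result), numpy.array(expected))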
Example #4
    def test_fit_subsample(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        model = GradientBoostingSurvivalAnalysis(n_estimators=50, max_features=8, subsample=0.6,
                                                 presort=False, random_state=0)
        model.fit(whas500_data.x, whas500_data.y)

        assert model.max_features_ == 8
        assert hasattr(model, "oob_improvement_")

        incl_mask = numpy.ones(whas500_data.x.shape[0], dtype=bool)
        incl_mask[[35, 111, 174, 206, 236, 268, 497]] = False
        x_test = whas500_data.x[incl_mask]
        y_test = whas500_data.y[incl_mask]

        p = model.predict(x_test)

        assert_cindex_almost_equal(y_test['fstat'], y_test['lenfol'], p,
                                   (0.8330510326740247, 60985, 12221, 2, 110))

        assert (50,) == model.train_score_.shape
        assert (50,) == model.oob_improvement_.shape

        with pytest.raises(ValueError, match="Number of features of the model must match the input. "
                                             "Model n_features is 14 and input n_features is 2 "):
            model.predict(whas500_data.x[:, :2])
Example #5
    def test_dropout(whas500_sparse_data, loss):
        model = GradientBoostingSurvivalAnalysis(loss=loss, n_estimators=100, max_depth=1, min_samples_split=10,
                                                 dropout_rate=0.03, random_state=0)
        model.fit(whas500_sparse_data.x_sparse, whas500_sparse_data.y)

        assert model.estimators_.shape[0] == 100
        assert model.train_score_.shape == (100,)

        sparse_predict = model.predict(whas500_sparse_data.x_dense)

        model.fit(whas500_sparse_data.x_dense, whas500_sparse_data.y)
        dense_predict = model.predict(whas500_sparse_data.x_dense)

        assert_array_almost_equal(sparse_predict, dense_predict)
Example #6
    def test_dropout(self):
        for loss in ('coxph', 'squared', 'ipcwls'):
            model = GradientBoostingSurvivalAnalysis(loss=loss, n_estimators=100, max_depth=1, min_samples_split=10,
                                                     dropout_rate=0.03, random_state=0)
            model.fit(self.x_sparse, self.y)

            self.assertEqual(model.estimators_.shape[0], 100)
            self.assertTupleEqual(model.train_score_.shape, (100,))

            sparse_predict = model.predict(self.x_dense)

            model.fit(self.x_dense, self.y)
            dense_predict = model.predict(self.x_dense)

            assert_array_almost_equal(sparse_predict, dense_predict)
Example #7
    def test_fit_subsample(self):
        model = GradientBoostingSurvivalAnalysis(n_estimators=100,
                                                 max_features=8,
                                                 subsample=0.6,
                                                 random_state=0)
        model.fit(self.x, self.y)

        self.assertEqual(model.max_features_, 8)
        self.assertTrue(hasattr(model, "oob_improvement_"))

        incl_mask = numpy.ones(self.x.shape[0], dtype=bool)
        incl_mask[[35, 111, 174, 206, 236, 268, 497]] = False
        x_test = self.x[incl_mask]
        y_test = self.y[incl_mask]

        p = model.predict(x_test)

        expected_cindex = numpy.array([0.8592640, 62905, 10303, 0, 110])
        result = concordance_index_censored(y_test['fstat'], y_test['lenfol'],
                                            p)
        assert_array_almost_equal(expected_cindex, numpy.array(result))

        self.assertTupleEqual((100, ), model.train_score_.shape)
        self.assertTupleEqual((100, ), model.oob_improvement_.shape)

        self.assertRaisesRegex(
            ValueError,
            "Number of features of the model must match the input. "
            "Model n_features is 14 and input n_features is 2 ", model.predict,
            self.x[:, :2])
Example #8
    def test_fit(self):
        model = GradientBoostingSurvivalAnalysis(n_estimators=100,
                                                 max_depth=3,
                                                 min_samples_split=10,
                                                 random_state=0)
        model.fit(self.x, self.y)

        self.assertEqual(model.max_features_, 14)
        self.assertFalse(hasattr(model, "oob_improvement_"))

        p = model.predict(self.x)

        expected_cindex = numpy.array(
            [0.86272605091218779, 64826, 10309, 14, 119])
        result = concordance_index_censored(self.y['fstat'], self.y['lenfol'],
                                            p)
        assert_array_almost_equal(expected_cindex, numpy.array(result))

        self.assertTupleEqual((100, ), model.train_score_.shape)

        self.assertRaisesRegex(
            ValueError,
            "Number of features of the model must match the input. "
            "Model n_features is 14 and input n_features is 2 ", model.predict,
            self.x[:, :2])
Example #9
    def test_fit_int_param_as_float(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        if _sklearn_version_under_0p21:
            max_depth = 3
        else:
            # Account for https://github.com/scikit-learn/scikit-learn/pull/12344
            max_depth = 4

        model = GradientBoostingSurvivalAnalysis(n_estimators=100.0,
                                                 max_depth=float(max_depth),
                                                 min_samples_split=10.0,
                                                 random_state=0)
        params = model.get_params()
        assert 100 == params["n_estimators"]
        assert max_depth == params["max_depth"]
        assert 10 == params["min_samples_split"]

        model.set_params(max_leaf_nodes=15.0)
        assert 15 == model.get_params()["max_leaf_nodes"]

        model.fit(whas500_data.x, whas500_data.y)
        p = model.predict(whas500_data.x)

        assert_cindex_almost_equal(whas500_data.y['fstat'],
                                   whas500_data.y['lenfol'], p,
                                   (0.90256690042449006, 67826, 7321, 2, 14))
Example #10
    def test_squared_loss(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        model = GradientBoostingSurvivalAnalysis(loss="squared",
                                                 n_estimators=100,
                                                 max_depth=3,
                                                 random_state=0)
        model.fit(whas500_data.x, whas500_data.y)

        time_predicted = model.predict(whas500_data.x)
        time_true = whas500_data.y["lenfol"]
        event_true = whas500_data.y["fstat"]

        rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
        assert round(abs(rmse_all - 580.23345259002951), 7) == 0

        rmse_uncensored = numpy.sqrt(
            mean_squared_error(time_true[event_true],
                               time_predicted[event_true]))
        assert round(abs(rmse_uncensored - 383.10639243317951), 7) == 0

        cindex = model.score(whas500_data.x, whas500_data.y)
        assert round(abs(cindex - 0.9021810004), 7) == 0

        with pytest.raises(
                ValueError,
                match="`fit` must be called with the loss option set to 'coxph'"
        ):
            model.predict_survival_function(whas500_data.x)

        with pytest.raises(
                ValueError,
                match="`fit` must be called with the loss option set to 'coxph'"
        ):
            model.predict_cumulative_hazard_function(whas500_data.x)
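
As the `pytest.raises` blocks above show, `predict_survival_function` and `predict_cumulative_hazard_function` require a model fit with `loss="coxph"`: the squared and IPCW least-squares losses regress the observed time directly and provide no baseline hazard to build these functions from. A hedged sketch of the coxph path, reusing the synthetic-data pattern from the note after Example #1:

import numpy as np
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 3))
y = np.empty(100, dtype=[("event", "?"), ("time", "f8")])
y["event"] = rng.binomial(1, 0.7, size=100).astype(bool)
y["time"] = rng.exponential(scale=np.exp(X[:, 0]))

model = GradientBoostingSurvivalAnalysis(loss="coxph", n_estimators=25, random_state=0)
model.fit(X, y)

# Each element is a step function over time; evaluate it on a grid.
surv_fns = model.predict_survival_function(X[:2])
grid = np.percentile(y["time"], [10, 25, 50, 75])
probs = np.asarray([fn(grid) for fn in surv_fns])  # shape (2, 4), values in [0, 1]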
Example #11
    def test_squared_loss(self):
        model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100, max_depth=3, random_state=0)
        model.fit(self.x, self.y)

        time_predicted = model.predict(self.x)
        time_true = self.y["lenfol"]
        event_true = self.y["fstat"]

        rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
        self.assertAlmostEqual(rmse_all, 580.23345259002951)

        rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true], time_predicted[event_true]))
        self.assertAlmostEqual(rmse_uncensored, 383.10639243317951)
Example #12
    def test_ipcwls_loss(self):
        model = GradientBoostingSurvivalAnalysis(loss="ipcwls", n_estimators=100, max_depth=3, random_state=0)
        model.fit(self.x, self.y)

        time_predicted = model.predict(self.x)
        time_true = self.y["lenfol"]
        event_true = self.y["fstat"]

        rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
        self.assertAlmostEqual(rmse_all, 590.5441693629117)

        rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true], time_predicted[event_true]))
        self.assertAlmostEqual(rmse_uncensored, 392.97741487479743)
Example #13
    def test_fit_dropout(self):
        model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_features=8,
                                                 learning_rate=1.0, dropout_rate=0.03,
                                                 random_state=0)
        model.fit(self.x, self.y)

        self.assertFalse(hasattr(model, "oob_improvement_"))
        self.assertEqual(model.max_features_, 8)

        p = model.predict(self.x)

        expected_cindex = numpy.array([0.9094333, 68343, 6806, 0, 119])
        result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
        assert_array_almost_equal(expected_cindex, numpy.array(result))
Example #14
    def test_fit_dropout(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_features=8,
                                                 learning_rate=1.0, dropout_rate=0.03,
                                                 random_state=0)
        model.fit(whas500_data.x, whas500_data.y)

        assert not hasattr(model, "oob_improvement_")
        assert model.max_features_ == 8

        p = model.predict(whas500_data.x)

        assert_cindex_almost_equal(whas500_data.y['fstat'], whas500_data.y['lenfol'], p,
                                   (0.9094333, 68343, 6806, 0, 119))
Example #15
    def test_squared_loss(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100, max_depth=3, random_state=0)
        model.fit(whas500_data.x, whas500_data.y)

        time_predicted = model.predict(whas500_data.x)
        time_true = whas500_data.y["lenfol"]
        event_true = whas500_data.y["fstat"]

        rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
        assert round(abs(rmse_all - 580.23345259002951), 7) == 0

        rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true], time_predicted[event_true]))
        assert round(abs(rmse_uncensored - 383.10639243317951), 7) == 0
Example #16
    def test_ipcwls_loss(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        model = GradientBoostingSurvivalAnalysis(loss="ipcwls", n_estimators=100, max_depth=3, random_state=0)
        model.fit(whas500_data.x, whas500_data.y)

        time_predicted = model.predict(whas500_data.x)
        time_true = whas500_data.y["lenfol"]
        event_true = whas500_data.y["fstat"]

        rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
        assert round(abs(rmse_all - 590.5441693629117), 7) == 0

        rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true], time_predicted[event_true]))
        assert round(abs(rmse_uncensored - 392.97741487479743), 7) == 0
Example #17
    def test_fit_int_param_as_float(self):
        model = GradientBoostingSurvivalAnalysis(n_estimators=100.0, max_depth=3.0, min_samples_split=10.0,
                                                 random_state=0)
        params = model.get_params()
        self.assertEqual(100, params["n_estimators"])
        self.assertEqual(3, params["max_depth"])
        self.assertEqual(10, params["min_samples_split"])

        model.set_params(max_leaf_nodes=15.0)
        self.assertEqual(15, model.get_params()["max_leaf_nodes"])

        model.fit(self.x, self.y)
        p = model.predict(self.x)

        expected_cindex = numpy.array([0.90256690042449006, 67826, 7321, 2, 119])
        result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
        assert_array_almost_equal(expected_cindex, numpy.array(result))
Example #18
    def test_fit_int_param_as_float(make_whas500):
        whas500_data = make_whas500(with_std=False, to_numeric=True)

        model = GradientBoostingSurvivalAnalysis(n_estimators=100.0, max_depth=3.0, min_samples_split=10.0,
                                                 random_state=0)
        params = model.get_params()
        assert 100 == params["n_estimators"]
        assert 3 == params["max_depth"]
        assert 10 == params["min_samples_split"]

        model.set_params(max_leaf_nodes=15.0)
        assert 15 == model.get_params()["max_leaf_nodes"]

        model.fit(whas500_data.x, whas500_data.y)
        p = model.predict(whas500_data.x)

        assert_cindex_almost_equal(whas500_data.y['fstat'], whas500_data.y['lenfol'], p,
                                   (0.90256690042449006, 67826, 7321, 2, 119))
Example #19
    def test_fit_subsample(self):
        model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_features=8, subsample=0.6,
                                                 random_state=0)
        model.fit(self.x, self.y)

        self.assertEqual(model.max_features_, 8)
        self.assertTrue(hasattr(model, "oob_improvement_"))

        p = model.predict(self.x)

        expected_cindex = numpy.array([0.8610760, 64709, 10440, 0, 119])
        result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
        assert_array_almost_equal(expected_cindex, numpy.array(result))

        self.assertTupleEqual((100,), model.train_score_.shape)
        self.assertTupleEqual((100,), model.oob_improvement_.shape)

        self.assertRaisesRegex(ValueError, "Number of features of the model must match the input. "
                                           "Model n_features is 14 and input n_features is 2 ",
                               model.predict, self.x[:, :2])
Example #20
import os
import timeit

import joblib
import numpy as np
from sksurv.ensemble import GradientBoostingSurvivalAnalysis


def train_gbmsurv(population=None, plpData=None, train=True, modelOutput=None, loss='coxph', learning_rate=0.1, n_estimators=100, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_split=None, min_impurity_decrease=0.0, max_features=None, max_leaf_nodes=None, subsample=1.0, dropout_rate=0.0, verbose=0, seed=1, quiet=True):
  print("Training python scikit-survial GradientBoostingSurvivalAnalysis model" )
  ytype=np.dtype([('outcome', '?'), ('surv', 'i')])
  y=np.empty(len(population[:,1]),dtype=ytype)
  y['outcome']= population[:,1]>0
  y['surv']= population[:,2]
  X = plpData[population[:,0],:]
  trainInds =population[:,population.shape[1]-1] >0
  print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1]))
  print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1]))
  ###########################################################################
  if train:
    pred_size = int(np.sum(population[:,population.shape[1]-1] > 0))
    print("Calculating prediction for train set of size %s" %(pred_size))
    test_pred = np.zeros(pred_size)# zeros length sum(population[:,population.size[1]] ==i)
    for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1)):
      testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i
      trainInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i
      train_x = X[trainInds, :][trainInd, :]
      train_y = y[trainInds][trainInd]
      test_x = X[trainInds, :][testInd, :]
      print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0]))
      print("Train set contains %s outcomes" % np.sum(train_y['outcome']))
      print("Training fold %s" % i)
      start_time = timeit.default_timer()
      gbmsurv = GradientBoostingSurvivalAnalysis(loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, min_impurity_split=min_impurity_split, min_impurity_decrease=min_impurity_decrease, random_state=seed, max_features=max_features, max_leaf_nodes=max_leaf_nodes, subsample=subsample, dropout_rate=dropout_rate, verbose=verbose)
      gbmsurv = gbmsurv.fit(X=train_x, y=train_y)
      end_time = timeit.default_timer()
      print("Training fold took: %.2f s" %(end_time-start_time))
      print("Calculating predictions on left out fold set...")
      ind = (population[:,population.shape[1]-1] > 0)
      ind = population[ind,population.shape[1]-1]==i
      rowCount = np.sum(ind)
      temp_pred = gbmsurv.predict(test_x.toarray())
      temp_pred = temp_pred.flatten()
      temp_pred = temp_pred[0:(rowCount)]
      test_pred[ind] = temp_pred
      print("Prediction complete: %s rows " %(np.shape(test_pred[ind])[0]))
      print("Mean: %s prediction value" %(np.mean(test_pred[ind])))
    # merge pred with indexes[testInd,:]
    test_pred.shape = (population[population[:,population.shape[1]-1] > 0,:].shape[0], 1)
    prediction = np.append(population[population[:,population.shape[1]-1] > 0,:],test_pred, axis=1)
    return prediction
  # train final:
  else:
    print("Training final python scikit-survial GradientBoostingSurvivalAnalysis model on all train data...")
    print("X- %s rows and Y %s length" %(X[trainInds,:].shape[0], y[trainInds].shape[0]))
    start_time = timeit.default_timer()	
    gbmsurv = GradientBoostingSurvivalAnalysis(loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, min_impurity_split=min_impurity_split, min_impurity_decrease=min_impurity_decrease, random_state=seed, max_features=max_features, max_leaf_nodes=max_leaf_nodes, subsample=subsample, dropout_rate=dropout_rate, verbose=verbose)
    gbmsurv = gbmsurv.fit(X[trainInds,:], y[trainInds])
    end_time = timeit.default_timer()
    print("Training final took: %.2f s" %(end_time-start_time))
    # save the model:
    if not os.path.exists(modelOutput):
      os.makedirs(modelOutput)
    print("Model saved to: %s" %(modelOutput)	)
    joblib.dump(gbmsurv, os.path.join(modelOutput,"model.pkl")) 
    pred = gbmsurv.predict(X[trainInds,:].toarray())
    pred = pred.flatten()
    rowCount = np.sum(trainInds)
    pred = pred[0:(rowCount)]
    pred.shape = (population[population[:,population.shape[1]-1] > 0,:].shape[0], 1)
    prediction = np.append(population[population[:,population.shape[1]-1] > 0,:],pred, axis=1)
    return prediction, gbmsurv.feature_importances_
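
`train_gbmsurv` reads `population` positionally: column 0 indexes rows of the (sparse) `plpData` feature matrix, column 1 is the outcome indicator, column 2 the survival time, and the last column a cross-validation fold index, where 0 excludes a row from training. A hedged usage sketch with synthetic inputs; the layout is inferred from the function body, not from a documented API, and it assumes a scikit-survival version whose estimator still accepts `min_impurity_split`:

import numpy as np
from scipy.sparse import csr_matrix

rng = np.random.RandomState(1)
n, k = 500, 3
plpData = csr_matrix(rng.binomial(1, 0.1, size=(n, 20)).astype(float))
population = np.column_stack([
    np.arange(n),                    # column 0: row index into plpData
    rng.binomial(1, 0.3, size=n),    # column 1: outcome (> 0 means event)
    rng.randint(1, 1825, size=n),    # column 2: survival time in days
    rng.randint(1, k + 1, size=n),   # last column: fold index 1..k
])

# Cross-validated out-of-fold predictions, appended as an extra column:
prediction = train_gbmsurv(population=population, plpData=plpData, train=True)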