# Assumed imports for the test examples below (the listing omits them); the exact module
# paths are best guesses for the scikit-survival test suite, and fixtures such as
# `make_whas500` and `breast_cancer` come from the suite's conftest.
import numpy
import pytest
from numpy.testing import assert_array_almost_equal
from sklearn.pipeline import make_pipeline

from sksurv.datasets import load_breast_cancer
from sksurv.ensemble import RandomSurvivalForest
from sksurv.preprocessing import OneHotEncoder
from sksurv.testing import assert_cindex_almost_equal


def test_fit_with_small_max_samples(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    # First fit with no restriction on max samples
    est1 = RandomSurvivalForest(
        n_estimators=1,
        random_state=1,
        max_samples=None,
    )

    # Second fit with max samples restricted to just 2
    est2 = RandomSurvivalForest(
        n_estimators=1,
        random_state=1,
        max_samples=2,
    )

    est1.fit(whas500.x, whas500.y)
    est2.fit(whas500.x, whas500.y)

    tree1 = est1.estimators_[0].tree_
    tree2 = est2.estimators_[0].tree_

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert tree1.node_count > tree2.node_count, msg
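
# The `func` argument of the next test (and of the step-function tests further down) is
# supplied by pytest parametrization that this listing omits. A plausible reconstruction,
# assumed rather than taken from the original file, over the two prediction methods that
# accept `return_array`:
@pytest.mark.parametrize("func", ["predict_survival_function",
                                  "predict_cumulative_hazard_function"])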
def test_pipeline_predict(breast_cancer, func):
    X_str, _ = load_breast_cancer()
    X_num, y = breast_cancer

    est = RandomSurvivalForest(n_estimators=10, random_state=1)
    est.fit(X_num[10:], y[10:])

    pipe = make_pipeline(OneHotEncoder(),
                         RandomSurvivalForest(n_estimators=10, random_state=1))
    pipe.fit(X_str[10:], y[10:])

    tree_pred = getattr(est, func)(X_num[:10], return_array=True)
    pipe_pred = getattr(pipe, func)(X_str[:10], return_array=True)

    assert_array_almost_equal(tree_pred, pipe_pred)
Example #3
def test_fit_warm_start(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=11, max_depth=2, random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 11
    assert all(e.max_depth == 2 for e in forest.estimators_)

    forest.set_params(warm_start=True)
    with pytest.warns(UserWarning,
                      match="Warm-start fitting without increasing "
                      "n_estimators does not fit new trees."):
        forest.fit(whas500.x, whas500.y)

    forest.set_params(n_estimators=3)
    with pytest.raises(ValueError,
                       match="n_estimators=3 must be larger or equal to "
                       r"len\(estimators_\)=11 when warm_start==True"):
        forest.fit(whas500.x, whas500.y)

    forest.set_params(n_estimators=23)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 23
    assert all(e.max_depth == 2 for e in forest.estimators_)
def test_fit_int_time(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    y = whas500.y
    y_int = numpy.empty(y.shape[0],
                        dtype=[(y.dtype.names[0], bool),
                               (y.dtype.names[1], int)])
    y_int[:] = y

    forest_f = RandomSurvivalForest(oob_score=True, random_state=2).fit(
        whas500.x[50:], y[50:])
    forest_i = RandomSurvivalForest(oob_score=True, random_state=2).fit(
        whas500.x[50:], y_int[50:])

    assert len(forest_f.estimators_) == len(forest_i.estimators_)
    assert forest_f.n_features_ == forest_i.n_features_
    assert forest_f.oob_score_ == forest_i.oob_score_
    assert_array_almost_equal(forest_f.event_times_, forest_i.event_times_)

    pred_f = forest_f.predict(whas500.x[:50])
    pred_i = forest_i.predict(whas500.x[:50])

    assert_array_almost_equal(pred_f, pred_i)
Example #5
def test_fit_predict(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 100

    pred = forest.predict(whas500.x)
    assert numpy.isfinite(pred).all()
    assert numpy.all(pred >= 0)

    # expected values correspond to the tuple returned by concordance_index_censored:
    # (c-index, concordant pairs, discordant pairs, tied risk, tied time)
    expected_c = (0.9026201280123488, 67831, 7318, 0, 14)
    assert_cindex_almost_equal(whas500.y["fstat"], whas500.y["lenfol"], pred,
                               expected_c)
Example #6
def test_predict_step_function_warning(make_whas500, func):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=3,
                                  oob_score=True,
                                  random_state=2)
    forest.fit(whas500.x, whas500.y)

    pred_fn = getattr(forest, func)

    with pytest.warns(
            FutureWarning,
            match="{} will return an array of StepFunction instances in 0.14".
            format(func)):
        pred_fn(whas500.x)
Example #7
def test_oob_score(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(oob_score=True,
                                  bootstrap=False,
                                  random_state=2)
    with pytest.raises(ValueError,
                       match="Out of bag estimation only available "
                       "if bootstrap=True"):
        forest.fit(whas500.x, whas500.y)

    forest.set_params(bootstrap=True)
    forest.fit(whas500.x, whas500.y)

    assert forest.oob_prediction_.shape == (whas500.x.shape[0], )
    assert round(abs(forest.oob_score_ - 0.753010685), 6) == 0.0
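
# The next test takes the same parametrized `func` argument (see the assumed decorator
# sketched above before test_pipeline_predict); the listing again omits the decorator.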
def test_predict_step_function(make_whas500, func):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=10, random_state=2)
    forest.fit(whas500.x[10:], whas500.y[10:])

    pred_fn = getattr(forest, func)

    ret_array = pred_fn(whas500.x[:10], return_array=True)
    fn_array = pred_fn(whas500.x[:10], return_array=False)

    assert ret_array.shape[0] == fn_array.shape[0]

    for fn, arr in zip(fn_array, ret_array):
        assert_array_almost_equal(fn.x, forest.event_times_)
        assert_array_almost_equal(fn.y, arr)
Example #9
def test_fit_predict_chf(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=10, random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 10

    chf = forest.predict_cumulative_hazard_function(whas500.x)
    assert chf.shape == (500, forest.event_times_.shape[0])

    assert numpy.isfinite(chf).all()
    assert numpy.all(chf >= 0.0)

    vals, counts = numpy.unique(chf[:, 0], return_counts=True)
    assert vals[0] == 0.0
    assert numpy.max(counts) == counts[0]

    d = numpy.apply_along_axis(numpy.diff, 1, chf)
    assert (d >= 0).all()
Example #10
def test_fit_predict_surv(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=10, random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 10

    surv = forest.predict_survival_function(whas500.x)
    assert surv.shape == (500, forest.event_times_.shape[0])

    assert numpy.isfinite(surv).all()
    assert numpy.all(surv >= 0.0)
    assert numpy.all(surv <= 1.0)

    vals, counts = numpy.unique(surv[:, 0], return_counts=True)
    assert vals[-1] == 1.0
    assert numpy.max(counts) == counts[-1]

    d = numpy.apply_along_axis(numpy.diff, 1, surv)
    assert (d <= 0).all()
Example #11
def test_fit_max_samples(make_whas500, max_samples, exc_type, exc_msg):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(max_samples=max_samples)
    with pytest.raises(exc_type, match=exc_msg):
        forest.fit(whas500.x, whas500.y)
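
# `max_samples`, `exc_type` and `exc_msg` in the test above are likewise filled in by pytest
# parametrization that the listing omits. A purely illustrative sketch of such cases (the
# values and error messages here are assumptions, not the original test's data):
#
# @pytest.mark.parametrize("max_samples,exc_type,exc_msg", [
#     (0, ValueError, "max_samples must be in range"),
#     ("text", TypeError, "max_samples should be int or float"),
# ])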

# Assumed imports for this standalone example (omitted from the listing):
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sksurv.ensemble import RandomSurvivalForest


def RSF_bootstrap(fp, num=False):
    df = pd.read_csv(fp, index_col=0)

    # configure bootstrap (sampling 50% of data)
    n_iterations = 100
    n_size = int(len(df) * 0.50)

    # parameters
    NUMESTIMATORS = 100
    TESTSIZE = 0.20
    random_state = 20

    # calculate population of statistics
    metrics = []
    for i in range(n_iterations):
        # prepare sample

        # if indicated, include number of mets (col 42)
        if num:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41, 42]],
                              n_samples=n_size)
            # the NumMets column (position 42 of df) sits at positional index 22 of `sample`
            # after the subset above, so select it there rather than at the out-of-bounds 42
            X = sample.iloc[:, np.r_[:20, 22]].copy()

        else:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41]], n_samples=n_size)
            X = sample.iloc[:, :20].copy()

        X = X.to_numpy().astype('float64')
        y = sample[['Event', 'Time']].copy()
        y['Event'] = y['Event'].astype('bool')
        y['Time'] = y['Time'].astype('float64')
        y = y.to_records(index=False)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TESTSIZE, random_state=random_state)
        rsf = RandomSurvivalForest(n_estimators=NUMESTIMATORS,
                                   min_samples_split=15,
                                   min_samples_leaf=8,
                                   max_features="sqrt",
                                   n_jobs=-1,
                                   random_state=random_state)
        rsf.fit(X_train, y_train)

        score = rsf.score(X_test, y_test)
        metrics.append(score)

    # calculate confidence interval
    alpha = 0.95
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(metrics, p))
    p = (alpha + ((1.0 - alpha) / 2.0)) * 100
    upper = min(1.0, np.percentile(metrics, p))
    med = np.percentile(metrics, 50)

    # identify aggregation method name
    if num:
        name = fp.split('/')[-1].split('_')[0] + ' + NumMets'
    else:
        name = fp.split('/')[-1].split('_')[0]

    print(name, 'RSF', '%.3f (%.3f-%.3f)' % (med, lower, upper))
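
# Hypothetical invocation; the CSV path and its column layout (20 feature columns,
# 'Event'/'Time' labels, and a NumMets column at positional index 42) are assumptions
# about the expected input file, not something the listing specifies.
if __name__ == "__main__":
    RSF_bootstrap("data/cohort_features.csv", num=True)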