예제 #1
0
def test_breast_cancer_2(breast_cancer):
    """Compare a randomly split tree on the ``breast_cancer`` fixture to
    hard-coded reference values.

    The tree is built with ``splitter="random"`` and ``random_state=6``;
    node count, split features, node sizes and thresholds (to 5 decimals)
    are then checked.
    """
    data, target = breast_cancer

    params = dict(
        max_features="log2",
        splitter="random",
        max_depth=5,
        min_samples_split=30,
        min_samples_leaf=15,
        random_state=6,
    )
    model = SurvivalTree(**params)
    model.fit(data.values, target)
    fitted = model.tree_

    # Short alias to keep the reference arrays readable.
    undef = TREE_UNDEFINED

    assert fitted.capacity == 11

    expected_feature = numpy.array(
        [55, 14, undef, 60, 23, undef, undef, 31, undef, undef, undef])
    assert_array_equal(fitted.feature, expected_feature)

    expected_n_samples = numpy.array(
        [198, 153, 76, 77, 46, 16, 30, 31, 16, 15, 45])
    assert_array_equal(fitted.n_node_samples, expected_n_samples)

    expected_threshold = numpy.array([
        11.3019, 9.0768, undef, 8.6903, 6.83564, undef,
        undef, 10.66262, undef, undef, undef,
    ])
    assert_array_almost_equal(fitted.threshold, expected_threshold, 5)
예제 #2
0
def test_presort(fake_data, val):
    """Expect ``fit`` to emit a deprecation warning when ``presort`` is set.

    The warning must be one accepted by ``pytest.deprecated_call`` and its
    message must match the expected text.
    """
    features, target = fake_data
    estimator = SurvivalTree(presort=val)
    expected_msg = "The parameter 'presort' is deprecated "

    with pytest.deprecated_call(match=expected_msg):
        estimator.fit(features, target)
예제 #3
0
def test_tree_two_split(veterans):
    """Fit a depth-2 tree on the ``Karnofsky_score`` column and check it.

    Verifies the node count, thresholds and node sizes against reference
    values, compares ``predict`` output for ten hand-picked inputs, and
    checks that predicted cumulative hazard curves never decrease while
    predicted survival curves never increase.
    """
    data, target = veterans
    scores = data.loc[:, "Karnofsky_score"].values[:, numpy.newaxis]

    model = SurvivalTree(max_depth=2, max_features=1)
    model.fit(scores, target)
    fitted = model.tree_

    assert fitted.capacity == 7

    undef = TREE_UNDEFINED
    expected_threshold = numpy.array([45., 25., undef, undef, 87.5, undef, undef])
    assert_array_equal(fitted.threshold, expected_threshold)

    expected_size = numpy.array([scores.shape[0], 38, 8, 30, 99, 91, 8])
    assert_array_equal(fitted.n_node_samples, expected_size)

    raw_inputs = [66.05, 87.91, 45.62, 40.18, 50.65, 71.24, 96.21, 33.33,
                  11.57, 94.28]
    X_pred = numpy.array(raw_inputs).reshape(-1, 1)

    # Only four distinct reference values occur among the ten predictions.
    risk_a = 96.7044629620645
    risk_b = 19.6309523809524
    risk_c = 179.264571990757
    risk_d = 214.027380952381
    expected_risk = numpy.array([
        risk_a, risk_b, risk_a, risk_c, risk_a,
        risk_a, risk_b, risk_c, risk_d, risk_b,
    ])
    assert_array_almost_equal(model.predict(X_pred), expected_risk)

    chf_pred = model.predict_cumulative_hazard_function(
        X_pred, return_array=True)
    chf_steps = numpy.diff(chf_pred)
    assert numpy.all(chf_steps >= 0)

    surv_pred = model.predict_survival_function(X_pred, return_array=True)
    surv_steps = numpy.diff(surv_pred)
    assert numpy.all(surv_steps <= 0)
예제 #4
0
def test_max_depth(fake_data, val):
    """Expect ``fit`` to raise ``ValueError`` for the given ``max_depth``."""
    features, target = fake_data
    estimator = SurvivalTree(max_depth=val)
    expected_msg = "max_depth must be greater than zero."

    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(features, target)
예제 #5
0
def test_min_weight_fraction_leaf(fake_data, val):
    """Expect ``fit`` to raise ``ValueError`` for the given
    ``min_weight_fraction_leaf`` value.
    """
    features, target = fake_data
    estimator = SurvivalTree(min_weight_fraction_leaf=val)
    expected_msg = r"min_weight_fraction_leaf must in \[0, 0\.5\]"

    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(features, target)
예제 #6
0
def test_presort(fake_data, val):
    """Expect ``fit`` to emit a ``DeprecationWarning`` when ``presort`` is set."""
    features, target = fake_data
    estimator = SurvivalTree(presort=val)
    expected_msg = "The parameter 'presort' is deprecated "

    with pytest.warns(DeprecationWarning, match=expected_msg):
        estimator.fit(features, target)
예제 #7
0
def test_max_features_too_large(fake_data, val):
    """Expect ``fit`` to raise ``ValueError`` for the given ``max_features``."""
    features, target = fake_data
    estimator = SurvivalTree(max_features=val)
    expected_msg = r"max_features must be in \(0, n_features\]"

    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(features, target)
예제 #8
0
def test_breast_cancer_1(breast_cancer):
    """Compare a tree limited to 10 leaf nodes on the ``breast_cancer``
    fixture to hard-coded reference values.

    Node count, split features, node sizes and thresholds (to 5 decimals)
    are checked for a tree fitted with ``random_state=6``.
    """
    data, target = breast_cancer

    params = dict(
        max_features="auto",
        max_depth=5,
        max_leaf_nodes=10,
        min_samples_split=0.06,
        min_samples_leaf=0.03,
        random_state=6,
    )
    model = SurvivalTree(**params)
    model.fit(data.values, target)
    fitted = model.tree_

    # Short alias to keep the reference arrays readable.
    undef = TREE_UNDEFINED

    assert fitted.capacity == 19

    expected_feature = numpy.array([
        61, 29, 5, undef, 40, 65, undef, 10, 12, 4,
        undef, undef, undef, undef, undef, undef, 10, undef, undef,
    ])
    assert_array_equal(fitted.feature, expected_feature)

    expected_n_samples = numpy.array([
        198, 170, 28, 8, 20, 164, 6, 59, 105, 74,
        31, 9, 65, 13, 7, 39, 20, 7, 13,
    ])
    assert_array_equal(fitted.n_node_samples, expected_n_samples)

    expected_threshold = numpy.array([
        10.97448, 11.10251, 11.34859, undef, 10.53533, 8.08848,
        undef, 10.86403, 10.14138, 11.49171, undef,
        undef, undef, undef, undef,
        undef, 11.01874, undef, undef,
    ])
    assert_array_almost_equal(fitted.threshold, expected_threshold, 5)
예제 #9
0
def test_breast_cancer_2():
    """Load the breast cancer data, encode two string columns as integers,
    and compare a randomly split tree to hard-coded reference values.
    """
    X, y = load_breast_cancer()

    # Column name -> replacement mapping, applied in insertion order.
    # The keys are matched against the data and must stay exactly as written.
    encodings = {
        "er": {"negative": 0, "positive": 1},
        "grade": {
            "poorly differentiated": 0,
            "intermediate": 1,
            "well differentiated": 2,
            "unkown": 3
        },
    }
    for column, mapping in encodings.items():
        X.loc[:, column] = X.loc[:, column].replace(mapping)

    params = dict(
        max_features="log2",
        splitter="random",
        max_depth=5,
        min_samples_split=30,
        min_samples_leaf=15,
        random_state=6,
    )
    model = SurvivalTree(**params)
    model.fit(X.values, y)
    fitted = model.tree_

    undef = TREE_UNDEFINED

    assert fitted.capacity == 11

    expected_feature = numpy.array(
        [55, 14, undef, 60, 23, undef, undef, 31, undef, undef, undef])
    assert_array_equal(fitted.feature, expected_feature)

    expected_n_samples = numpy.array(
        [198, 153, 76, 77, 46, 16, 30, 31, 16, 15, 45])
    assert_array_equal(fitted.n_node_samples, expected_n_samples)

    expected_threshold = numpy.array([
        11.3019, 9.0768, undef, 8.6903, 6.83564, undef,
        undef, 10.66262, undef, undef, undef,
    ])
    assert_array_almost_equal(fitted.threshold, expected_threshold, 5)
예제 #10
0
def test_max_leaf_nodes_no_int(fake_data, val):
    """Expect ``fit`` to raise ``ValueError`` for the given ``max_leaf_nodes``."""
    features, target = fake_data
    estimator = SurvivalTree(max_leaf_nodes=val)
    expected_msg = "max_leaf_nodes must be integral number but was "

    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(features, target)
예제 #11
0
def test_presort(fake_data, val):
    """Expect ``fit`` to raise ``ValueError`` for the given ``presort`` value."""
    features, target = fake_data
    estimator = SurvivalTree(presort=val)
    expected_msg = r"'presort' should be in \('auto', True, False\)"

    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(features, target)
예제 #12
0
def test_max_features_invalid(fake_data, val):
    """Expect ``fit`` to raise ``ValueError`` for the given ``max_features``."""
    features, target = fake_data
    estimator = SurvivalTree(max_features=val)
    expected_msg = ('Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')

    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(features, target)
예제 #13
0
def test_max_leaf_nodes_too_small(fake_data, val):
    """Expect ``fit`` to raise ``ValueError`` for the given ``max_leaf_nodes``.

    The expected message embeds the offending value.
    """
    features, target = fake_data
    estimator = SurvivalTree(max_leaf_nodes=val)
    template = ("max_leaf_nodes {} must be either None "
                "or larger than 1")
    expected_msg = template.format(val)

    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(features, target)
예제 #14
0
def test_min_samples_leaf(fake_data, val):
    """Expect ``fit`` to raise ``ValueError`` for the given
    ``min_samples_leaf`` value.
    """
    features, target = fake_data
    estimator = SurvivalTree(min_samples_leaf=val)
    expected_msg = (r"min_samples_leaf must be at least 1 "
                    r"or in \(0, 0\.5\], got")

    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(features, target)
예제 #15
0
def test_tree_one_split(veterans):
    """Fit a depth-1 tree on ``Karnofsky_score`` and validate it end to end.

    The tree structure is compared with the output of
    ``LogrankTreeBuilder``, the stored event times with a reference list,
    and the predictions on either side of the split with Nelson-Aalen and
    Kaplan-Meier estimates computed from the matching subsets of ``y``.
    """
    data, target = veterans
    scores = data.loc[:, "Karnofsky_score"].values[:, numpy.newaxis]

    model = SurvivalTree(max_depth=1)
    model.fit(scores, target)

    reference = LogrankTreeBuilder(max_depth=1).build(scores, target)

    fitted = model.tree_
    assert fitted.capacity == reference.shape[0]
    assert_array_equal(fitted.feature, reference.loc[:, "feature"].values)
    assert_array_equal(fitted.n_node_samples,
                       reference.loc[:, "n_node_samples"].values)
    assert_array_almost_equal(fitted.threshold,
                              reference.loc[:, "threshold"].values)

    expected_time = numpy.array(
        [
            1, 2, 3, 4, 7, 8, 10, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22, 24,
            25, 27, 29, 30, 31, 33, 35, 36, 42, 43, 44, 45, 48, 49, 51, 52,
            53, 54, 56, 59, 61, 63, 72, 73, 80, 82, 84, 87, 90, 92, 95, 99,
            100, 103, 105, 110, 111, 112, 117, 118, 122, 126, 132, 133, 139,
            140, 143, 144, 151, 153, 156, 162, 164, 177, 186, 200, 201, 216,
            228, 231, 242, 250, 260, 278, 283, 287, 314, 340, 357, 378, 384,
            389, 392, 411, 467, 553, 587, 991, 999,
        ],
        dtype=float,
    )
    assert_array_equal(model.event_times_, expected_time)

    def status_and_time(subset):
        # Positional arguments passed to both estimators below.
        return subset["Status"], subset["Survival_in_days"]

    # Split the samples by the root threshold of the reference tree.
    cutoff = reference.loc[0, "threshold"]
    goes_left = scores[:, 0] <= cutoff
    y_left = target[goes_left]
    y_right = target[~goes_left]

    _, chf_left = nelson_aalen_estimator(*status_and_time(y_left))
    _, chf_right = nelson_aalen_estimator(*status_and_time(y_right))

    # One query point on each side of the split.
    X_pred = numpy.array([[cutoff - 10], [cutoff + 10]])
    chf_pred = model.predict_cumulative_hazard_function(
        X_pred, return_array=True)

    assert_curve_almost_equal(chf_pred[0], chf_left)
    assert_curve_almost_equal(chf_pred[1], chf_right)

    expected_mrt = numpy.array([196.55878, 86.14939])
    assert_array_almost_equal(model.predict(X_pred), expected_mrt)

    _, surv_left = kaplan_meier_estimator(*status_and_time(y_left))
    _, surv_right = kaplan_meier_estimator(*status_and_time(y_right))

    surv_pred = model.predict_survival_function(X_pred, return_array=True)

    assert_curve_almost_equal(surv_pred[0], surv_left)
    assert_curve_almost_equal(surv_pred[1], surv_right)
예제 #16
0
def test_min_samples_split(fake_data, val):
    """Expect ``fit`` to raise ``ValueError`` for the given
    ``min_samples_split`` value.
    """
    features, target = fake_data
    estimator = SurvivalTree(min_samples_split=val)
    expected_msg = ("min_samples_split must be an integer "
                    r"greater than 1 or a float in \(0\.0, 1\.0\]; "
                    "got ")

    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(features, target)
예제 #17
0
def test_predict_wrong_features(toy_data, n_features):
    """Expect ``predict`` to raise ``ValueError`` when the input has
    ``n_features`` columns.

    The expected message hard-codes 4 as the number of features the fitted
    model expects (presumably the width of ``toy_data`` — confirm against
    the fixture).
    """
    X, y = toy_data
    tree = SurvivalTree(max_depth=1)
    tree.fit(X, y)

    # Build the input before entering ``pytest.raises`` so that only the
    # ``predict`` call itself can satisfy the expected exception.
    X_new = numpy.random.randn(12, n_features)

    with pytest.raises(ValueError,
                       match="X has {} features, but SurvivalTree is "
                       "expecting 4 features as input.".format(n_features)):
        tree.predict(X_new)
예제 #18
0
def test_predict_wrong_features(toy_data, n_features):
    """Expect ``predict`` to raise ``ValueError`` when the input has
    ``n_features`` columns.

    The expected message hard-codes 4 as the model's number of features
    (presumably the width of ``toy_data`` — confirm against the fixture).
    """
    X, y = toy_data
    tree = SurvivalTree(max_depth=1)
    tree.fit(X, y)

    # Build the input before entering ``pytest.raises`` so that only the
    # ``predict`` call itself can satisfy the expected exception.
    X_new = numpy.random.randn(12, n_features)

    with pytest.raises(ValueError,
                       match="Number of features of the model must "
                       "match the input. Model n_features is 4 and "
                       "input n_features is {}.".format(n_features)):
        tree.predict(X_new)
예제 #19
0
def test_predict_step_function_warning(toy_data, func):
    """Expect the prediction method named ``func`` to emit a
    ``FutureWarning`` when called with its default arguments.
    """
    features, target = toy_data
    model = SurvivalTree(max_depth=1)
    model.fit(features, target)

    predictor = getattr(model, func)
    expected_msg = (
        "{} will return an array of StepFunction instances in 0.14".format(
            func))

    with pytest.warns(FutureWarning, match=expected_msg):
        predictor(features)
예제 #20
0
def test_toy_data(toy_data):
    """Compare a depth-4 tree fitted on ``toy_data`` with the structure
    produced by ``LogrankTreeBuilder`` for the same depth and leaf size.
    """
    features, target = toy_data

    model = SurvivalTree(max_depth=4, max_features=1.0, min_samples_leaf=20)
    model.fit(features, target)

    builder = LogrankTreeBuilder(max_depth=4, min_leaf=20)
    reference = builder.build(features, target)

    fitted = model.tree_
    assert fitted.capacity == reference.shape[0]

    ref_feature = reference.loc[:, "feature"].values
    assert_array_equal(fitted.feature, ref_feature)

    ref_n_samples = reference.loc[:, "n_node_samples"].values
    assert_array_equal(fitted.n_node_samples, ref_n_samples)

    ref_threshold = reference.loc[:, "threshold"].values
    assert_array_almost_equal(fitted.threshold, ref_threshold, 5)
예제 #21
0
def test_X_idx_sorted(fake_data, val):
    """Expect ``fit`` to emit a ``FutureWarning`` when ``X_idx_sorted`` is
    passed.

    When ``val`` is the string ``"sort"`` the column-wise argsort of ``X``
    is passed; any other ``val`` is passed through unchanged.
    """
    features, target = fake_data
    estimator = SurvivalTree()

    sorted_idx = numpy.argsort(features, axis=0) if val == "sort" else val
    expected_msg = (
        "The parameter 'X_idx_sorted' is deprecated and has no effect.")

    with pytest.warns(FutureWarning, match=expected_msg):
        estimator.fit(features, target, X_idx_sorted=sorted_idx)
예제 #22
0
def test_tree_split_all_censored(veterans):
    """Check the tree shape when ``Status`` is ``False`` for every sample
    with a Karnofsky score above 45.

    With ``max_depth=2`` the fitted tree is expected to have 5 nodes, with
    the fifth node (99 samples) left unsplit.
    """
    X, y = veterans
    X = X.loc[:, "Karnofsky_score"].values[:, numpy.newaxis]

    # Work on a copy so the in-place edit below cannot leak into the object
    # handed out by the ``veterans`` fixture (relevant if the fixture is
    # ever shared across tests — its scope is not visible here).
    y = y.copy()
    # Per the test name, this marks these samples as censored.
    y["Status"][X[:, 0] > 45.] = False

    tree = SurvivalTree(max_depth=2, max_features=1)
    tree.fit(X, y)

    assert tree.tree_.capacity == 5
    assert_array_equal(
        tree.tree_.threshold,
        numpy.array([45., 25., TREE_UNDEFINED, TREE_UNDEFINED,
                     TREE_UNDEFINED]))
    expected_size = numpy.array([X.shape[0], 38, 8, 30, 99])
    assert_array_equal(tree.tree_.n_node_samples, expected_size)
예제 #23
0
def test_predict_step_function(breast_cancer, func):
    """Check that the array output and the step-function output of the
    prediction method named ``func`` agree.

    The tree is fitted on all rows from index 10 onwards and queried on the
    first 10 rows; each returned function must have ``x`` equal to
    ``event_times_`` and ``y`` equal to the matching array row.
    """
    data, target = breast_cancer
    train_X, train_y = data.iloc[10:], target[10:]
    query_X = data.iloc[:10]

    params = dict(
        max_features="log2",
        splitter="random",
        max_depth=5,
        min_samples_split=30,
        min_samples_leaf=15,
        random_state=6,
    )
    model = SurvivalTree(**params)
    model.fit(train_X, train_y)

    predictor = getattr(model, func)

    as_array = predictor(query_X, return_array=True)
    as_functions = predictor(query_X, return_array=False)

    assert as_array.shape[0] == as_functions.shape[0]

    for step_fn, row in zip(as_functions, as_array):
        assert_array_almost_equal(step_fn.x, model.event_times_)
        assert_array_almost_equal(step_fn.y, row)
예제 #24
0
def test_breast_cancer_1():
    """Load the breast cancer data, encode two string columns as integers,
    and compare a tree limited to 10 leaf nodes to hard-coded reference
    values.
    """
    X, y = load_breast_cancer()

    # Column name -> replacement mapping, applied in insertion order.
    # The keys are matched against the data and must stay exactly as written.
    encodings = {
        "er": {"negative": 0, "positive": 1},
        "grade": {
            "poorly differentiated": 0,
            "intermediate": 1,
            "well differentiated": 2,
            "unkown": 3
        },
    }
    for column, mapping in encodings.items():
        X.loc[:, column] = X.loc[:, column].replace(mapping)

    params = dict(
        max_features="auto",
        max_depth=5,
        max_leaf_nodes=10,
        min_samples_split=0.06,
        min_samples_leaf=0.03,
        random_state=6,
    )
    model = SurvivalTree(**params)
    model.fit(X.values, y)
    fitted = model.tree_

    undef = TREE_UNDEFINED

    assert fitted.capacity == 19

    expected_feature = numpy.array([
        61, 29, 5, undef, 40, 65, undef, 10, 12, 4,
        undef, undef, undef, undef, undef, undef, 10, undef, undef,
    ])
    assert_array_equal(fitted.feature, expected_feature)

    expected_n_samples = numpy.array([
        198, 170, 28, 8, 20, 164, 6, 59, 105, 74,
        31, 9, 65, 13, 7, 39, 20, 7, 13,
    ])
    assert_array_equal(fitted.n_node_samples, expected_n_samples)

    expected_threshold = numpy.array([
        10.97448, 11.10251, 11.34859, undef, 10.53533, 8.08848,
        undef, 10.86403, 10.14138, 11.49171, undef,
        undef, undef, undef, undef,
        undef, 11.01874, undef, undef,
    ])
    assert_array_almost_equal(fitted.threshold, expected_threshold, 5)