Example #1
0
def test_OutSamplerTransformer_classifier_fit_transform(multi_output):
    """fit().transform() and fit_transform() must differ when a cv is used.

    With cross-validation, ``fit_transform`` returns out-of-sample
    predictions, while ``transform`` after ``fit`` predicts with the final
    model refitted on all data — so the two outputs should not match.
    """
    # Seed for reproducibility, consistent with the sibling tests in this file
    # (without it the generated data — and the >= 0.01 assertion — is flaky).
    np.random.seed(123)

    X = np.random.randn(100, 10)
    if multi_output:
        y = 1 * (np.random.randn(100, 2) > 0)
    else:
        y = 1 * (np.random.randn(100) > 0)

    cv = KFold(n_splits=10, shuffle=True, random_state=123)

    model = OutSamplerTransformer(RandomForestClassifier(n_estimators=10,
                                                         random_state=123),
                                  cv=cv)

    # in-sample predictions: fit on everything, then transform
    model.fit(X, y)
    y1 = model.transform(X)

    model = OutSamplerTransformer(RandomForestClassifier(n_estimators=10,
                                                         random_state=123),
                                  cv=cv)
    # out-of-sample predictions via the cross-validated fit_transform
    y2 = model.fit_transform(X, y)

    assert np.abs(y1 -
                  y2).flatten().max() >= 0.01  # vector should be different
Example #2
0
def test_OutSamplerTransformer_regressor():
    """A wrapped regressor: transform() returns the model's predictions as a column."""
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)

    transformer = OutSamplerTransformer(
        RandomForestRegressor(n_estimators=10, random_state=123), cv=10)
    transformer.fit(X, y)

    direct_predictions = transformer.model.predict(X)
    transformed = transformer.transform(X)

    # the wrapper is a transformer: neither a classifier nor a regressor
    assert not is_classifier(transformer)
    assert not is_regressor(transformer)

    # a single output column holding exactly the underlying predictions
    assert transformed.shape == (100, 1)
    assert np.abs(direct_predictions - transformed[:, 0]).max() <= 10 ** (-10)

    assert transformer.get_feature_names() == ["RandomForestRegressor__target"]
Example #3
0
def test_OutSamplerTransformer_classifier(multi_output):
    """A wrapped classifier: transform() exposes predicted probabilities."""
    np.random.seed(123)
    X = np.random.randn(100, 10)
    if multi_output:
        y = 1 * (np.random.randn(100, 2) > 0)
    else:
        y = 1 * (np.random.randn(100) > 0)

    model = OutSamplerTransformer(
        RandomForestClassifier(n_estimators=10, random_state=123))
    model.fit(X, y)

    proba = model.model.predict_proba(X)
    transformed = model.transform(X)

    # the wrapper itself is neither classifier nor regressor
    assert not is_classifier(model)
    assert not is_regressor(model)

    if multi_output:
        # one proba matrix per output; transform keeps only P(class == 1)
        for d in range(y.shape[1]):
            assert np.abs(proba[d][:, 1] - transformed[:, d]).max() <= 10 ** (-10)
    else:
        assert np.abs(proba[:, 1] - transformed[:, 0]).max() <= 10 ** (-10)
    assert transformed.shape == (100, 1 + 1 * multi_output)

    if multi_output:
        expected_names = [
            "output%d__RandomForestClassifier__1" % d
            for d in range(y.shape[1])
        ]
    else:
        expected_names = ["RandomForestClassifier__1"]
    assert model.get_feature_names() == expected_names

    ### Test with strings
    if multi_output:
        y = np.array(["a", "b", "c"])[np.random.randint(0, 3, 200).reshape(
            (100, 2))]
    else:
        y = np.array(["a", "b", "c"])[np.random.randint(0, 3, 100)]

    model = OutSamplerTransformer(
        RandomForestClassifier(n_estimators=10, random_state=123))
    model.fit(X, y)

    proba = model.model.predict_proba(X)
    transformed = model.transform(X)

    if multi_output:
        # multiclass + multi-output: a list of proba matrices, concatenated
        assert isinstance(proba, list)
        assert len(proba) == y.shape[1]
        assert transformed.shape == (100, 6)

        assert np.abs(proba[0] - transformed[:, 0:3]).max() <= 10 ** (-10)
        assert np.abs(proba[1] - transformed[:, 3:]).max() <= 10 ** (-10)
    else:
        # multiclass single output: one probability column per class
        assert proba.shape == (100, 3)
        assert transformed.shape == (100, 3)

        assert np.abs(proba - transformed).max() <= 10 ** (-10)
        assert model.get_feature_names() == [
            "RandomForestClassifier__a",
            "RandomForestClassifier__b",
            "RandomForestClassifier__c",
        ]
Example #4
0
def test_approx_cross_validation_OutSamplerTransformer_classifier(
        multi_output):
    """approx_cross_validation on a wrapped classifier returns out-of-fold
    probabilities, leaves the model unfitted, and matches both fit_transform
    (with the same cv) and a hand-rolled per-fold computation."""

    np.random.seed(123)
    X = np.random.randn(100, 10)
    if multi_output:
        y = 1 * (np.random.randn(100, 2) > 0)
    else:
        y = 1 * (np.random.randn(100) > 0)

    model = OutSamplerTransformer(RandomForestClassifier(n_estimators=10,
                                                         random_state=123),
                                  cv=10)

    cv_res, yhat = model.approx_cross_validation(X,
                                                 y,
                                                 cv=10,
                                                 method="transform",
                                                 no_scoring=True)

    # no_scoring=True -> no cv result table, only the stacked predictions
    assert cv_res is None
    assert yhat.ndim == 2
    # one probability column per output (P(class == 1) for each output)
    assert yhat.shape == (y.shape[0], 1 + 1 * multi_output)

    # approx_cross_validation must NOT leave a fitted model behind
    with pytest.raises(NotFittedError):
        model.transform(X)

    with pytest.raises(NotFittedError):
        model.model.predict(X)

    # With a deterministic KFold, fit_transform and approx_cross_validation
    # should produce the same out-of-fold predictions.
    cv = KFold(n_splits=10, shuffle=True, random_state=123)
    model = OutSamplerTransformer(RandomForestClassifier(n_estimators=10,
                                                         random_state=123),
                                  cv=cv)
    yhat1 = model.fit_transform(X, y)

    model = OutSamplerTransformer(RandomForestClassifier(n_estimators=10,
                                                         random_state=123),
                                  cv=cv)
    cv_res, yhat2 = model.approx_cross_validation(X,
                                                  y,
                                                  cv=cv,
                                                  method="transform",
                                                  no_scoring=True,
                                                  return_predict=True)

    # Approx cross val and fit transform should return the same thing here
    assert np.abs((yhat1 - yhat2).flatten()).max() <= 10**(-5)

    # Rebuild the out-of-fold probabilities by hand, fold by fold, and
    # check they agree with both code paths above.
    yhat3 = np.zeros((y.shape[0], 1 + 1 * multi_output), dtype=yhat2.dtype)

    for train, test in cv.split(X, y):
        model = RandomForestClassifier(n_estimators=10, random_state=123)
        model.fit(X[train, :], y[train])

        if multi_output:
            # predict_proba returns a list of (n, 2) arrays, one per output
            proba = model.predict_proba(X[test, :])
            yhat3[test, 0] = proba[0][:, 1]
            yhat3[test, 1] = proba[1][:, 1]
        else:
            yhat3[test, 0] = model.predict_proba(X[test, :])[:, 1]

    assert np.abs((yhat1 - yhat3).flatten()).max() <= 10**(-5)
    assert np.abs((yhat1 - yhat2).flatten()).max() <= 10**(-5)
Example #5
0
def test_approx_cross_validation_OutSamplerTransformer_regressor(multi_output):
    """approx_cross_validation on a wrapped regressor returns out-of-fold
    predictions, leaves the model unfitted, and matches both fit_transform
    and a hand-rolled per-fold computation."""
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100, 2) if multi_output else np.random.randn(100)

    model = OutSamplerTransformer(RandomForestRegressor(n_estimators=10,
                                                        random_state=123),
                                  cv=10)

    cv_res, yhat = model.approx_cross_validation(
        X, y, cv=10, method="transform", no_scoring=True)

    # no scoring requested -> no cv result, only the stacked predictions
    assert cv_res is None
    assert yhat.ndim == 2
    expected_shape = y.shape if multi_output else (y.shape[0], 1)
    assert yhat.shape == expected_shape

    # approx_cross_validation must not leave a fitted model behind
    with pytest.raises(NotFittedError):
        model.transform(X)

    cv = KFold(n_splits=10, shuffle=True, random_state=123)

    model = OutSamplerTransformer(DummyRegressor(), cv=cv)
    yhat1 = model.fit_transform(X, y)

    cv_res, yhat2 = model.approx_cross_validation(
        X, y, cv=cv, method="transform", no_scoring=True, return_predict=True)
    # Approx cross val and fit transform should return the same thing here
    assert np.abs((yhat1 - yhat2).flatten()).max() <= 10**(-5)

    # rebuild the out-of-fold predictions fold by fold and compare
    yhat3 = np.zeros(y.shape if multi_output else (y.shape[0], 1))

    for train_idx, test_idx in cv.split(X, y):
        fold_model = DummyRegressor()
        fold_model.fit(X[train_idx, :], y[train_idx])

        if multi_output:
            yhat3[test_idx, :] = fold_model.predict(X[test_idx, :])
        else:
            yhat3[test_idx, 0] = fold_model.predict(X[test_idx, :])

    assert np.abs((yhat1 - yhat3).flatten()).max() <= 10**(-5)
    assert np.abs((yhat1 - yhat2).flatten()).max() <= 10**(-5)
Example #6
0
def test_OutSamplerTransformer_classifier():
    """Single-output classifier: transform() exposes probabilities with names."""
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = 1 * (np.random.randn(100) > 0)

    wrapper = OutSamplerTransformer(
        RandomForestClassifier(n_estimators=10, random_state=123))
    wrapper.fit(X, y)

    proba = wrapper.model.predict_proba(X)
    out = wrapper.transform(X)

    # the wrapper is a transformer, neither classifier nor regressor
    assert not is_classifier(wrapper)
    assert not is_regressor(wrapper)

    # binary target: a single column holding P(class == 1)
    assert out.shape == (100, 1)
    assert np.abs(proba[:, 1] - out[:, 0]).max() <= 10 ** (-10)

    assert wrapper.get_feature_names() == ["RandomForestClassifier__1"]

    # string labels: one probability column per class, named after it
    y = np.array(["a", "b", "c"])[np.random.randint(0, 3, 100)]

    wrapper = OutSamplerTransformer(
        RandomForestClassifier(n_estimators=10, random_state=123))
    wrapper.fit(X, y)

    proba = wrapper.model.predict_proba(X)
    out = wrapper.transform(X)

    assert proba.shape == (100, 3)
    assert out.shape == (100, 3)

    assert np.abs(proba - out).max() <= 10 ** (-10)

    assert wrapper.get_feature_names() == [
        "RandomForestClassifier__a",
        "RandomForestClassifier__b",
        "RandomForestClassifier__c",
    ]