示例#1
0
def test_pipeline_sub_estimators():
    """Compare ``dcv.GridSearchCV`` with scikit-learn's ``GridSearchCV`` when
    the parameter grid swaps whole pipeline steps in and out.

    The grid contains three sub-grids: one with a duplicated ``svc__C`` value,
    one that sets the ``setup`` and ``scaling`` steps to estimators or
    ``None``, and one that replaces the ``setup`` and ``svc`` steps with
    estimator instances. Both searches are fitted on the iris data and their
    best parameters, ``cv_results_`` (timing columns excluded) and fitted SVC
    coefficients are required to agree.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    scaling = Pipeline([('transform', ScalingTransformer())])

    pipe = Pipeline([('setup', None), ('missing', None), ('scaling', scaling),
                     ('svc', SVC(kernel='linear', random_state=0))])

    param_grid = [
        {
            'svc__C': [0.1, 0.1]
        },  # Duplicates to test culling
        {
            'setup': [None],
            'svc__C': [0.1, 1, 10],
            'scaling': [ScalingTransformer(), None]
        },
        {
            'setup': [SelectKBest()],
            'setup__k': [1, 2],
            'svc': [
                SVC(kernel='linear', random_state=0, C=0.1),
                SVC(kernel='linear', random_state=0, C=1),
                SVC(kernel='linear', random_state=0, C=10)
            ]
        }
    ]

    gs = GridSearchCV(pipe, param_grid=param_grid, return_train_score=True)
    gs.fit(X, y)
    dgs = dcv.GridSearchCV(pipe,
                           param_grid=param_grid,
                           scheduler='sync',
                           return_train_score=True)
    dgs.fit(X, y)

    # Check best params match
    assert gs.best_params_ == dgs.best_params_

    # Check cv results match
    res = pd.DataFrame(dgs.cv_results_)
    sol = pd.DataFrame(gs.cv_results_)

    # TODO: Failures on Py36 / sklearn dev with order here.
    # Align the column order with the reference frame before comparing.
    res = res.reindex(columns=sol.columns)

    # Use the public ``pd.testing`` namespace; ``pd.util.testing`` is a
    # deprecated alias for the same helpers.
    pd.testing.assert_index_equal(res.columns, sol.columns)

    # Timing columns differ from run to run, so leave them out of the
    # comparison.
    skip = [
        'mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time'
    ]
    res = res.drop(skip, axis=1)
    sol = sol.drop(skip, axis=1)
    assert res.equals(sol)

    # Check SVC coefs match
    np.testing.assert_allclose(gs.best_estimator_.named_steps['svc'].coef_,
                               dgs.best_estimator_.named_steps['svc'].coef_)
示例#2
0
def test_pipeline_feature_union():
    """Check ``dcv.GridSearchCV`` against scikit-learn on a nested pipeline.

    The pipeline mixes empty sub-pipelines, ``None`` steps and a weighted
    ``FeatureUnion``. Both searches run with ``cv=3`` on the iris data, and
    the best parameters plus the fitted PCA components, SelectKBest scores and
    SVC coefficients of the best estimators must agree.
    """
    dataset = load_iris()
    X = dataset.data
    y = dataset.target

    union = FeatureUnion(
        [
            ("pca", PCA(random_state=0)),
            ("missing", None),
            ("kbest", SelectKBest()),
            ("empty_union", FeatureUnion([("first", None), ("second", None)])),
        ],
        transformer_weights={"pca": 0.5},
    )
    steps = [
        ("empty_pipeline", Pipeline([("first", None), ("second", None)])),
        ("scaling", Pipeline([("transform", ScalingTransformer())])),
        ("missing", None),
        ("union", union),
        ("svc", SVC(kernel="linear", random_state=0)),
    ]
    pipe = Pipeline(steps)

    param_grid = {
        "scaling__transform__factor": [1, 2],
        "union__pca__n_components": [1, 2, 3],
        "union__kbest__k": [1, 2],
        "svc__C": [0.1, 1, 10],
    }

    gs = GridSearchCV(pipe, param_grid=param_grid, cv=3, iid=True)
    gs.fit(X, y)
    dgs = dcv.GridSearchCV(pipe, param_grid=param_grid, scheduler="sync", cv=3)
    dgs.fit(X, y)

    # The same parameter combination must win in both searches.
    assert gs.best_params_ == dgs.best_params_

    def union_member(search, position):
        """Return the transformer at ``position`` in the best union."""
        best_union = search.best_estimator_.named_steps["union"]
        return best_union.transformer_list[position][1]

    # PCA sits at position 0 of the union.
    np.testing.assert_allclose(
        union_member(gs, 0).components_,
        union_member(dgs, 0).components_,
    )

    # SelectKBest sits at position 2 of the union.
    np.testing.assert_allclose(
        union_member(gs, 2).scores_,
        union_member(dgs, 2).scores_,
    )

    # The final classifier must have been fitted to the same coefficients.
    sk_svc = gs.best_estimator_.named_steps["svc"]
    dk_svc = dgs.best_estimator_.named_steps["svc"]
    np.testing.assert_allclose(sk_svc.coef_, dk_svc.coef_)
示例#3
0
def test_pipeline_feature_union():
    """Check ``dcv.GridSearchCV`` against scikit-learn on a nested pipeline.

    A pipeline holding empty sub-pipelines, ``None`` steps and a weighted
    ``FeatureUnion`` is searched with both implementations using their
    default cross-validation settings. The best parameters, PCA components,
    SelectKBest scores and SVC coefficients must agree.
    """
    dataset = load_iris()
    X, y = dataset.data, dataset.target

    # Innermost pieces first, then the containers that hold them.
    union_parts = [
        ('pca', PCA(random_state=0)),
        ('missing', None),
        ('kbest', SelectKBest()),
        ('empty_union', FeatureUnion([('first', None), ('second', None)])),
    ]
    pipe_steps = [
        ('empty_pipeline', Pipeline([('first', None), ('second', None)])),
        ('scaling', Pipeline([('transform', ScalingTransformer())])),
        ('missing', None),
        ('union', FeatureUnion(union_parts,
                               transformer_weights={'pca': 0.5})),
        ('svc', SVC(kernel='linear', random_state=0)),
    ]
    pipe = Pipeline(pipe_steps)

    param_grid = {
        'scaling__transform__factor': [1, 2],
        'union__pca__n_components': [1, 2, 3],
        'union__kbest__k': [1, 2],
        'svc__C': [0.1, 1, 10],
    }

    gs = GridSearchCV(pipe, param_grid=param_grid)
    gs.fit(X, y)
    dgs = dcv.GridSearchCV(pipe, param_grid=param_grid, scheduler='sync')
    dgs.fit(X, y)

    # Both searches must select the same parameters.
    assert gs.best_params_ == dgs.best_params_

    sk_steps = gs.best_estimator_.named_steps
    dk_steps = dgs.best_estimator_.named_steps
    sk_members = sk_steps['union'].transformer_list
    dk_members = dk_steps['union'].transformer_list

    # PCA (first union member): fitted components must match.
    np.testing.assert_allclose(sk_members[0][1].components_,
                               dk_members[0][1].components_)

    # SelectKBest (third union member): feature scores must match.
    np.testing.assert_allclose(sk_members[2][1].scores_,
                               dk_members[2][1].scores_)

    # Final SVC: coefficients must match.
    np.testing.assert_allclose(sk_steps['svc'].coef_, dk_steps['svc'].coef_)
示例#4
0
def test_feature_union(weights):
    """Fit ``dcv.GridSearchCV`` over a ``FeatureUnion`` whose members are
    reconfigured, replaced, or disabled by the parameter grid.

    For every combination of scaling factors and transformer weights the
    expected transform output is computed directly with
    ``FeatureUnion.set_params(...).transform(X)``. Its first row is handed to
    a ``CheckXClassifier`` that is placed in the grid as the ``est`` step.

    Parameters
    ----------
    weights : iterable or None
        Candidate ``transformer_weights`` values. When falsy, a single
        ``None`` (no weighting) is used instead.
    """
    X = np.ones((10, 5))
    y = np.zeros(10)

    def _fresh_union():
        # Build a union of three default-constructed scaling transformers.
        return FeatureUnion(
            [
                ("tr0", ScalingTransformer()),
                ("tr1", ScalingTransformer()),
                ("tr2", ScalingTransformer()),
            ]
        )

    union = _fresh_union()

    factors = [(2, 3, 5), (2, 4, 5), (2, 4, 6), (2, 4, None), (None, None, None)]
    params, sols, grid = [], [], []
    for constants, w in product(factors, weights or [None]):
        p = {}
        for n, c in enumerate(constants):
            if c is None:
                # Disable this member of the union entirely.
                p["tr%d" % n] = None
            elif n == 2:  # 3rd is always an estimator
                # ``enumerate`` yields 0, 1, 2 for a 3-tuple, so the third
                # member has index 2. The previous ``n == 3`` check could
                # never be true, so whole-estimator replacement was never
                # exercised.
                p["tr%d" % n] = ScalingTransformer(c)
            else:
                p["tr%d__factor" % n] = c
        sol = union.set_params(transformer_weights=w, **p).transform(X)
        sols.append(sol)
        if w is not None:
            p["transformer_weights"] = w
        params.append(p)
        # Prefix with the pipeline step name and wrap each value in a
        # single-element list so every dict is a one-candidate sub-grid.
        p2 = {"union__" + k: [v] for k, v in p.items()}
        p2["est"] = [CheckXClassifier(sol[0])]
        grid.append(p2)

    # Need to recreate the union after setting estimators to `None` above
    union = _fresh_union()

    pipe = Pipeline([("union", union), ("est", CheckXClassifier())])
    gs = dcv.GridSearchCV(pipe, grid, refit=False, cv=2)

    # Record warnings raised during fitting instead of letting them surface.
    with warnings.catch_warnings(record=True):
        gs.fit(X, y)
示例#5
0
def test_pipeline_sub_estimators():
    """Compare ``dcv.GridSearchCV`` with scikit-learn's ``GridSearchCV`` when
    the parameter grid swaps whole pipeline steps in and out.

    The grid contains three sub-grids: one with a duplicated ``svc__C`` value,
    one that sets the ``setup`` and ``scaling`` steps to estimators or
    ``None``, and one that replaces the ``setup`` and ``svc`` steps with
    estimator instances. Both searches are fitted on the iris data with
    ``cv=3`` and their best parameters, ``cv_results_`` (timing columns
    excluded, compared approximately) and fitted SVC coefficients are
    required to agree.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    scaling = Pipeline([("transform", ScalingTransformer())])

    pipe = Pipeline([
        ("setup", None),
        ("missing", None),
        ("scaling", scaling),
        ("svc", SVC(kernel="linear", random_state=0)),
    ])

    param_grid = [
        {
            "svc__C": [0.1, 0.1]
        },  # Duplicates to test culling
        {
            "setup": [None],
            "svc__C": [0.1, 1, 10],
            "scaling": [ScalingTransformer(), None],
        },
        {
            "setup": [SelectKBest()],
            "setup__k": [1, 2],
            "svc": [
                SVC(kernel="linear", random_state=0, C=0.1),
                SVC(kernel="linear", random_state=0, C=1),
                SVC(kernel="linear", random_state=0, C=10),
            ],
        },
    ]

    # ``iid`` is a mapping defined elsewhere in this module and forwarded as
    # extra keyword arguments to scikit-learn's search only.
    gs = GridSearchCV(pipe,
                      param_grid=param_grid,
                      return_train_score=True,
                      cv=3,
                      **iid)
    gs.fit(X, y)
    dgs = dcv.GridSearchCV(pipe,
                           param_grid=param_grid,
                           scheduler="sync",
                           return_train_score=True,
                           cv=3)
    dgs.fit(X, y)

    # Check best params match
    assert gs.best_params_ == dgs.best_params_

    # Check cv results match
    res = pd.DataFrame(dgs.cv_results_)
    sol = pd.DataFrame(gs.cv_results_)

    # TODO: Failures on Py36 / sklearn dev with order here.
    # Align the column order with the reference frame before comparing.
    res = res.reindex(columns=sol.columns)

    # Use the public ``pd.testing`` namespace; ``pd.util.testing`` is a
    # deprecated alias for the same helpers.
    pd.testing.assert_index_equal(res.columns, sol.columns)

    # Timing columns differ from run to run, so leave them out of the
    # comparison.
    skip = [
        "mean_fit_time", "std_fit_time", "mean_score_time", "std_score_time"
    ]
    res = res.drop(skip, axis=1)
    sol = sol.drop(skip, axis=1)
    # NOTE(review): ``check_less_precise`` is itself deprecated in newer
    # pandas in favour of ``rtol``/``atol``; it is kept here so the tolerance
    # this test was written against does not change.
    pd.testing.assert_frame_equal(res,
                                  sol,
                                  check_exact=False,
                                  check_less_precise=1)

    # Check SVC coefs match
    np.testing.assert_allclose(
        gs.best_estimator_.named_steps["svc"].coef_,
        dgs.best_estimator_.named_steps["svc"].coef_,
    )