def test_feature_engineering():
    """Tests feature engineering"""
    prob = mock_problem()

    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(), feature_engineering=PCA(n_components=2))
    model = classifier.fit(prob).model
    steps = dict(model.steps)

    nose.tools.ok_('PCA' in str(classifier))
    nose.tools.ok_('feature_engineering' in steps)
    nose.tools.assert_is_not_none(steps['feature_engineering'].components_)

    # Check that classifier.apply() works
    nose.tools.eq_(len(classifier.apply(prob)), prob.n_samples)

    # Test that SelectAndClassify still works without feature engineering
    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression())
    model = classifier.fit(prob).model
    steps = dict(model.steps)
    nose.tools.ok_('PCA' not in str(classifier))
    nose.tools.ok_('feature_engineering' not in steps)
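

# Illustrative sketch only: the helper name and the 'select'/'classify' step names below are
# assumptions, not part of SelectAndClassify. It shows the plain sklearn pipeline that the
# fitted model checked above is expected to resemble, with the feature engineering transform
# sandwiched between feature selection and classification.
def _reference_feature_engineering_pipeline():
    """Builds a plain sklearn Pipeline roughly equivalent to
    SelectAndClassify(SelectKBest(k='all'), LogisticRegression(), feature_engineering=PCA(n_components=2))."""
    from sklearn.pipeline import Pipeline  # local import to keep the sketch self-contained
    return Pipeline([('select', SelectKBest(k='all')),
                     ('feature_engineering', PCA(n_components=2)),
                     ('classify', LogisticRegression())])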


def test_grid_search():
    """Tests exhaustive grid search over the selector and classifier parameter grids"""
    # NOTE: the enclosing test name is an assumption; checkme is the original utility.
    def checkme(selector_grid, classifier_grid, optimal_params):
        """Utility: runs grid search and verifies that we selected the right parameters"""
        prob = mock_problem()
        learner = SelectAndClassify(SelectKBest(), LogisticRegression(), selector_grid=selector_grid,
                                    classifier_grid=classifier_grid,
                                    grid_search_scorer=make_test_grid_scorer(optimal_params),
                                    grid_search_cv_folds=2, grid_search_cv_repartitions=1,
                                    randomized_grid_size_cutoff=None)
        model_params = learner.fit(prob).model.get_params()
        params_to_check = sorted(optimal_params.keys())
        nose.tools.assert_list_equal([(k, model_params[k]) for k in params_to_check],
                                     [(k, optimal_params[k]) for k in params_to_check])
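

# Sketch of the kind of scorer factory the grid search tests rely on. The real
# make_test_grid_scorer is defined elsewhere in this module; the name and body below are
# assumptions for illustration. The idea: the scorer is maximal (0.0) when the fitted model's
# parameters equal optimal_params, so a correct grid search should converge on those values.
def _example_grid_scorer_factory(optimal_params):
    """Returns an sklearn-style scorer(estimator, X, y) that penalizes distance from optimal_params."""
    def scorer(estimator, X, y):  # X and y are unused; only the chosen parameters matter
        params = estimator.get_params()
        return -sum(abs(float(params[name]) - float(value))
                    for name, value in optimal_params.items())
    return scorer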


def test_randomized_grid_search():
    """Tests randomized grid search when the full parameter grid exceeds the size cutoff"""
    # NOTE: the test name and the error_log accumulator are assumptions; checkme is the original utility.
    error_log = []

    def checkme(optimal_params):
        """Utility: runs grid search and verifies that we selected (approximately) the right parameters"""
        np.random.seed(0xC0FFEE)
        prob = mock_problem(n_samples=100)
        learner = SelectAndClassify(SelectKBest(), LogisticRegression(), selector_grid={'k': [10, 20]},
                                    classifier_grid={'C': np.linspace(0.5, 1.0, 1000)},
                                    grid_search_scorer=make_test_grid_scorer(optimal_params),
                                    grid_search_cv_folds=2, grid_search_cv_repartitions=1,
                                    randomized_grid_size_cutoff=100)
        model_params = learner.fit(prob).model.get_params()
        for param_name in sorted(optimal_params.keys()):
            # Might not be exactly optimal, but should be close
            tolerance = 0.05 * abs(optimal_params[param_name])
            nose.tools.assert_almost_equal(model_params[param_name], optimal_params[param_name],
                                           delta=tolerance)

        # Record how well the selected parameters score so the enclosing test can inspect it
        error_log.append(make_test_grid_scorer(optimal_params)(learner.model, prob.X, prob.y))
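

# Why checkme only asserts approximate optimality: the grids above define
# 2 values of k times 1000 values of C = 2000 candidate settings, which exceeds
# randomized_grid_size_cutoff=100, so the search presumably samples a random subset of
# settings and the chosen C is near-optimal rather than exactly optimal. The helper below is
# an illustrative sketch of that arithmetic (its name is an assumption, not project API).
def _example_full_grid_size(selector_grid, classifier_grid):
    """Number of parameter settings in the exhaustive (non-randomized) grid."""
    size = 1
    for values in list(selector_grid.values()) + list(classifier_grid.values()):
        size *= len(values)
    return size


# For the grids used in checkme:
# _example_full_grid_size({'k': [10, 20]}, {'C': np.linspace(0.5, 1.0, 1000)}) == 2000 > 100.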