示例#1
0
    def test_max_depth(self):
        """Generated feature depth stays within the configured growth bound.

        Runs the formula feature generator for ``n_epochs`` executor
        iterations and asserts that the deepest feature recorded in the
        resulting dataset meta does not exceed
        ``depth_limit + depth_step * n_epochs``.  It also asserts that
        ``Preprocessing.reproduce`` applied to freshly loaded data matches
        the pipeline's output dataset in shape and in values.
        """
        # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm
        # the pinned scikit-learn version still provides it.
        depth_limit = 2
        depth_step = 0.25
        n_epochs = 20
        candidates = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), n_epochs)

        # Built step by step; construction order matches a single chained
        # ``>>`` expression.
        pipeline = Pipeline() >> ModelSpace(candidates)
        pipeline = pipeline >> FormulaFeatureGenerator(
            ['+', '-', '*', '/'], max_depth=depth_limit, addition=depth_step)
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(30)
        _, outcome = executor << pipeline

        # Collect depths before reproducing, while the meta is untouched.
        observed_depths = [entry["depth"] for entry in outcome.dataset.meta]

        reproducer = Preprocessing()
        fresh = datasets.load_boston()
        rebuilt = reproducer.reproduce(outcome.dataset,
                                       Dataset(fresh.data, fresh.target))

        self.assertEqual(outcome.dataset.data.shape, rebuilt.shape)
        self.assertTrue((rebuilt == outcome.dataset.data).all())
        self.assertLessEqual(max(observed_depths),
                             depth_limit + depth_step * n_epochs)
示例#2
0
    def test_RFS(self):
        """Output of a RecursiveFeatureSelector pipeline can be reproduced.

        Runs model space -> formula features -> validation -> best model ->
        recursive feature selection on a synthetic classification problem,
        then asserts that ``Preprocessing.reproduce`` on the raw inputs
        matches the pipeline's output dataset in shape and in values.
        """
        features, labels = make_classification(
            n_samples=100,
            n_features=40,
            n_informative=2,
            n_redundant=10,
            flip_y=0.05,
        )

        classifiers = [
            (RandomForestClassifier, {}),
            (GradientBoostingClassifier, {}),
            (SVC, {}),
            (KNeighborsClassifier, {}),
            (XGBClassifier, {}),
        ]

        # The number of features to keep is drawn at random on every run.
        keep_count = random.randint(5, 30)

        executor = LocalExecutor(Dataset(features, labels), 2)
        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(classifiers), initializer=True)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*', '/'])
        pipeline = pipeline >> Validate(test_size=0.1, metrics=roc_auc_score)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> RecursiveFeatureSelector(
            n_features_to_select=keep_count)
        _, outcome = executor << pipeline

        reproducer = Preprocessing()
        rebuilt = reproducer.reproduce(outcome.dataset,
                                       Dataset(features, labels))
        self.assertEqual(outcome.dataset.data.shape, rebuilt.shape)
        self.assertTrue((rebuilt == outcome.dataset.data).all())
示例#3
0
    def test_correlated_feature_generator(self):
        """Output of a pipeline ending in CorrelatedFeatureSelector reproduces.

        The pipeline generates at most ``limit=10`` formula features, picks
        the model with the smallest mean absolute error, applies recursive
        feature selection and finally ``CorrelatedFeatureSelector`` with
        ``max_correlation=0.9``.  The test asserts that
        ``Preprocessing.reproduce`` on freshly loaded data matches the
        pipeline's output dataset in shape and in values.
        """
        # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm
        # the pinned scikit-learn version still provides it.
        regressors = [
            (RandomForestRegressor, {}),
            (GradientBoostingRegressor, {}),
            (SVR, {}),
            (XGBRegressor, {}),
        ]

        # The number of features to keep is drawn at random on every run.
        keep_count = random.randint(5, 30)

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), 5)

        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(regressors), initializer=True)
        pipeline = pipeline >> FormulaFeatureGenerator(
            ['+', '-', '*', '/'], limit=10)
        pipeline = pipeline >> Validate(
            test_size=0.1, metrics=mean_absolute_error)
        # Lower error is better, hence by_largest_score=False.
        pipeline = pipeline >> ChooseBest(1, by_largest_score=False)
        pipeline = pipeline >> RecursiveFeatureSelector(
            n_features_to_select=keep_count)
        pipeline = pipeline >> CorrelatedFeatureSelector(max_correlation=0.9)
        _, outcome = executor << pipeline

        reproducer = Preprocessing()
        fresh = datasets.load_boston()
        rebuilt = reproducer.reproduce(outcome.dataset,
                                       Dataset(fresh.data, fresh.target))
        self.assertEqual(outcome.dataset.data.shape, rebuilt.shape)
        self.assertTrue((rebuilt == outcome.dataset.data).all())
示例#4
0
    def test_voting_feature_selector(self):
        """Output of a VotingFeatureSelector pipeline can be reproduced.

        Runs model space -> formula features -> validation -> four best
        models (smallest mean absolute error) -> voting feature selection on
        a synthetic regression problem, then asserts that
        ``Preprocessing.reproduce`` on the raw inputs matches the pipeline's
        output dataset in shape and in values.  The chosen models, their
        scores and the final data shape are printed for inspection.
        """
        x, y = make_regression(
            n_samples=100,
            n_features=40,
            n_informative=2,
        )

        model_list = [(RandomForestRegressor, {}),
                      (GradientBoostingRegressor, {}), (SVR, {}),
                      (XGBRegressor, {})]

        data = Dataset(x, y)

        context, pipeline_data = LocalExecutor(data, 10) << (
            Pipeline() >> PipelineStep(
                'model space', ModelSpace(model_list), initializer=True) >>
            FormulaFeatureGenerator(['+', '-', '*', '/']) >> Validate(
                test_size=0.1, metrics=mean_absolute_error) >> ChooseBest(
                    4, by_largest_score=False) >> VotingFeatureSelector(
                        feature_to_select=10, reverse_score=True))

        preprocessing = Preprocessing()
        final_data = preprocessing.reproduce(pipeline_data.dataset,
                                             Dataset(x, y))
        self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
        self.assertTrue((final_data == pipeline_data.dataset.data).all())

        # Diagnostic output only; nothing below is asserted.
        print('0' * 30)
        for result in pipeline_data.return_val:
            print(result.model, result.score)
        print(pipeline_data.dataset.data.shape)
        print('0' * 30)
示例#5
0
    def test_pipeline_hyperopt(self):
        """Smoke-test a pipeline whose validation step is wrapped in Hyperopt.

        Builds five classifiers with their hyper-parameter spaces, runs
        model space -> formula features -> Hyperopt(Validate) -> best model
        -> feature selection, and prints the resulting models, scores and
        data shape.  There are no assertions; the test passes when the
        pipeline runs without raising.
        """
        features, labels = make_classification(
            n_samples=100,
            n_features=40,
            n_informative=2,
            n_redundant=10,
            flip_y=0.05,
        )
        searchable_models = [
            (RandomForestClassifier, random_forest_hp_space()),
            (GradientBoostingClassifier, grad_boosting_hp_space()),
            (SVC, svc_kernel_hp_space('rbf')),
            (KNeighborsClassifier, knn_hp_space()),
            (XGBClassifier, xgboost_hp_space()),
        ]

        executor = LocalExecutor(Dataset(features, labels), 2)
        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(searchable_models), initializer=True)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*'])
        pipeline = pipeline >> Hyperopt(
            Validate(test_size=0.1, metrics=roc_auc_score), max_evals=2)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(10)
        _, outcome = executor << pipeline

        # Diagnostic output only.
        separator = '0' * 30
        print(separator)
        for entry in outcome.return_val:
            print(entry.model, entry.score)
        print(outcome.dataset.data.shape)
        print(separator)
示例#6
0
    def test_pipeline(self):
        """A single PolynomialGenerator step yields a (2, 4) dataset.

        Feeds a 2x2 frame through a one-step pipeline using
        ``PolynomialGenerator(interaction_only=True, degree=4)`` and asserts
        the shape of the resulting data.
        """
        frame = pd.DataFrame([[1, 2], [3, 4]])
        source = Dataset(frame, None)
        generator = PolynomialGenerator(interaction_only=True, degree=4)

        executor = LocalExecutor(source)
        pipeline = Pipeline() >> PipelineStep('generate_features', generator)
        _, produced = executor << pipeline

        self.assertEqual(produced.dataset.data.shape, (2, 4))
示例#7
0
    def test_recovering_dataset_FFG(self):
        """Output of a FormulaFeatureGenerator pipeline can be reproduced.

        Runs model space -> formula features ('+', '-', '*') -> validation
        -> best model -> feature selection, then asserts that
        ``Preprocessing.reproduce`` on freshly loaded data matches the
        pipeline's output dataset in shape and in values.
        """
        # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm
        # the pinned scikit-learn version still provides it.
        candidates = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), 10)

        pipeline = Pipeline() >> ModelSpace(candidates)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*'])
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(30)
        _, outcome = executor << pipeline

        reproducer = Preprocessing()
        fresh = datasets.load_boston()
        rebuilt = reproducer.reproduce(outcome.dataset,
                                       Dataset(fresh.data, fresh.target))
        self.assertEqual(outcome.dataset.data.shape, rebuilt.shape)
        self.assertTrue((rebuilt == outcome.dataset.data).all())
示例#8
0
 def test_generate_formula_feature(self):
     """FormulaFeatureGenerator adds at most ``limit`` columns.

     Calls the generator directly on a 3x3 frame with a randomly drawn
     limit and asserts that the resulting column count does not exceed
     the original column count plus that limit.
     """
     rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
     frame = pd.DataFrame(rows)
     wrapped = PipelineData(Dataset(frame, None))
     # The limit is drawn at random on every run.
     max_new = random.randint(0, 100)
     generator = FormulaFeatureGenerator(['+', '*', '/', '-'], max_new)
     ctx = PipelineContext()
     produced = generator(wrapped, ctx)
     column_count = produced.dataset.data.shape[1]
     original_columns = np.array(rows).shape[1]
     self.assertLessEqual(column_count, original_columns + max_new)
示例#9
0
 def test_xgboost(self):
     """Hyperopt over XGBClassifier returns a search result with history.

     Asserts that the first returned value is a
     ``HyperparameterSearchResult`` and that its history holds exactly
     ``max_evals`` entries.
     """
     eval_budget = 2
     features, labels = make_classification()
     executor = LocalExecutor(Dataset(features, labels))

     pipeline = Pipeline() >> ModelSpace([
         (XGBClassifier, xgboost_hp_space()),
     ])
     pipeline = pipeline >> Hyperopt(CV('roc_auc'), max_evals=eval_budget)

     executed = executor << pipeline
     # Index 1 of the executor output is the pipeline data.
     search = executed[1].return_val[0]
     self.assertIsInstance(search, HyperparameterSearchResult)
     self.assertEqual(len(search.history), eval_budget)
示例#10
0
 def test_hyperopt(self):
     """Hyperopt over RandomForestClassifier records a Trials history.

     Asserts that the history of the first returned value is a
     ``hyperopt.base.Trials`` instance holding exactly ``max_evals``
     entries.
     """
     eval_budget = 2
     features, labels = make_classification()
     executor = LocalExecutor(Dataset(features, labels))

     pipeline = Pipeline() >> ModelSpace([
         (RandomForestClassifier, random_forest_hp_space()),
     ])
     pipeline = pipeline >> Hyperopt(CV('roc_auc'), max_evals=eval_budget)

     executed = executor << pipeline
     # Index 1 of the executor output is the pipeline data.
     search = executed[1].return_val[0]
     self.assertIsInstance(search.history, hyperopt.base.Trials)
     self.assertEqual(len(search.history), eval_budget)
示例#11
0
    def test_random_choice_combinator(self):
        """RandomChoice runs exactly one of the steps it was given.

        Executes a pipeline consisting of a single ``RandomChoice`` over two
        steps that return ``'a'`` and ``'b'`` respectively, ten times, and
        asserts each time that the second element of the executor output is
        one of those two values.
        """
        # Repeated because the choice is random; a single run could pass by
        # luck.
        for _ in range(10):
            result = LocalExecutor() << (Pipeline() >> RandomChoice([
                PipelineStep('a', lambda x, context: 'a'),
                PipelineStep('b', lambda x, context: 'b')
            ]))

            print(result)
            self.assertIn(result[1], ['a', 'b'])
示例#12
0
    def test_poly_gen(self):
        """Output of a PolynomialFeatureGenerator pipeline can be reproduced.

        Runs model space -> polynomial features (``max_degree=3``) ->
        validation -> best model -> feature selection on a synthetic
        regression problem, then asserts that ``Preprocessing.reproduce`` on
        the raw inputs matches the pipeline's output dataset in shape and in
        values.
        """
        candidates = [
            (Lasso, {}),
            #(Ridge, {}),
            (RandomForestRegressor, {}),
        ]

        features, targets = datasets.make_regression(n_features=5)

        executor = LocalExecutor(Dataset(features, targets), 10)
        pipeline = Pipeline() >> ModelSpace(candidates)
        pipeline = pipeline >> PolynomialFeatureGenerator(max_degree=3)
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(10)
        _, outcome = executor << pipeline

        reproducer = Preprocessing()
        rebuilt = reproducer.reproduce(outcome.dataset,
                                       Dataset(features, targets))
        self.assertEqual(outcome.dataset.data.shape, rebuilt.shape)
        self.assertTrue((rebuilt == outcome.dataset.data).all())
示例#13
0
    def test_step_space_regression_model(self):
        """Smoke-test ModelSpace -> Validate -> ChooseBest(3) on regressors.

        There are no assertions; the test passes when the pipeline runs
        without raising.
        """
        # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm
        # the pinned scikit-learn version still provides it.
        regressors = [
            (Lasso, {}),
            (Ridge, {}),
            (KernelRidge, {}),
        ]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target))

        pipeline = Pipeline() >> ModelSpace(regressors)
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_absolute_error)
        pipeline = pipeline >> ChooseBest(3)
        executor << pipeline
示例#14
0
    def test_forest(self):
        """Smoke-test Hyperopt over a RandomForestClassifier on iris.

        Both pipeline stages are wrapped in explicitly named
        ``PipelineStep`` objects.  There are no assertions; the test passes
        when the pipeline runs without raising.
        """
        searchable_models = [
            #(RandomForestRegressor, random_forest_hp_space('mae')),
            (RandomForestClassifier, random_forest_hp_space()),
        ]

        iris = datasets.load_iris()
        executor = LocalExecutor(Dataset(iris.data, iris.target), 1)

        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(searchable_models))
        # NOTE(review): mean_absolute_error is used as the metric for a
        # classifier here -- confirm this is intended.
        pipeline = pipeline >> PipelineStep(
            'H',
            Hyperopt(Validate(test_size=0.33, metrics=mean_absolute_error),
                     max_evals=2))
        _context, _outcome = executor << pipeline
示例#15
0
    def test_step_cv(self):
        """Smoke-test ModelSpace -> CV('accuracy') -> ChooseBest(3) on iris.

        There are no assertions; the test passes when the pipeline runs
        without raising.
        """
        classifiers = [
            (LogisticRegression, {}),
            (RandomForestClassifier, {'n_estimators': 100}),
            (GradientBoostingClassifier, {}),
            (SVC, {}),
            (KNeighborsClassifier, {}),
        ]

        iris = datasets.load_iris()
        executor = LocalExecutor(Dataset(iris.data, iris.target))

        pipeline = Pipeline() >> ModelSpace(classifiers)
        pipeline = pipeline >> CV('accuracy')
        pipeline = pipeline >> ChooseBest(3)
        executor << pipeline
示例#16
0
    def test_cv(self):
        """CV('accuracy', n_folds=5) agrees with sklearn's cross_val_score.

        Scores a ``RandomForestClassifier(random_state=1)`` on iris through
        the ``CV`` step and asserts that the score is almost equal to the
        mean of ``cross_val_score(..., cv=5)`` on the same data.
        """
        iris = datasets.load_iris()
        wrapped = PipelineData(Dataset(iris.data, iris.target))

        scorer = CV('accuracy', n_folds=5)
        # The fixed random_state keeps both forests comparable.
        step_score = scorer(
            wrapped, (RandomForestClassifier, {'random_state': 1})).score
        reference_score = cross_val_score(
            RandomForestClassifier(random_state=1),
            wrapped.dataset.data,
            wrapped.dataset.target,
            cv=5,
        ).mean()

        self.assertAlmostEqual(step_score, reference_score)
示例#17
0
    def test_call_generator(self):
        """SklearnFeatureGenerator calls fit_transform with the raw matrix.

        A mock transformer is handed to the generator through a factory
        callable.  After invoking the generator, the test asserts that
        ``fit_transform`` was called and that its first positional argument
        equals the values of the input frame.
        """
        transformer_mock = Mock()
        transformer_mock.fit_transform.return_value = []

        df = pd.DataFrame([[1, 2], [3, 4]])
        X = PipelineData(Dataset(df, None))
        context = PipelineContext()

        # The factory ignores its arguments and always hands back the mock.
        gen = SklearnFeatureGenerator(lambda *args, **kwargs: transformer_mock)
        gen(X, context)
        transformer_mock.fit_transform.assert_called()

        # DataFrame.as_matrix() was removed in pandas 1.0; .values returns
        # the same ndarray and exists in old and new pandas alike.
        passed_matrix = transformer_mock.fit_transform.call_args[0][0]
        self.assertTrue((passed_matrix == df.values).all())
示例#18
0
    def test_all_step(self):
        """Smoke-test the full pipeline with tree-based regressors.

        Runs model space -> formula features -> validation -> best model
        (smallest mean squared error) -> feature selection and prints the
        resulting models, scores and data shape.  There are no assertions;
        the test passes when the pipeline runs without raising.
        """
        # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm
        # the pinned scikit-learn version still provides it.
        regressors = [
            #(Lasso, {}),
            #(Ridge, {}),
            #(KernelRidge, {}),
            (RandomForestRegressor, {}),
            (XGBRegressor, {}),
        ]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), 10)

        pipeline = Pipeline() >> ModelSpace(regressors)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*'])
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1, by_largest_score=False)
        pipeline = pipeline >> FeatureSelector(20)
        _, outcome = executor << pipeline

        # Diagnostic output only.
        separator = '0' * 30
        print(separator)
        for entry in outcome.return_val:
            print(entry.model, entry.score)
        print(outcome.dataset.data.shape)
        print(separator)