Example #1
    def flow_to_sklearn_with_hack(self, flow):
        copyFlow = flow
        if copyFlow.flow_id == 5804:
            copyFlow.dependencies = u'sklearn==0.20.3\nnumpy>=1.13.0\nscipy>=1.0'
            for v in copyFlow.components.values():
                v.dependencies = u'sklearn==0.20.3\nnumpy>=1.13.0\nscipy>=1.0'

            return flows.flow_to_sklearn(copyFlow)

        else:
            return flows.flow_to_sklearn(copyFlow)
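flow_to_sklearn_with_hack pins the dependency string of flow 5804 (and of all its components) to sklearn==0.20.3 before handing the flow to flows.flow_to_sklearn, so that the dependency check during deserialization passes; note that copyFlow is only an alias of flow, so the original flow object is modified in place. A minimal usage sketch, assuming the standard openml client and that the method lives on some helper object (called converter here; that name is an assumption, not part of the snippet):

    import openml

    # Fetch the affected flow from the OpenML server (flow id taken from the hack above).
    flow = openml.flows.get_flow(5804)
    # Deserialize through the workaround instead of calling flow_to_sklearn directly.
    model = converter.flow_to_sklearn_with_hack(flow)
    print(type(model))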
Example #2
    def test_serialize_feature_union(self):
        ohe = sklearn.preprocessing.OneHotEncoder(sparse=False)
        scaler = sklearn.preprocessing.StandardScaler()
        fu = sklearn.pipeline.FeatureUnion(
            transformer_list=[('ohe', ohe), ('scaler', scaler)])
        serialization = sklearn_to_flow(fu)
        self.assertEqual(
            serialization.name, 'sklearn.pipeline.FeatureUnion('
            'ohe=sklearn.preprocessing.data.OneHotEncoder,'
            'scaler=sklearn.preprocessing.data.StandardScaler)')
        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(fu))
        self.assertIsNot(new_model, fu)
        self.assertEqual(new_model.transformer_list[0][0],
                         fu.transformer_list[0][0])
        self.assertEqual(new_model.transformer_list[0][1].get_params(),
                         fu.transformer_list[0][1].get_params())
        self.assertEqual(new_model.transformer_list[1][0],
                         fu.transformer_list[1][0])
        self.assertEqual(new_model.transformer_list[1][1].get_params(),
                         fu.transformer_list[1][1].get_params())

        self.assertEqual([step[0] for step in new_model.transformer_list],
                         [step[0] for step in fu.transformer_list])
        self.assertIsNot(new_model.transformer_list[0][1],
                         fu.transformer_list[0][1])
        self.assertIsNot(new_model.transformer_list[1][1],
                         fu.transformer_list[1][1])

        new_model_params = new_model.get_params()
        del new_model_params['ohe']
        del new_model_params['scaler']
        del new_model_params['transformer_list']
        fu_params = fu.get_params()
        del fu_params['ohe']
        del fu_params['scaler']
        del fu_params['transformer_list']

        self.assertEqual(new_model_params, fu_params)
        new_model.fit(self.X, self.y)

        fu.set_params(scaler=None)
        serialization = sklearn_to_flow(fu)
        self.assertEqual(
            serialization.name, 'sklearn.pipeline.FeatureUnion('
            'ohe=sklearn.preprocessing.data.OneHotEncoder)')
        new_model = flow_to_sklearn(serialization)
        self.assertEqual(type(new_model), type(fu))
        self.assertIsNot(new_model, fu)
        self.assertIs(new_model.transformer_list[1][1], None)
Example #3
    def test_serialize_feature_union(self):
        ohe = sklearn.preprocessing.OneHotEncoder(sparse=False)
        scaler = sklearn.preprocessing.StandardScaler()
        fu = sklearn.pipeline.FeatureUnion(transformer_list=[('ohe', ohe),
                                                             ('scaler', scaler)])
        serialization = sklearn_to_flow(fu)
        self.assertEqual(serialization.name,
                         'sklearn.pipeline.FeatureUnion('
                         'ohe=sklearn.preprocessing.data.OneHotEncoder,'
                         'scaler=sklearn.preprocessing.data.StandardScaler)')
        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(fu))
        self.assertIsNot(new_model, fu)
        self.assertEqual(new_model.transformer_list[0][0],
                         fu.transformer_list[0][0])
        self.assertEqual(new_model.transformer_list[0][1].get_params(),
                         fu.transformer_list[0][1].get_params())
        self.assertEqual(new_model.transformer_list[1][0],
                         fu.transformer_list[1][0])
        self.assertEqual(new_model.transformer_list[1][1].get_params(),
                         fu.transformer_list[1][1].get_params())

        self.assertEqual([step[0] for step in new_model.transformer_list],
                         [step[0] for step in fu.transformer_list])
        self.assertIsNot(new_model.transformer_list[0][1], fu.transformer_list[0][1])
        self.assertIsNot(new_model.transformer_list[1][1], fu.transformer_list[1][1])

        new_model_params = new_model.get_params()
        del new_model_params['ohe']
        del new_model_params['scaler']
        del new_model_params['transformer_list']
        fu_params = fu.get_params()
        del fu_params['ohe']
        del fu_params['scaler']
        del fu_params['transformer_list']

        self.assertEqual(new_model_params, fu_params)
        new_model.fit(self.X, self.y)

        fu.set_params(scaler=None)
        serialization = sklearn_to_flow(fu)
        self.assertEqual(serialization.name,
                         'sklearn.pipeline.FeatureUnion('
                         'ohe=sklearn.preprocessing.data.OneHotEncoder)')
        new_model = flow_to_sklearn(serialization)
        self.assertEqual(type(new_model), type(fu))
        self.assertIsNot(new_model, fu)
        self.assertIs(new_model.transformer_list[1][1], None)
Example #4
    def test_serialize_complex_flow(self):
        ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0])
        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        boosting = sklearn.ensemble.AdaBoostClassifier(
            base_estimator=sklearn.tree.DecisionTreeClassifier())
        model = sklearn.pipeline.Pipeline(steps=(
            ('ohe', ohe), ('scaler', scaler), ('boosting', boosting)))
        parameter_grid = {'n_estimators': [1, 5, 10, 100],
                          'learning_rate': scipy.stats.uniform(0.01, 0.99),
                          'base_estimator__max_depth': scipy.stats.randint(1,
                                                                           10)}
        cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
        rs = sklearn.model_selection.RandomizedSearchCV(
            estimator=model, param_distributions=parameter_grid, cv=cv)
        serialized = sklearn_to_flow(rs)

        fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV(' \
                       'estimator=sklearn.pipeline.Pipeline(' \
                       'ohe=sklearn.preprocessing.data.OneHotEncoder,' \
                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
                       'boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier(' \
                       'base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))'
        self.assertEqual(serialized.name, fixture_name)

        # now do deserialization
        deserialized = flow_to_sklearn(serialized)

        # Checks that sklearn_to_flow is idempotent.
        serialized2 = sklearn_to_flow(deserialized)
        self.assertNotEqual(rs, deserialized)
        # Would raise an exception if the flows were unequal
        assert_flows_equal(serialized, serialized2)
Example #5
    def test_serialize_complex_flow(self):
        ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0])
        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        boosting = sklearn.ensemble.AdaBoostClassifier(
            base_estimator=sklearn.tree.DecisionTreeClassifier())
        model = sklearn.pipeline.Pipeline(steps=(('ohe', ohe), ('scaler',
                                                                scaler),
                                                 ('boosting', boosting)))
        parameter_grid = {
            'n_estimators': [1, 5, 10, 100],
            'learning_rate': scipy.stats.uniform(0.01, 0.99),
            'base_estimator__max_depth': scipy.stats.randint(1, 10)
        }
        cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
        rs = sklearn.model_selection.RandomizedSearchCV(
            estimator=model, param_distributions=parameter_grid, cv=cv)
        serialized = sklearn_to_flow(rs)

        fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV(' \
                       'estimator=sklearn.pipeline.Pipeline(' \
                       'ohe=sklearn.preprocessing.data.OneHotEncoder,' \
                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
                       'boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier(' \
                       'base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))'
        self.assertEqual(serialized.name, fixture_name)

        # now do deserialization
        deserialized = flow_to_sklearn(serialized)

        # Checks that sklearn_to_flow is idempotent.
        serialized2 = sklearn_to_flow(deserialized)
        self.assertNotEqual(rs, deserialized)
        # Would raise an exception if the flows were unequal
        assert_flows_equal(serialized, serialized2)
Example #6
    def test_serialize_cvobject(self):
        methods = [
            sklearn.model_selection.KFold(3),
            sklearn.model_selection.LeaveOneOut()
        ]
        fixtures = [
            OrderedDict([
                ('oml-python:serialized_object', 'cv_object'),
                ('value',
                 OrderedDict([('name', 'sklearn.model_selection._split.KFold'),
                              ('parameters',
                               OrderedDict([('n_splits', '3'),
                                            ('random_state', 'null'),
                                            ('shuffle', 'false')]))]))
            ]),
            OrderedDict([
                ('oml-python:serialized_object', 'cv_object'),
                ('value',
                 OrderedDict([('name',
                               'sklearn.model_selection._split.LeaveOneOut'),
                              ('parameters', OrderedDict())]))
            ])
        ]
        for method, fixture in zip(methods, fixtures):
            m = sklearn_to_flow(method)
            self.assertEqual(m, fixture)

            m_new = flow_to_sklearn(m)
            self.assertIsNot(m_new, m)
            self.assertIsInstance(m_new, type(method))
Example #7
    def test_serialize_type(self):
        supported_types = [float, np.float, np.float32, np.float64,
                           int, np.int, np.int32, np.int64]

        for supported_type in supported_types:
            serialized = sklearn_to_flow(supported_type)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(deserialized, supported_type)
Example #8
    def test_serialize_resampling(self):
        kfold = sklearn.model_selection.StratifiedKFold(
            n_splits=4, shuffle=True)
        serialized = sklearn_to_flow(kfold)
        deserialized = flow_to_sklearn(serialized)
        # Best approximation to get_params()
        self.assertEqual(str(deserialized), str(kfold))
        self.assertIsNot(deserialized, kfold)
Example #9
    def test_serialize_resampling(self):
        kfold = sklearn.model_selection.StratifiedKFold(n_splits=4,
                                                        shuffle=True)
        serialized = sklearn_to_flow(kfold)
        deserialized = flow_to_sklearn(serialized)
        # Best approximation to get_params()
        self.assertEqual(str(deserialized), str(kfold))
        self.assertIsNot(deserialized, kfold)
Example #10
    def test_serialize_pipeline(self):
        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        dummy = sklearn.dummy.DummyClassifier(strategy='prior')
        model = sklearn.pipeline.Pipeline(steps=(
            ('scaler', scaler), ('dummy', dummy)))

        fixture_name = 'sklearn.pipeline.Pipeline(' \
                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
                       'dummy=sklearn.dummy.DummyClassifier)'
        fixture_description = 'Automatically created scikit-learn flow.'

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)

        # Comparing the pipeline
        # The parameters only have the name of the base objects (not the
        # whole flow) as value
        self.assertEqual(len(serialization.parameters), 1)
        # Comparing two dict representations is hard due to possibly
        # different sorting; converting to JSON makes it easier
        self.assertEqual(
            json.loads(serialization.parameters['steps']),
            [{'oml-python:serialized_object': 'component_reference',
              'value': {'key': 'scaler', 'step_name': 'scaler'}},
             {'oml-python:serialized_object': 'component_reference',
              'value': {'key': 'dummy', 'step_name': 'dummy'}}])

        # Checking the sub-component
        self.assertEqual(len(serialization.components), 2)
        self.assertIsInstance(serialization.components['scaler'],
                              OpenMLFlow)
        self.assertIsInstance(serialization.components['dummy'],
                              OpenMLFlow)

        # del serialization.model
        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual([step[0] for step in new_model.steps],
                         [step[0] for step in model.steps])
        self.assertIsNot(new_model.steps[0][1], model.steps[0][1])
        self.assertIsNot(new_model.steps[1][1], model.steps[1][1])

        new_model_params = new_model.get_params()
        del new_model_params['scaler']
        del new_model_params['dummy']
        del new_model_params['steps']
        fu_params = model.get_params()
        del fu_params['scaler']
        del fu_params['dummy']
        del fu_params['steps']

        self.assertEqual(new_model_params, fu_params)
        new_model.fit(self.X, self.y)
Example #11
    def test_serialize_type(self):
        supported_types = [
            float, np.float, np.float32, np.float64, int, np.int, np.int32,
            np.int64
        ]

        for supported_type in supported_types:
            serialized = sklearn_to_flow(supported_type)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(deserialized, supported_type)
Example #12
    def test_hypothetical_parameter_values(self):
        # The hypothetical parameter values of true, 1, 0.1 formatted as a
        # string (and their correct serialization and deserialization) can
        # only be checked inside a model

        model = Model('true', '1', '0.1')

        serialized = sklearn_to_flow(model)
        deserialized = flow_to_sklearn(serialized)
        self.assertEqual(deserialized.get_params(), model.get_params())
        self.assertIsNot(deserialized, model)
Example #13
    def test_hypothetical_parameter_values(self):
        # The hypothetical parameter values of true, 1, 0.1 formatted as a
        # string (and their correct serialization and deserialization) can
        # only be checked inside a model

        model = Model('true', '1', '0.1')

        serialized = sklearn_to_flow(model)
        deserialized = flow_to_sklearn(serialized)
        self.assertEqual(deserialized.get_params(), model.get_params())
        self.assertIsNot(deserialized, model)
Example #14
    def test_serialize_advanced_grid(self):
        # TODO instead a GridSearchCV object should be serialized

        # This needs to be in its own function because we cannot simply check
        # the grid for equality: scikit-learn objects don't really support
        # the equality operator.
        # This will only work with sklearn==0.18.
        N_FEATURES_OPTIONS = [2, 4, 8]
        C_OPTIONS = [1, 10, 100, 1000]
        grid = [{
            'reduce_dim': [
                sklearn.decomposition.PCA(iterated_power=7),
                sklearn.decomposition.NMF()
            ],
            'reduce_dim__n_components':
            N_FEATURES_OPTIONS,
            'classify__C':
            C_OPTIONS
        }, {
            'reduce_dim': [
                sklearn.feature_selection.SelectKBest(
                    sklearn.feature_selection.chi2)
            ],
            'reduce_dim__k':
            N_FEATURES_OPTIONS,
            'classify__C':
            C_OPTIONS
        }]

        serialized = sklearn_to_flow(grid)
        deserialized = flow_to_sklearn(serialized)

        self.assertEqual(grid[0]['reduce_dim'][0].get_params(),
                         deserialized[0]['reduce_dim'][0].get_params())
        self.assertIsNot(grid[0]['reduce_dim'][0],
                         deserialized[0]['reduce_dim'][0])
        self.assertEqual(grid[0]['reduce_dim'][1].get_params(),
                         deserialized[0]['reduce_dim'][1].get_params())
        self.assertIsNot(grid[0]['reduce_dim'][1],
                         deserialized[0]['reduce_dim'][1])
        self.assertEqual(grid[0]['reduce_dim__n_components'],
                         deserialized[0]['reduce_dim__n_components'])
        self.assertEqual(grid[0]['classify__C'],
                         deserialized[0]['classify__C'])
        self.assertEqual(grid[1]['reduce_dim'][0].get_params(),
                         deserialized[1]['reduce_dim'][0].get_params())
        self.assertIsNot(grid[1]['reduce_dim'][0],
                         deserialized[1]['reduce_dim'][0])
        self.assertEqual(grid[1]['reduce_dim__k'],
                         deserialized[1]['reduce_dim__k'])
        self.assertEqual(grid[1]['classify__C'],
                         deserialized[1]['classify__C'])
Example #15
    def test_serialize_rvs(self):
        supported_rvs = [scipy.stats.norm(loc=1, scale=5),
                         scipy.stats.expon(loc=1, scale=5),
                         scipy.stats.randint(low=-3, high=15)]

        for supported_rv in supported_rvs:
            serialized = sklearn_to_flow(supported_rv)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(type(deserialized.dist), type(supported_rv.dist))
            del deserialized.dist
            del supported_rv.dist
            self.assertEqual(deserialized.__dict__,
                             supported_rv.__dict__)
Example #16
    def test_serialize_simple_parameter_grid(self):

        # We cannot easily test for scipy random variables in here, but they
        # should be covered

        # Examples from the scikit-learn documentation
        models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()]
        grids = \
            [[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
               'kernel': ['rbf']}],
             {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}]

        for grid, model in zip(grids, models):
            serialized = sklearn_to_flow(grid)
            deserialized = flow_to_sklearn(serialized)

            self.assertEqual(deserialized, grid)
            self.assertIsNot(deserialized, grid)

            hpo = sklearn.model_selection.GridSearchCV(param_grid=grid,
                                                       estimator=model)

            serialized = sklearn_to_flow(hpo)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(hpo.param_grid, deserialized.param_grid)
            self.assertEqual(hpo.estimator.get_params(),
                             deserialized.estimator.get_params())
            hpo_params = hpo.get_params(deep=False)
            deserialized_params = deserialized.get_params(deep=False)
            del hpo_params['estimator']
            del deserialized_params['estimator']
            self.assertEqual(hpo_params, deserialized_params)
Example #17
    def test_serialize_simple_parameter_grid(self):

        # We cannot easily test for scipy random variables in here, but they
        # should be covered

        # Examples from the scikit-learn documentation
        models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()]
        grids = \
            [[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
               'kernel': ['rbf']}],
             {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}]

        for grid, model in zip(grids, models):
            serialized = sklearn_to_flow(grid)
            deserialized = flow_to_sklearn(serialized)

            self.assertEqual(deserialized, grid)
            self.assertIsNot(deserialized, grid)

            hpo = sklearn.model_selection.GridSearchCV(
                param_grid=grid, estimator=model)

            serialized = sklearn_to_flow(hpo)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(hpo.param_grid, deserialized.param_grid)
            self.assertEqual(hpo.estimator.get_params(),
                             deserialized.estimator.get_params())
            hpo_params = hpo.get_params(deep=False)
            deserialized_params = deserialized.get_params(deep=False)
            del hpo_params['estimator']
            del deserialized_params['estimator']
            self.assertEqual(hpo_params, deserialized_params)
Example #18
    def test_serialize_rvs(self):
        supported_rvs = [
            scipy.stats.norm(loc=1, scale=5),
            scipy.stats.expon(loc=1, scale=5),
            scipy.stats.randint(low=-3, high=15)
        ]

        for supported_rv in supported_rvs:
            serialized = sklearn_to_flow(supported_rv)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(type(deserialized.dist), type(supported_rv.dist))
            del deserialized.dist
            del supported_rv.dist
            self.assertEqual(deserialized.__dict__, supported_rv.__dict__)
Example #19
    def test_serialize_model_with_subcomponent(self):
        model = sklearn.ensemble.AdaBoostClassifier(
            n_estimators=100,
            base_estimator=sklearn.tree.DecisionTreeClassifier())

        fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \
                       '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
        fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
        fixture_description = 'Automatically created scikit-learn flow.'
        fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_subcomponent_description = 'Automatically created scikit-learn flow.'

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_class_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"')
        self.assertIsInstance(serialization.parameters['base_estimator'], str)
        self.assertEqual(serialization.parameters['learning_rate'], '1.0')
        self.assertEqual(serialization.parameters['n_estimators'], '100')
        self.assertEqual(serialization.components['base_estimator'].name,
                         fixture_subcomponent_name)
        self.assertEqual(serialization.components['base_estimator'].class_name,
                         fixture_subcomponent_class_name)
        self.assertEqual(
            serialization.components['base_estimator'].description,
            fixture_subcomponent_description)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertIsNot(new_model.base_estimator, model.base_estimator)
        self.assertEqual(new_model.base_estimator.get_params(),
                         model.base_estimator.get_params())
        new_model_params = new_model.get_params()
        del new_model_params['base_estimator']
        model_params = model.get_params()
        del model_params['base_estimator']

        self.assertEqual(new_model_params, model_params)
        new_model.fit(self.X, self.y)
Example #20
    def test_serialize_cvobject(self):
        methods = [sklearn.model_selection.KFold(3),
                   sklearn.model_selection.LeaveOneOut()]
        fixtures = [OrderedDict([('oml-python:serialized_object', 'cv_object'),
                                 ('value', OrderedDict([('name', 'sklearn.model_selection._split.KFold'),
                                                        ('parameters', OrderedDict([('n_splits', '3'),
                                                                                    ('random_state', 'null'),
                                                                                    ('shuffle', 'false')]))]))]),
                    OrderedDict([('oml-python:serialized_object', 'cv_object'),
                                 ('value', OrderedDict([('name', 'sklearn.model_selection._split.LeaveOneOut'),
                                                        ('parameters', OrderedDict())]))])]
        for method, fixture in zip(methods, fixtures):
            m = sklearn_to_flow(method)
            self.assertEqual(m, fixture)

            m_new = flow_to_sklearn(m)
            self.assertIsNot(m_new, m)
            self.assertIsInstance(m_new, type(method))
Example #21
    def test_serialize_model_with_subcomponent(self):
        model = sklearn.ensemble.AdaBoostClassifier(
            n_estimators=100, base_estimator=sklearn.tree.DecisionTreeClassifier())

        fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \
                       '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
        fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
        fixture_description = 'Automatically created scikit-learn flow.'
        fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_subcomponent_description = 'Automatically created scikit-learn flow.'

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_class_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"')
        self.assertIsInstance(serialization.parameters['base_estimator'], str)
        self.assertEqual(serialization.parameters['learning_rate'], '1.0')
        self.assertEqual(serialization.parameters['n_estimators'], '100')
        self.assertEqual(serialization.components['base_estimator'].name,
                         fixture_subcomponent_name)
        self.assertEqual(serialization.components['base_estimator'].class_name,
                         fixture_subcomponent_class_name)
        self.assertEqual(serialization.components['base_estimator'].description,
                         fixture_subcomponent_description)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertIsNot(new_model.base_estimator, model.base_estimator)
        self.assertEqual(new_model.base_estimator.get_params(),
                         model.base_estimator.get_params())
        new_model_params = new_model.get_params()
        del new_model_params['base_estimator']
        model_params = model.get_params()
        del model_params['base_estimator']

        self.assertEqual(new_model_params, model_params)
        new_model.fit(self.X, self.y)
Example #22
    def test_serialize_advanced_grid(self):
        # TODO instead a GridSearchCV object should be serialized

        # This needs to be in its own function because we cannot simply check
        # the grid for equality: scikit-learn objects don't really support
        # the equality operator.
        # This will only work with sklearn==0.18.
        N_FEATURES_OPTIONS = [2, 4, 8]
        C_OPTIONS = [1, 10, 100, 1000]
        grid = [{'reduce_dim': [sklearn.decomposition.PCA(iterated_power=7),
                                sklearn.decomposition.NMF()],
                 'reduce_dim__n_components': N_FEATURES_OPTIONS,
                 'classify__C': C_OPTIONS},
                {'reduce_dim': [sklearn.feature_selection.SelectKBest(
                                sklearn.feature_selection.chi2)],
                 'reduce_dim__k': N_FEATURES_OPTIONS,
                 'classify__C': C_OPTIONS}]

        serialized = sklearn_to_flow(grid)
        deserialized = flow_to_sklearn(serialized)

        self.assertEqual(grid[0]['reduce_dim'][0].get_params(),
                         deserialized[0]['reduce_dim'][0].get_params())
        self.assertIsNot(grid[0]['reduce_dim'][0],
                         deserialized[0]['reduce_dim'][0])
        self.assertEqual(grid[0]['reduce_dim'][1].get_params(),
                         deserialized[0]['reduce_dim'][1].get_params())
        self.assertIsNot(grid[0]['reduce_dim'][1],
                         deserialized[0]['reduce_dim'][1])
        self.assertEqual(grid[0]['reduce_dim__n_components'],
                         deserialized[0]['reduce_dim__n_components'])
        self.assertEqual(grid[0]['classify__C'],
                         deserialized[0]['classify__C'])
        self.assertEqual(grid[1]['reduce_dim'][0].get_params(),
                         deserialized[1]['reduce_dim'][0].get_params())
        self.assertIsNot(grid[1]['reduce_dim'][0],
                         deserialized[1]['reduce_dim'][0])
        self.assertEqual(grid[1]['reduce_dim__k'],
                         deserialized[1]['reduce_dim__k'])
        self.assertEqual(grid[1]['classify__C'],
                         deserialized[1]['classify__C'])
Example #23
    def test_serialize_model(self, check_dependencies_mock):
        model = sklearn.tree.DecisionTreeClassifier(criterion='entropy',
                                                    max_features='auto',
                                                    max_leaf_nodes=2000)

        fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_description = 'Automatically created scikit-learn flow.'
        version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
                          % sklearn.__version__
        fixture_parameters = \
            OrderedDict((('class_weight', 'null'),
                         ('criterion', '"entropy"'),
                         ('max_depth', 'null'),
                         ('max_features', '"auto"'),
                         ('max_leaf_nodes', '2000'),
                         ('min_impurity_split', '1e-07'),
                         ('min_samples_leaf', '1'),
                         ('min_samples_split', '2'),
                         ('min_weight_fraction_leaf', '0.0'),
                         ('presort', 'false'),
                         ('random_state', 'null'),
                         ('splitter', '"best"')))

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters, fixture_parameters)
        self.assertEqual(serialization.dependencies, version_fixture)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual(new_model.get_params(), model.get_params())
        new_model.fit(self.X, self.y)

        self.assertEqual(check_dependencies_mock.call_count, 1)
Example #24
    def test_serialize_model(self, check_dependencies_mock):
        model = sklearn.tree.DecisionTreeClassifier(criterion='entropy',
                                                    max_features='auto',
                                                    max_leaf_nodes=2000)

        fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_description = 'Automatically created scikit-learn flow.'
        version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
                          % sklearn.__version__
        fixture_parameters = \
            OrderedDict((('class_weight', 'null'),
                         ('criterion', '"entropy"'),
                         ('max_depth', 'null'),
                         ('max_features', '"auto"'),
                         ('max_leaf_nodes', '2000'),
                         ('min_impurity_split', '1e-07'),
                         ('min_samples_leaf', '1'),
                         ('min_samples_split', '2'),
                         ('min_weight_fraction_leaf', '0.0'),
                         ('presort', 'false'),
                         ('random_state', 'null'),
                         ('splitter', '"best"')))

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters, fixture_parameters)
        self.assertEqual(serialization.dependencies, version_fixture)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual(new_model.get_params(), model.get_params())
        new_model.fit(self.X, self.y)

        self.assertEqual(check_dependencies_mock.call_count, 1)
Example #25
    def test_serialize_model_clustering(self, check_dependencies_mock):
        model = sklearn.cluster.KMeans()

        fixture_name = 'sklearn.cluster.k_means_.KMeans'
        fixture_description = 'Automatically created scikit-learn flow.'
        version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
                          % sklearn.__version__
        fixture_parameters = \
            OrderedDict((('algorithm', '"auto"'),
                         ('copy_x', 'true'),
                         ('init', '"k-means++"'),
                         ('max_iter', '300'),
                         ('n_clusters', '8'),
                         ('n_init', '10'),
                         ('n_jobs', '1'),
                         ('precompute_distances', '"auto"'),
                         ('random_state', 'null'),
                         ('tol', '0.0001'),
                         ('verbose', '0')))

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters, fixture_parameters)
        self.assertEqual(serialization.dependencies, version_fixture)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual(new_model.get_params(), model.get_params())
        new_model.fit(self.X)

        self.assertEqual(check_dependencies_mock.call_count, 1)
Example #26
def setups_to_configspace(setups,
                          default_params,
                          keyfield='parameter_name',
                          logscale_parameters=None,
                          ignore_parameters=None,
                          ignore_constants=True):
    # setups is result from openml.setups.list_setups call
    # note that this config space is not equal to the one
    # obtained from auto-sklearn; but useful for creating
    # the pcs file
    parameter_values = {}
    flow_id = None
    for setup_id in setups:
        current = setups[setup_id]
        if flow_id is None:
            flow_id = current.flow_id
        else:
            if current.flow_id != flow_id:
                raise ValueError(
                    'flow ids are expected to be equal. Expected %d, saw %s' %
                    (flow_id, current.flow_id))

        for param_id in current.parameters.keys():
            name = getattr(current.parameters[param_id], keyfield)
            value = current.parameters[param_id].value
            if name not in parameter_values.keys():
                parameter_values[name] = set()
            parameter_values[name].add(value)

    uncovered = set(parameter_values.keys()) - set(default_params.keys())
    if len(uncovered) > 0:
        raise ValueError(
            'Mismatch between keys of default_params and parameter_values. '
            'Missing: %s' % str(uncovered))

    def is_castable_to(value, type):
        try:
            type(value)
            return True
        except ValueError:
            return False

    cs = ConfigurationSpace()
    if logscale_parameters is None:
        logscale_parameters = set()
    # for parameter in logscale_parameters:
    #     if parameter not in parameter_values.keys():
    #         raise ValueError('(Logscale) Parameter not recognized: %s' %parameter)

    constants = set()
    for name in parameter_values.keys():
        if ignore_parameters is not None and name in ignore_parameters:
            continue

        all_values = parameter_values[name]
        if len(all_values) <= 1:
            constants.add(name)
            if ignore_constants:
                continue

        if all(is_castable_to(item, int) for item in all_values):
            all_values = [int(item) for item in all_values]
            lower = min(all_values)
            upper = max(all_values)
            default = default_params[name]
            if not is_castable_to(default, int):
                sys.stderr.write(
                    'Illegal default for parameter %s (expected int): %s' %
                    (name, str(default)))
                default = int((lower + upper) / 2)

            hyper = UniformIntegerHyperparameter(
                name=name,
                lower=lower,
                upper=upper,
                default=default,
                log=name in logscale_parameters)
            cs.add_hyperparameter(hyper)
        elif all(is_castable_to(item, float) for item in all_values):
            all_values = [float(item) for item in all_values]
            lower = min(all_values)
            upper = max(all_values)
            default = default_params[name]
            if not is_castable_to(default, float):
                sys.stderr.write(
                    'Illegal default for parameter %s (expected float): %s' %
                    (name, str(default)))
                default = (lower + upper) / 2

            hyper = UniformFloatHyperparameter(name=name,
                                               lower=lower,
                                               upper=upper,
                                               default=default,
                                               log=name in logscale_parameters)
            cs.add_hyperparameter(hyper)
        else:
            values = [flow_to_sklearn(item) for item in all_values]
            hyper = CategoricalHyperparameter(name=name,
                                              choices=values,
                                              default=default_params[name])
            cs.add_hyperparameter(hyper)
    return cs, constants
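setups_to_configspace collects, per hyperparameter name, every value observed across the given setups and turns them into a ConfigurationSpace: a UniformIntegerHyperparameter when all observed values cast to int, a UniformFloatHyperparameter when they cast to float, and a CategoricalHyperparameter otherwise; names with at most one observed value are returned in the constants set. A minimal usage sketch, assuming the setups come from openml.setups.list_setups; the flow id, parameter names and defaults below are purely illustrative, and default_params must provide a default for every parameter name that occurs in the setups:

    import openml

    FLOW_ID = 6969  # illustrative flow id
    setups = openml.setups.list_setups(flow=FLOW_ID, size=500)

    # Illustrative defaults; the keys must cover every parameter seen in the setups.
    default_params = {'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}

    cs, constants = setups_to_configspace(setups,
                                          default_params,
                                          logscale_parameters={'C', 'gamma'})
    print(cs)
    print('constant parameters:', constants)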
Example #27
    def test_serialize_function(self):
        serialized = sklearn_to_flow(sklearn.feature_selection.chi2)
        deserialized = flow_to_sklearn(serialized)
        self.assertEqual(deserialized, sklearn.feature_selection.chi2)
Example #28
    def test_serialize_pipeline_clustering(self):
        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        km = sklearn.cluster.KMeans()
        model = sklearn.pipeline.Pipeline(steps=(('scaler', scaler),
                                                 ('clusterer', km)))

        fixture_name = 'sklearn.pipeline.Pipeline(' \
                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
                       'clusterer=sklearn.cluster.k_means_.KMeans)'
        fixture_description = 'Automatically created scikit-learn flow.'

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)

        # Comparing the pipeline
        # The parameters only have the name of the base objects (not the
        # whole flow) as value
        self.assertEqual(len(serialization.parameters), 1)
        # Comparing two dict representations is hard due to possibly
        # different sorting; converting to JSON makes it easier
        self.assertEqual(
            json.loads(serialization.parameters['steps']),
            [{
                'oml-python:serialized_object': 'component_reference',
                'value': {
                    'key': 'scaler',
                    'step_name': 'scaler'
                }
            }, {
                'oml-python:serialized_object': 'component_reference',
                'value': {
                    'key': 'clusterer',
                    'step_name': 'clusterer'
                }
            }])

        # Checking the sub-component
        self.assertEqual(len(serialization.components), 2)
        self.assertIsInstance(serialization.components['scaler'], OpenMLFlow)
        self.assertIsInstance(serialization.components['clusterer'],
                              OpenMLFlow)

        # del serialization.model
        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual([step[0] for step in new_model.steps],
                         [step[0] for step in model.steps])
        self.assertIsNot(new_model.steps[0][1], model.steps[0][1])
        self.assertIsNot(new_model.steps[1][1], model.steps[1][1])

        new_model_params = new_model.get_params()
        del new_model_params['scaler']
        del new_model_params['clusterer']
        del new_model_params['steps']
        fu_params = model.get_params()
        del fu_params['scaler']
        del fu_params['clusterer']
        del fu_params['steps']

        self.assertEqual(new_model_params, fu_params)
        new_model.fit(self.X, self.y)
Example #29
    def test_serialize_function(self):
        serialized = sklearn_to_flow(sklearn.feature_selection.chi2)
        deserialized = flow_to_sklearn(serialized)
        self.assertEqual(deserialized, sklearn.feature_selection.chi2)
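All of the examples above exercise the same round trip: sklearn_to_flow serializes a scikit-learn object into an OpenML flow, and flow_to_sklearn rebuilds an equivalent but distinct object from that flow. A minimal sketch of the pattern, assuming the two helpers are importable from the openml-python scikit-learn converter used in these tests:

    import sklearn.tree

    clf = sklearn.tree.DecisionTreeClassifier(max_depth=3)

    flow = sklearn_to_flow(clf)        # scikit-learn estimator -> OpenML flow
    clf_copy = flow_to_sklearn(flow)   # OpenML flow -> fresh scikit-learn estimator

    # The round trip yields a new object with identical hyperparameters.
    assert type(clf_copy) is type(clf)
    assert clf_copy is not clf
    assert clf_copy.get_params() == clf.get_params()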