def flow_to_sklearn_with_hack(self, flow):
    """Convert ``flow`` via ``flows.flow_to_sklearn``, patching flow 5804.

    When ``flow.flow_id`` is 5804, the ``dependencies`` string of the flow
    and of every value in ``flow.components`` is overwritten before the
    conversion. No copy is made: the flow passed in is modified in place.

    Returns whatever ``flows.flow_to_sklearn`` returns for the flow.
    """
    pinned_dependencies = u'sklearn==0.20.3\nnumpy>=1.13.0\nscipy>=1.0'
    if flow.flow_id == 5804:
        flow.dependencies = pinned_dependencies
        for component in flow.components.values():
            component.dependencies = pinned_dependencies
    return flows.flow_to_sklearn(flow)
def test_serialize_feature_union(self):
    """Round-trip a FeatureUnion through sklearn_to_flow / flow_to_sklearn.

    Checks the flow name, that the rebuilt union holds equal-but-distinct
    transformers and can be fitted, and that a step set to None vanishes
    from the flow name while being restored as None.
    """
    encoder = sklearn.preprocessing.OneHotEncoder(sparse=False)
    standardizer = sklearn.preprocessing.StandardScaler()
    union = sklearn.pipeline.FeatureUnion(
        transformer_list=[('ohe', encoder), ('scaler', standardizer)])

    flow = sklearn_to_flow(union)
    expected_name = ('sklearn.pipeline.FeatureUnion('
                     'ohe=sklearn.preprocessing.data.OneHotEncoder,'
                     'scaler=sklearn.preprocessing.data.StandardScaler)')
    self.assertEqual(flow.name, expected_name)

    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(union))
    self.assertIsNot(rebuilt, union)

    # Step names and step parameters must match position by position.
    for position in (0, 1):
        rebuilt_step = rebuilt.transformer_list[position]
        source_step = union.transformer_list[position]
        self.assertEqual(rebuilt_step[0], source_step[0])
        self.assertEqual(rebuilt_step[1].get_params(),
                         source_step[1].get_params())
    self.assertEqual([step[0] for step in rebuilt.transformer_list],
                     [step[0] for step in union.transformer_list])
    # The transformers must be new objects, not the originals.
    for position in (0, 1):
        self.assertIsNot(rebuilt.transformer_list[position][1],
                         union.transformer_list[position][1])

    # Drop the entries holding estimator objects before comparing params.
    object_keys = ('ohe', 'scaler', 'transformer_list')
    rebuilt_params = rebuilt.get_params()
    for key in object_keys:
        del rebuilt_params[key]
    union_params = union.get_params()
    for key in object_keys:
        del union_params[key]
    self.assertEqual(rebuilt_params, union_params)
    rebuilt.fit(self.X, self.y)

    # Disable the scaler step and serialize again.
    union.set_params(scaler=None)
    flow = sklearn_to_flow(union)
    expected_name = ('sklearn.pipeline.FeatureUnion('
                     'ohe=sklearn.preprocessing.data.OneHotEncoder)')
    self.assertEqual(flow.name, expected_name)
    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(union))
    self.assertIsNot(rebuilt, union)
    self.assertIs(rebuilt.transformer_list[1][1], None)
def test_serialize_feature_union(self):
    """Round-trip a FeatureUnion through sklearn_to_flow / flow_to_sklearn.

    Checks the flow name, that the rebuilt union holds equal-but-distinct
    transformers and can be fitted, and that a step set to None vanishes
    from the flow name while being restored as None.
    """
    encoder = sklearn.preprocessing.OneHotEncoder(sparse=False)
    standardizer = sklearn.preprocessing.StandardScaler()
    union = sklearn.pipeline.FeatureUnion(
        transformer_list=[('ohe', encoder), ('scaler', standardizer)])

    flow = sklearn_to_flow(union)
    expected_name = ('sklearn.pipeline.FeatureUnion('
                     'ohe=sklearn.preprocessing.data.OneHotEncoder,'
                     'scaler=sklearn.preprocessing.data.StandardScaler)')
    self.assertEqual(flow.name, expected_name)

    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(union))
    self.assertIsNot(rebuilt, union)

    # Step names and step parameters must match position by position.
    for position in (0, 1):
        rebuilt_step = rebuilt.transformer_list[position]
        source_step = union.transformer_list[position]
        self.assertEqual(rebuilt_step[0], source_step[0])
        self.assertEqual(rebuilt_step[1].get_params(),
                         source_step[1].get_params())
    self.assertEqual([step[0] for step in rebuilt.transformer_list],
                     [step[0] for step in union.transformer_list])
    # The transformers must be new objects, not the originals.
    for position in (0, 1):
        self.assertIsNot(rebuilt.transformer_list[position][1],
                         union.transformer_list[position][1])

    # Drop the entries holding estimator objects before comparing params.
    object_keys = ('ohe', 'scaler', 'transformer_list')
    rebuilt_params = rebuilt.get_params()
    for key in object_keys:
        del rebuilt_params[key]
    union_params = union.get_params()
    for key in object_keys:
        del union_params[key]
    self.assertEqual(rebuilt_params, union_params)
    rebuilt.fit(self.X, self.y)

    # Disable the scaler step and serialize again.
    union.set_params(scaler=None)
    flow = sklearn_to_flow(union)
    expected_name = ('sklearn.pipeline.FeatureUnion('
                     'ohe=sklearn.preprocessing.data.OneHotEncoder)')
    self.assertEqual(flow.name, expected_name)
    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(union))
    self.assertIsNot(rebuilt, union)
    self.assertIs(rebuilt.transformer_list[1][1], None)
def test_serialize_complex_flow(self):
    """Serialize a RandomizedSearchCV wrapping a three-step pipeline.

    Verifies the generated flow name and that serializing the deserialized
    object again yields an equal flow.
    """
    encoder = sklearn.preprocessing.OneHotEncoder(categorical_features=[0])
    standardizer = sklearn.preprocessing.StandardScaler(with_mean=False)
    booster = sklearn.ensemble.AdaBoostClassifier(
        base_estimator=sklearn.tree.DecisionTreeClassifier())
    pipeline = sklearn.pipeline.Pipeline(steps=(
        ('ohe', encoder),
        ('scaler', standardizer),
        ('boosting', booster),
    ))
    distributions = {
        'n_estimators': [1, 5, 10, 100],
        'learning_rate': scipy.stats.uniform(0.01, 0.99),
        'base_estimator__max_depth': scipy.stats.randint(1, 10),
    }
    folds = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
    search = sklearn.model_selection.RandomizedSearchCV(
        estimator=pipeline, param_distributions=distributions, cv=folds)

    flow = sklearn_to_flow(search)
    expected_name = (
        'sklearn.model_selection._search.RandomizedSearchCV('
        'estimator=sklearn.pipeline.Pipeline('
        'ohe=sklearn.preprocessing.data.OneHotEncoder,'
        'scaler=sklearn.preprocessing.data.StandardScaler,'
        'boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier('
        'base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))'
    )
    self.assertEqual(flow.name, expected_name)

    # Deserialize, then serialize once more: both flows have to agree.
    rebuilt = flow_to_sklearn(flow)
    flow_again = sklearn_to_flow(rebuilt)
    self.assertNotEqual(search, rebuilt)
    # assert_flows_equal is expected to raise when the flows differ.
    assert_flows_equal(flow, flow_again)
def test_serialize_complex_flow(self):
    """Serialize a RandomizedSearchCV wrapping a three-step pipeline.

    Verifies the generated flow name and that serializing the deserialized
    object again yields an equal flow.
    """
    encoder = sklearn.preprocessing.OneHotEncoder(categorical_features=[0])
    standardizer = sklearn.preprocessing.StandardScaler(with_mean=False)
    booster = sklearn.ensemble.AdaBoostClassifier(
        base_estimator=sklearn.tree.DecisionTreeClassifier())
    pipeline = sklearn.pipeline.Pipeline(steps=(
        ('ohe', encoder),
        ('scaler', standardizer),
        ('boosting', booster),
    ))
    distributions = {
        'n_estimators': [1, 5, 10, 100],
        'learning_rate': scipy.stats.uniform(0.01, 0.99),
        'base_estimator__max_depth': scipy.stats.randint(1, 10),
    }
    folds = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
    search = sklearn.model_selection.RandomizedSearchCV(
        estimator=pipeline, param_distributions=distributions, cv=folds)

    flow = sklearn_to_flow(search)
    expected_name = (
        'sklearn.model_selection._search.RandomizedSearchCV('
        'estimator=sklearn.pipeline.Pipeline('
        'ohe=sklearn.preprocessing.data.OneHotEncoder,'
        'scaler=sklearn.preprocessing.data.StandardScaler,'
        'boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier('
        'base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))'
    )
    self.assertEqual(flow.name, expected_name)

    # Deserialize, then serialize once more: both flows have to agree.
    rebuilt = flow_to_sklearn(flow)
    flow_again = sklearn_to_flow(rebuilt)
    self.assertNotEqual(search, rebuilt)
    # assert_flows_equal is expected to raise when the flows differ.
    assert_flows_equal(flow, flow_again)
def test_serialize_cvobject(self):
    """Check the serialized form of cross-validation objects and the round trip."""

    def cv_fixture(class_path, parameters):
        # Expected serialization of one cross-validation object.
        return OrderedDict([
            ('oml-python:serialized_object', 'cv_object'),
            ('value', OrderedDict([('name', class_path),
                                   ('parameters', parameters)])),
        ])

    cases = [
        (sklearn.model_selection.KFold(3),
         cv_fixture('sklearn.model_selection._split.KFold',
                    OrderedDict([('n_splits', '3'),
                                 ('random_state', 'null'),
                                 ('shuffle', 'false')]))),
        (sklearn.model_selection.LeaveOneOut(),
         cv_fixture('sklearn.model_selection._split.LeaveOneOut',
                    OrderedDict())),
    ]
    for splitter, expected in cases:
        serialized = sklearn_to_flow(splitter)
        self.assertEqual(serialized, expected)
        restored = flow_to_sklearn(serialized)
        self.assertIsNot(restored, serialized)
        self.assertIsInstance(restored, type(splitter))
def test_serialize_type(self):
    """Check that supported scalar types survive a flow round trip.

    ``np.float`` and ``np.int`` are deliberately not listed: they were plain
    aliases of the builtins ``float`` and ``int`` (both covered below),
    were deprecated in NumPy 1.20 and removed in NumPy 1.24, where merely
    referencing them raises AttributeError.
    """
    supported_types = [
        float, np.float32, np.float64,
        int, np.int32, np.int64,
    ]
    for supported_type in supported_types:
        serialized = sklearn_to_flow(supported_type)
        deserialized = flow_to_sklearn(serialized)
        self.assertEqual(deserialized, supported_type)
def test_serialize_resampling(self):
    """Round-trip a StratifiedKFold splitter through the flow format."""
    splitter = sklearn.model_selection.StratifiedKFold(
        n_splits=4, shuffle=True)
    restored = flow_to_sklearn(sklearn_to_flow(splitter))
    # Comparing the string forms is the closest stand-in for get_params().
    self.assertEqual(str(restored), str(splitter))
    self.assertIsNot(restored, splitter)
def test_serialize_resampling(self):
    """Round-trip a StratifiedKFold splitter through the flow format."""
    splitter = sklearn.model_selection.StratifiedKFold(
        n_splits=4, shuffle=True)
    restored = flow_to_sklearn(sklearn_to_flow(splitter))
    # Comparing the string forms is the closest stand-in for get_params().
    self.assertEqual(str(restored), str(splitter))
    self.assertIsNot(restored, splitter)
def test_serialize_pipeline(self):
    """Round-trip a scaler + dummy-classifier Pipeline through the flow format."""
    pipeline = sklearn.pipeline.Pipeline(steps=(
        ('scaler', sklearn.preprocessing.StandardScaler(with_mean=False)),
        ('dummy', sklearn.dummy.DummyClassifier(strategy='prior')),
    ))
    expected_name = ('sklearn.pipeline.Pipeline('
                     'scaler=sklearn.preprocessing.data.StandardScaler,'
                     'dummy=sklearn.dummy.DummyClassifier)')
    expected_description = 'Automatically created scikit-learn flow.'

    flow = sklearn_to_flow(pipeline)
    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.description, expected_description)

    # The pipeline flow has a single parameter; it only names the
    # sub-components instead of embedding their whole flows.
    self.assertEqual(len(flow.parameters), 1)
    # Decode the JSON before comparing so that key ordering cannot matter.
    expected_steps = [
        {'oml-python:serialized_object': 'component_reference',
         'value': {'key': step_name, 'step_name': step_name}}
        for step_name in ('scaler', 'dummy')
    ]
    self.assertEqual(json.loads(flow.parameters['steps']), expected_steps)

    # Each step must be present as its own sub-flow.
    self.assertEqual(len(flow.components), 2)
    for step_name in ('scaler', 'dummy'):
        self.assertIsInstance(flow.components[step_name], OpenMLFlow)

    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(pipeline))
    self.assertIsNot(rebuilt, pipeline)
    self.assertEqual([step[0] for step in rebuilt.steps],
                     [step[0] for step in pipeline.steps])
    for position in (0, 1):
        self.assertIsNot(rebuilt.steps[position][1],
                         pipeline.steps[position][1])

    # Drop the entries holding estimator objects before comparing params.
    object_keys = ('scaler', 'dummy', 'steps')
    rebuilt_params = rebuilt.get_params()
    for key in object_keys:
        del rebuilt_params[key]
    pipeline_params = pipeline.get_params()
    for key in object_keys:
        del pipeline_params[key]
    self.assertEqual(rebuilt_params, pipeline_params)
    rebuilt.fit(self.X, self.y)
def test_serialize_type(self):
    """Check that supported scalar types survive a flow round trip.

    ``np.float`` and ``np.int`` are deliberately not listed: they were plain
    aliases of the builtins ``float`` and ``int`` (both covered below),
    were deprecated in NumPy 1.20 and removed in NumPy 1.24, where merely
    referencing them raises AttributeError.
    """
    supported_types = [
        float, np.float32, np.float64,
        int, np.int32, np.int64,
    ]
    for supported_type in supported_types:
        serialized = sklearn_to_flow(supported_type)
        deserialized = flow_to_sklearn(serialized)
        self.assertEqual(deserialized, supported_type)
def test_hypothetical_parameter_values(self):
    """Round-trip a model whose parameters are the strings 'true', '1', '0.1'.

    Such string-formatted values (and their correct serialization and
    deserialization) can only be checked inside a model.
    """
    original = Model('true', '1', '0.1')
    restored = flow_to_sklearn(sklearn_to_flow(original))
    self.assertEqual(restored.get_params(), original.get_params())
    self.assertIsNot(restored, original)
def test_serialize_advanced_grid(self):
    """Round-trip a parameter grid whose values include estimator objects.

    TODO: a GridSearchCV object should be serialized instead.

    The grid gets its own test and is compared piece by piece because
    scikit-learn objects do not really support the equality operator.
    NOTE: according to the original author this only works with
    sklearn==0.18.
    """
    n_features_options = [2, 4, 8]
    c_options = [1, 10, 100, 1000]
    pca_nmf_entry = {
        'reduce_dim': [sklearn.decomposition.PCA(iterated_power=7),
                       sklearn.decomposition.NMF()],
        'reduce_dim__n_components': n_features_options,
        'classify__C': c_options,
    }
    kbest_entry = {
        'reduce_dim': [sklearn.feature_selection.SelectKBest(
            sklearn.feature_selection.chi2)],
        'reduce_dim__k': n_features_options,
        'classify__C': c_options,
    }
    grid = [pca_nmf_entry, kbest_entry]

    deserialized = flow_to_sklearn(sklearn_to_flow(grid))

    # Keys of each grid entry whose values are plain lists.
    plain_keys_per_entry = (
        ('reduce_dim__n_components', 'classify__C'),
        ('reduce_dim__k', 'classify__C'),
    )
    for entry_idx, plain_keys in enumerate(plain_keys_per_entry):
        source_entry = grid[entry_idx]
        restored_entry = deserialized[entry_idx]
        # Estimators: equal parameters, but distinct objects.
        for pos in range(len(source_entry['reduce_dim'])):
            self.assertEqual(
                source_entry['reduce_dim'][pos].get_params(),
                restored_entry['reduce_dim'][pos].get_params())
            self.assertIsNot(source_entry['reduce_dim'][pos],
                             restored_entry['reduce_dim'][pos])
        for key in plain_keys:
            self.assertEqual(source_entry[key], restored_entry[key])
def test_serialize_rvs(self):
    """Round-trip frozen scipy.stats distributions through the flow format."""
    distributions = (
        scipy.stats.norm(loc=1, scale=5),
        scipy.stats.expon(loc=1, scale=5),
        scipy.stats.randint(low=-3, high=15),
    )
    for original in distributions:
        restored = flow_to_sklearn(sklearn_to_flow(original))
        self.assertEqual(type(restored.dist), type(original.dist))
        # Remove ``dist`` from both objects, then compare what is left of
        # their instance dictionaries.
        del restored.dist
        del original.dist
        self.assertEqual(restored.__dict__, original.__dict__)
def test_serialize_simple_parameter_grid(self):
    """Round-trip plain parameter grids, alone and inside a GridSearchCV.

    scipy random variables cannot easily be tested here. The grids are
    examples taken from the scikit-learn documentation.
    """
    svc_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
         'kernel': ['rbf']},
    ]
    forest_grid = {
        "max_depth": [3, None],
        "max_features": [1, 3, 10],
        "min_samples_split": [1, 3, 10],
        "min_samples_leaf": [1, 3, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
    }
    estimators = [sklearn.svm.SVC(),
                  sklearn.ensemble.RandomForestClassifier()]

    for grid, estimator in zip([svc_grid, forest_grid], estimators):
        # The bare grid must come back equal, but as a new object.
        restored_grid = flow_to_sklearn(sklearn_to_flow(grid))
        self.assertEqual(restored_grid, grid)
        self.assertIsNot(restored_grid, grid)

        # The same grid wrapped in a grid search.
        search = sklearn.model_selection.GridSearchCV(
            param_grid=grid, estimator=estimator)
        restored_search = flow_to_sklearn(sklearn_to_flow(search))
        self.assertEqual(search.param_grid, restored_search.param_grid)
        self.assertEqual(search.estimator.get_params(),
                         restored_search.estimator.get_params())

        # Compare the shallow parameters without the estimator object.
        search_params = search.get_params(deep=False)
        restored_params = restored_search.get_params(deep=False)
        del search_params['estimator']
        del restored_params['estimator']
        self.assertEqual(search_params, restored_params)
def test_serialize_simple_parameter_grid(self):
    """Round-trip plain parameter grids, alone and inside a GridSearchCV.

    scipy random variables cannot easily be tested here. The grids are
    examples taken from the scikit-learn documentation.
    """
    svc_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
         'kernel': ['rbf']},
    ]
    forest_grid = {
        "max_depth": [3, None],
        "max_features": [1, 3, 10],
        "min_samples_split": [1, 3, 10],
        "min_samples_leaf": [1, 3, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
    }
    estimators = [sklearn.svm.SVC(),
                  sklearn.ensemble.RandomForestClassifier()]

    for grid, estimator in zip([svc_grid, forest_grid], estimators):
        # The bare grid must come back equal, but as a new object.
        restored_grid = flow_to_sklearn(sklearn_to_flow(grid))
        self.assertEqual(restored_grid, grid)
        self.assertIsNot(restored_grid, grid)

        # The same grid wrapped in a grid search.
        search = sklearn.model_selection.GridSearchCV(
            param_grid=grid, estimator=estimator)
        restored_search = flow_to_sklearn(sklearn_to_flow(search))
        self.assertEqual(search.param_grid, restored_search.param_grid)
        self.assertEqual(search.estimator.get_params(),
                         restored_search.estimator.get_params())

        # Compare the shallow parameters without the estimator object.
        search_params = search.get_params(deep=False)
        restored_params = restored_search.get_params(deep=False)
        del search_params['estimator']
        del restored_params['estimator']
        self.assertEqual(search_params, restored_params)
def test_serialize_rvs(self):
    """Round-trip frozen scipy.stats distributions through the flow format."""
    distributions = (
        scipy.stats.norm(loc=1, scale=5),
        scipy.stats.expon(loc=1, scale=5),
        scipy.stats.randint(low=-3, high=15),
    )
    for original in distributions:
        restored = flow_to_sklearn(sklearn_to_flow(original))
        self.assertEqual(type(restored.dist), type(original.dist))
        # Remove ``dist`` from both objects, then compare what is left of
        # their instance dictionaries.
        del restored.dist
        del original.dist
        self.assertEqual(restored.__dict__, original.__dict__)
def test_serialize_model_with_subcomponent(self):
    """Round-trip an AdaBoostClassifier that wraps a DecisionTreeClassifier.

    Checks the flow metadata and parameters, the nested component flow,
    and that the rebuilt model is an equal-but-distinct, fittable object.
    """
    booster = sklearn.ensemble.AdaBoostClassifier(
        n_estimators=100,
        base_estimator=sklearn.tree.DecisionTreeClassifier())

    expected_name = ('sklearn.ensemble.weight_boosting.AdaBoostClassifier'
                     '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)')
    expected_class_name = \
        'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
    expected_description = 'Automatically created scikit-learn flow.'
    expected_sub_name = 'sklearn.tree.tree.DecisionTreeClassifier'
    expected_sub_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
    expected_sub_description = 'Automatically created scikit-learn flow.'

    flow = sklearn_to_flow(booster)

    # Top-level flow metadata and parameters.
    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.class_name, expected_class_name)
    self.assertEqual(flow.description, expected_description)
    parameters = flow.parameters
    self.assertEqual(parameters['algorithm'], '"SAMME.R"')
    self.assertIsInstance(parameters['base_estimator'], str)
    self.assertEqual(parameters['learning_rate'], '1.0')
    self.assertEqual(parameters['n_estimators'], '100')

    # The wrapped tree is stored as a component flow.
    sub_flow = flow.components['base_estimator']
    self.assertEqual(sub_flow.name, expected_sub_name)
    self.assertEqual(sub_flow.class_name, expected_sub_class_name)
    self.assertEqual(sub_flow.description, expected_sub_description)

    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(booster))
    self.assertIsNot(rebuilt, booster)
    self.assertIsNot(rebuilt.base_estimator, booster.base_estimator)
    self.assertEqual(rebuilt.base_estimator.get_params(),
                     booster.base_estimator.get_params())

    # Compare the remaining parameters without the estimator object.
    rebuilt_params = rebuilt.get_params()
    del rebuilt_params['base_estimator']
    booster_params = booster.get_params()
    del booster_params['base_estimator']
    self.assertEqual(rebuilt_params, booster_params)
    rebuilt.fit(self.X, self.y)
def test_serialize_cvobject(self):
    """Check the serialized form of cross-validation objects and the round trip."""

    def cv_fixture(class_path, parameters):
        # Expected serialization of one cross-validation object.
        return OrderedDict([
            ('oml-python:serialized_object', 'cv_object'),
            ('value', OrderedDict([('name', class_path),
                                   ('parameters', parameters)])),
        ])

    cases = [
        (sklearn.model_selection.KFold(3),
         cv_fixture('sklearn.model_selection._split.KFold',
                    OrderedDict([('n_splits', '3'),
                                 ('random_state', 'null'),
                                 ('shuffle', 'false')]))),
        (sklearn.model_selection.LeaveOneOut(),
         cv_fixture('sklearn.model_selection._split.LeaveOneOut',
                    OrderedDict())),
    ]
    for splitter, expected in cases:
        serialized = sklearn_to_flow(splitter)
        self.assertEqual(serialized, expected)
        restored = flow_to_sklearn(serialized)
        self.assertIsNot(restored, serialized)
        self.assertIsInstance(restored, type(splitter))
def test_serialize_model_with_subcomponent(self):
    """Round-trip an AdaBoostClassifier that wraps a DecisionTreeClassifier.

    Checks the flow metadata and parameters, the nested component flow,
    and that the rebuilt model is an equal-but-distinct, fittable object.
    """
    booster = sklearn.ensemble.AdaBoostClassifier(
        n_estimators=100,
        base_estimator=sklearn.tree.DecisionTreeClassifier())

    expected_name = ('sklearn.ensemble.weight_boosting.AdaBoostClassifier'
                     '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)')
    expected_class_name = \
        'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
    expected_description = 'Automatically created scikit-learn flow.'
    expected_sub_name = 'sklearn.tree.tree.DecisionTreeClassifier'
    expected_sub_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
    expected_sub_description = 'Automatically created scikit-learn flow.'

    flow = sklearn_to_flow(booster)

    # Top-level flow metadata and parameters.
    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.class_name, expected_class_name)
    self.assertEqual(flow.description, expected_description)
    parameters = flow.parameters
    self.assertEqual(parameters['algorithm'], '"SAMME.R"')
    self.assertIsInstance(parameters['base_estimator'], str)
    self.assertEqual(parameters['learning_rate'], '1.0')
    self.assertEqual(parameters['n_estimators'], '100')

    # The wrapped tree is stored as a component flow.
    sub_flow = flow.components['base_estimator']
    self.assertEqual(sub_flow.name, expected_sub_name)
    self.assertEqual(sub_flow.class_name, expected_sub_class_name)
    self.assertEqual(sub_flow.description, expected_sub_description)

    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(booster))
    self.assertIsNot(rebuilt, booster)
    self.assertIsNot(rebuilt.base_estimator, booster.base_estimator)
    self.assertEqual(rebuilt.base_estimator.get_params(),
                     booster.base_estimator.get_params())

    # Compare the remaining parameters without the estimator object.
    rebuilt_params = rebuilt.get_params()
    del rebuilt_params['base_estimator']
    booster_params = booster.get_params()
    del booster_params['base_estimator']
    self.assertEqual(rebuilt_params, booster_params)
    rebuilt.fit(self.X, self.y)
def test_serialize_advanced_grid(self):
    """Round-trip a parameter grid whose values include estimator objects.

    TODO: a GridSearchCV object should be serialized instead.

    The grid gets its own test and is compared piece by piece because
    scikit-learn objects do not really support the equality operator.
    NOTE: according to the original author this only works with
    sklearn==0.18.
    """
    n_features_options = [2, 4, 8]
    c_options = [1, 10, 100, 1000]
    pca_nmf_entry = {
        'reduce_dim': [sklearn.decomposition.PCA(iterated_power=7),
                       sklearn.decomposition.NMF()],
        'reduce_dim__n_components': n_features_options,
        'classify__C': c_options,
    }
    kbest_entry = {
        'reduce_dim': [sklearn.feature_selection.SelectKBest(
            sklearn.feature_selection.chi2)],
        'reduce_dim__k': n_features_options,
        'classify__C': c_options,
    }
    grid = [pca_nmf_entry, kbest_entry]

    deserialized = flow_to_sklearn(sklearn_to_flow(grid))

    # Keys of each grid entry whose values are plain lists.
    plain_keys_per_entry = (
        ('reduce_dim__n_components', 'classify__C'),
        ('reduce_dim__k', 'classify__C'),
    )
    for entry_idx, plain_keys in enumerate(plain_keys_per_entry):
        source_entry = grid[entry_idx]
        restored_entry = deserialized[entry_idx]
        # Estimators: equal parameters, but distinct objects.
        for pos in range(len(source_entry['reduce_dim'])):
            self.assertEqual(
                source_entry['reduce_dim'][pos].get_params(),
                restored_entry['reduce_dim'][pos].get_params())
            self.assertIsNot(source_entry['reduce_dim'][pos],
                             restored_entry['reduce_dim'][pos])
        for key in plain_keys:
            self.assertEqual(source_entry[key], restored_entry[key])
def test_serialize_model(self, check_dependencies_mock):
    """Round-trip a DecisionTreeClassifier and check every flow field.

    ``check_dependencies_mock`` is presumably injected by a patch decorator
    outside this block (TODO confirm); it must have been called once.
    """
    tree = sklearn.tree.DecisionTreeClassifier(
        criterion='entropy', max_features='auto', max_leaf_nodes=2000)

    expected_name = 'sklearn.tree.tree.DecisionTreeClassifier'
    expected_description = 'Automatically created scikit-learn flow.'
    expected_dependencies = \
        'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' % sklearn.__version__
    expected_parameters = OrderedDict([
        ('class_weight', 'null'),
        ('criterion', '"entropy"'),
        ('max_depth', 'null'),
        ('max_features', '"auto"'),
        ('max_leaf_nodes', '2000'),
        ('min_impurity_split', '1e-07'),
        ('min_samples_leaf', '1'),
        ('min_samples_split', '2'),
        ('min_weight_fraction_leaf', '0.0'),
        ('presort', 'false'),
        ('random_state', 'null'),
        ('splitter', '"best"'),
    ])

    flow = sklearn_to_flow(tree)
    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.class_name, expected_name)
    self.assertEqual(flow.description, expected_description)
    self.assertEqual(flow.parameters, expected_parameters)
    self.assertEqual(flow.dependencies, expected_dependencies)

    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(tree))
    self.assertIsNot(rebuilt, tree)
    self.assertEqual(rebuilt.get_params(), tree.get_params())
    rebuilt.fit(self.X, self.y)

    self.assertEqual(check_dependencies_mock.call_count, 1)
def test_serialize_model_clustering(self, check_dependencies_mock):
    """Round-trip a KMeans clusterer and check every flow field.

    ``check_dependencies_mock`` is presumably injected by a patch decorator
    outside this block (TODO confirm); it must have been called once.
    """
    clusterer = sklearn.cluster.KMeans()

    expected_name = 'sklearn.cluster.k_means_.KMeans'
    expected_description = 'Automatically created scikit-learn flow.'
    expected_dependencies = \
        'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' % sklearn.__version__
    expected_parameters = OrderedDict([
        ('algorithm', '"auto"'),
        ('copy_x', 'true'),
        ('init', '"k-means++"'),
        ('max_iter', '300'),
        ('n_clusters', '8'),
        ('n_init', '10'),
        ('n_jobs', '1'),
        ('precompute_distances', '"auto"'),
        ('random_state', 'null'),
        ('tol', '0.0001'),
        ('verbose', '0'),
    ])

    flow = sklearn_to_flow(clusterer)
    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.class_name, expected_name)
    self.assertEqual(flow.description, expected_description)
    self.assertEqual(flow.parameters, expected_parameters)
    self.assertEqual(flow.dependencies, expected_dependencies)

    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(clusterer))
    self.assertIsNot(rebuilt, clusterer)
    self.assertEqual(rebuilt.get_params(), clusterer.get_params())
    # Clustering is unsupervised, so only the features are passed to fit.
    rebuilt.fit(self.X)

    self.assertEqual(check_dependencies_mock.call_count, 1)
def setups_to_configspace(setups, default_params, keyfield='parameter_name',
                          logscale_parameters=None, ignore_parameters=None,
                          ignore_constants=True):
    """Build a ConfigurationSpace from the parameter values seen in setups.

    ``setups`` is the result of an ``openml.setups.list_setups`` call. The
    resulting space is not equal to the one obtained from auto-sklearn, but
    it is useful for creating the pcs file.

    Parameters
    ----------
    setups : dict
        Maps a setup id to a setup object exposing ``flow_id`` and a
        ``parameters`` dict; each parameter exposes ``value`` and the
        attribute named by ``keyfield``.
    default_params : dict
        Default value per parameter name; has to cover every observed name.
    keyfield : str
        Attribute of a parameter object that is used as hyperparameter name.
    logscale_parameters : collection or None
        Names of numeric parameters that are created with ``log=True``.
    ignore_parameters : collection or None
        Names that are left out of the space (and out of ``constants``).
    ignore_constants : bool
        If true, parameters with at most one observed value are not added
        to the space.

    Returns
    -------
    tuple
        ``(cs, constants)``: the configuration space and the set of names
        for which at most one distinct value was observed.

    Raises
    ------
    ValueError
        If the setups do not all share one flow id, or if an observed
        parameter name has no entry in ``default_params``.
    """
    # Collect the set of observed values per parameter name.
    parameter_values = {}
    flow_id = None
    for current in setups.values():
        if flow_id is None:
            flow_id = current.flow_id
        elif current.flow_id != flow_id:
            raise ValueError(
                'flow ids are expected to be equal. Expected %d, saw %s'
                % (flow_id, current.flow_id))
        for parameter in current.parameters.values():
            name = getattr(parameter, keyfield)
            parameter_values.setdefault(name, set()).add(parameter.value)

    uncovered = set(parameter_values) - set(default_params)
    if uncovered:
        # The original message lacked a '%s' placeholder, so formatting it
        # raised TypeError instead of this ValueError.
        raise ValueError(
            'Mismatch between keys default_params and parameter_values. '
            'Missing: %s' % str(uncovered))

    def is_castable_to(value, target_type):
        # TypeError covers inputs such as None, which int()/float() reject
        # with TypeError rather than ValueError.
        try:
            target_type(value)
        except (ValueError, TypeError):
            return False
        return True

    cs = ConfigurationSpace()
    if logscale_parameters is None:
        logscale_parameters = set()

    constants = set()
    for name, all_values in parameter_values.items():
        if ignore_parameters is not None and name in ignore_parameters:
            continue
        if len(all_values) <= 1:
            constants.add(name)
            if ignore_constants:
                continue

        use_log = name in logscale_parameters
        if all(is_castable_to(item, int) for item in all_values):
            all_values = [int(item) for item in all_values]
            lower = min(all_values)
            upper = max(all_values)
            default = default_params[name]
            if not is_castable_to(default, int):
                sys.stderr.write(
                    'Illegal default for parameter %s (expected int): %s\n'
                    % (name, str(default)))
                # Fall back to the midpoint, which always lies in range.
                default = (lower + upper) // 2
            hyper = UniformIntegerHyperparameter(
                name=name, lower=lower, upper=upper, default=default,
                log=use_log)
        elif all(is_castable_to(item, float) for item in all_values):
            all_values = [float(item) for item in all_values]
            lower = min(all_values)
            upper = max(all_values)
            default = default_params[name]
            if not is_castable_to(default, float):
                sys.stderr.write(
                    'Illegal default for parameter %s (expected float): %s\n'
                    % (name, str(default)))
                # Fall back to the midpoint, which always lies in range.
                default = (lower + upper) / 2.0
            hyper = UniformFloatHyperparameter(
                name=name, lower=lower, upper=upper, default=default,
                log=use_log)
        else:
            # Non-numeric values are deserialized and offered as choices.
            values = [flow_to_sklearn(item) for item in all_values]
            hyper = CategoricalHyperparameter(
                name=name, choices=values, default=default_params[name])
        cs.add_hyperparameter(hyper)
    return cs, constants
def test_serialize_function(self):
    """Check that a plain function (chi2) survives a flow round trip."""
    scoring_function = sklearn.feature_selection.chi2
    restored = flow_to_sklearn(sklearn_to_flow(scoring_function))
    self.assertEqual(restored, sklearn.feature_selection.chi2)
def test_serialize_pipeline_clustering(self):
    """Round-trip a scaler + KMeans Pipeline through the flow format."""
    pipeline = sklearn.pipeline.Pipeline(steps=(
        ('scaler', sklearn.preprocessing.StandardScaler(with_mean=False)),
        ('clusterer', sklearn.cluster.KMeans()),
    ))
    expected_name = ('sklearn.pipeline.Pipeline('
                     'scaler=sklearn.preprocessing.data.StandardScaler,'
                     'clusterer=sklearn.cluster.k_means_.KMeans)')
    expected_description = 'Automatically created scikit-learn flow.'

    flow = sklearn_to_flow(pipeline)
    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.description, expected_description)

    # The pipeline flow has a single parameter; it only names the
    # sub-components instead of embedding their whole flows.
    self.assertEqual(len(flow.parameters), 1)
    # Decode the JSON before comparing so that key ordering cannot matter.
    expected_steps = [
        {'oml-python:serialized_object': 'component_reference',
         'value': {'key': step_name, 'step_name': step_name}}
        for step_name in ('scaler', 'clusterer')
    ]
    self.assertEqual(json.loads(flow.parameters['steps']), expected_steps)

    # Each step must be present as its own sub-flow.
    self.assertEqual(len(flow.components), 2)
    for step_name in ('scaler', 'clusterer'):
        self.assertIsInstance(flow.components[step_name], OpenMLFlow)

    rebuilt = flow_to_sklearn(flow)
    self.assertEqual(type(rebuilt), type(pipeline))
    self.assertIsNot(rebuilt, pipeline)
    self.assertEqual([step[0] for step in rebuilt.steps],
                     [step[0] for step in pipeline.steps])
    for position in (0, 1):
        self.assertIsNot(rebuilt.steps[position][1],
                         pipeline.steps[position][1])

    # Drop the entries holding estimator objects before comparing params.
    object_keys = ('scaler', 'clusterer', 'steps')
    rebuilt_params = rebuilt.get_params()
    for key in object_keys:
        del rebuilt_params[key]
    pipeline_params = pipeline.get_params()
    for key in object_keys:
        del pipeline_params[key]
    self.assertEqual(rebuilt_params, pipeline_params)
    rebuilt.fit(self.X, self.y)