def test__publish_flow_if_necessary(self):
    """Publishing two flows that share a sentinel must give one flow_id.

    The first flow is published and must receive a ``flow_id``; a second
    flow built from the same estimator and tagged with the same sentinel
    must end up with that very same ``flow_id``.
    """
    # The fetched task is not used further down; the call itself is
    # retained so the test behaves exactly as before.
    openml.tasks.get_task(115)

    estimator = LogisticRegression()

    # First flow: the helper hands back the sentinel, which is passed in
    # again for the second flow below.
    first_flow, sentinel = self._add_sentinel_to_flow_name(
        sklearn_to_flow(estimator), None)
    openml.runs.functions._publish_flow_if_necessary(first_flow)
    self.assertIsNotNone(first_flow.flow_id)

    # Second flow: same estimator, same sentinel.
    second_flow, _ = self._add_sentinel_to_flow_name(
        sklearn_to_flow(estimator), sentinel)
    openml.runs.functions._publish_flow_if_necessary(second_flow)
    self.assertEqual(second_flow.flow_id, first_flow.flow_id)
def test_parse_parameters_flow_not_on_server(self):
    """``_parse_parameters`` must raise ValueError for a flow without an id.

    Two cases are covered: a plain flow that has no ``flow_id`` at all, and
    a composite flow where only the outer flow was given an id.
    """
    # NOTE(review): assertRaisesRegexp is the older spelling; Python 3
    # renamed it to assertRaisesRegex. Confirm which interpreters this
    # suite targets before switching.
    expected_message = ('Flow sklearn.linear_model.logistic.LogisticRegression'
                        ' has no flow_id!')

    # Case 1: the flow itself has no flow_id.
    plain_flow = sklearn_to_flow(LogisticRegression())
    with self.assertRaisesRegexp(ValueError, expected_message):
        OpenMLRun._parse_parameters(plain_flow)

    # Case 2: only the outer flow gets an id; the expected error still
    # names the LogisticRegression sub-flow.
    ensemble = AdaBoostClassifier(base_estimator=LogisticRegression())
    composite_flow = sklearn_to_flow(ensemble)
    composite_flow.flow_id = 1
    with self.assertRaisesRegexp(ValueError, expected_message):
        OpenMLRun._parse_parameters(composite_flow)
def test_parse_parameters_flow_not_on_server(self):
    """Check the error raised when parsing parameters of an id-less flow.

    Exercised once with a bare LogisticRegression flow and once with an
    AdaBoostClassifier flow whose outer ``flow_id`` is set to 1.
    """

    def expect_missing_flow_id(candidate_flow):
        # Both scenarios must fail with a ValueError whose message
        # matches this pattern.
        self.assertRaisesRegexp(
            ValueError,
            'Flow sklearn.linear_model.logistic.LogisticRegression '
            'has no flow_id!',
            OpenMLRun._parse_parameters,
            candidate_flow)

    # Scenario 1: flow built straight from the estimator, no id assigned.
    bare_model = LogisticRegression()
    expect_missing_flow_id(sklearn_to_flow(bare_model))

    # Scenario 2: the wrapping flow has an id, but nothing is assigned to
    # its LogisticRegression component.
    wrapped_model = AdaBoostClassifier(base_estimator=LogisticRegression())
    wrapped_flow = sklearn_to_flow(wrapped_model)
    wrapped_flow.flow_id = 1
    expect_missing_flow_id(wrapped_flow)
def test_parse_parameters(self):
    """Parsed parameters must each name a component.

    A RandomizedSearchCV wrapping a RandomForestClassifier is converted to
    a flow; the outer flow gets id 1 and the ``estimator`` component id 2.
    Every parsed entry must carry a non-None ``oml:component``, and any
    ``n_estimators`` entry must have value ``'5'`` and component 2.
    """
    forest = RandomForestClassifier(n_estimators=5)
    search_space = {
        "max_depth": [3, None],
        "max_features": [1, 2, 3, 4],
        "min_samples_split": list(range(2, 11)),
        "min_samples_leaf": list(range(1, 11)),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
    }
    folds = StratifiedKFold(n_splits=2, random_state=1)
    model = RandomizedSearchCV(estimator=forest,
                               param_distributions=search_space,
                               cv=folds,
                               n_iter=5)

    flow = sklearn_to_flow(model)
    flow.flow_id = 1
    flow.components['estimator'].flow_id = 2

    for entry in OpenMLRun._parse_parameters(flow):
        self.assertIsNotNone(entry['oml:component'], msg=entry)
        if entry['oml:name'] != 'n_estimators':
            continue
        # n_estimators was set on the forest, i.e. the component with id 2.
        self.assertEqual(entry['oml:value'], '5')
        self.assertEqual(entry['oml:component'], 2)
def test_parse_parameters(self):
    """Verify component attribution of parameters parsed from a nested flow.

    The outer RandomizedSearchCV flow is assigned id 1 and its
    ``estimator`` component id 2. Each parsed parameter must reference a
    component; ``n_estimators`` must be reported as ``'5'`` on component 2.
    """
    distributions = dict(
        max_depth=[3, None],
        max_features=[1, 2, 3, 4],
        min_samples_split=[2, 3, 4, 5, 6, 7, 8, 9, 10],
        min_samples_leaf=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        bootstrap=[True, False],
        criterion=["gini", "entropy"])
    model = RandomizedSearchCV(
        estimator=RandomForestClassifier(n_estimators=5),
        param_distributions=distributions,
        cv=StratifiedKFold(n_splits=2, random_state=1),
        n_iter=5)

    flow = sklearn_to_flow(model)
    flow.flow_id = 1
    flow.components['estimator'].flow_id = 2

    parameters = OpenMLRun._parse_parameters(flow)
    for parameter in parameters:
        component_id = parameter['oml:component']
        self.assertIsNotNone(component_id, msg=parameter)
        if parameter['oml:name'] == 'n_estimators':
            self.assertEqual(parameter['oml:value'], '5')
            self.assertEqual(component_id, 2)
def _perform_run(self, task_id, num_instances, clf,
                 random_state_value=None, check_setup=True):
    """Publish a flow for ``clf``, run it on a task, and verify the result.

    Parameters
    ----------
    task_id
        Identifier handed to ``openml.tasks.get_task``.
    num_instances : int
        Expected ``len(run.data_content)``.
    clf
        Model converted to a flow with ``sklearn_to_flow``.
    random_state_value
        Expected ``random_state`` parameter of the flows rebuilt after the
        run. Only checked when ``check_setup`` is true and the top-level
        flow is neither a GridSearchCV nor a Pipeline.
    check_setup : bool
        When true, re-instantiate the model from the server-side setup and
        from the run, and compare the resulting flows with the local one.

    Returns
    -------
    The run object produced by ``openml.runs.run_flow_on_task``, after it
    has been published.
    """

    def _remove_random_state(flow):
        # Recursively strip 'random_state' from a flow and its components
        # so flows can be compared irrespective of the seed.
        if 'random_state' in flow.parameters:
            del flow.parameters['random_state']
        for component in flow.components.values():
            _remove_random_state(component)

    # Top-level flow classes for which the random_state assertions below
    # are skipped.
    no_random_state_check = ('sklearn.model_selection._search.GridSearchCV',
                             'sklearn.pipeline.Pipeline')

    flow = sklearn_to_flow(clf)
    flow, _ = self._add_sentinel_to_flow_name(flow, None)
    flow.publish()

    task = openml.tasks.get_task(task_id)
    run = openml.runs.run_flow_on_task(
        task, flow, seed=1,
        avoid_duplicate_runs=openml.config.avoid_duplicate_runs)
    run_ = run.publish()
    self.assertEqual(run_, run)
    self.assertIsInstance(run.dataset_id, int)

    # check arff output
    self.assertEqual(len(run.data_content), num_instances)

    if check_setup:
        check_random_state = flow.class_name not in no_random_state_check

        # test the initialize setup function
        run_server = openml.runs.get_run(run_.run_id)
        clf_server = openml.setups.initialize_model(run_server.setup_id)
        flow_local = openml.flows.sklearn_to_flow(clf)
        flow_server = openml.flows.sklearn_to_flow(clf_server)

        if check_random_state:
            # If the flow is initialized from a model without a random
            # state, the flow is on the server without any random state
            self.assertEqual(flow.parameters['random_state'], 'null')
            # As soon as a flow is run, a random state is set in the
            # model. Flows re-created from the local model and from the
            # server-side setup are therefore expected to carry
            # random_state_value.
            self.assertEqual(flow_local.parameters['random_state'],
                             random_state_value)
            self.assertEqual(flow_server.parameters['random_state'],
                             random_state_value)
        _remove_random_state(flow_local)
        _remove_random_state(flow_server)
        openml.flows.assert_flows_equal(flow_local, flow_server)

        # and test the initialize setup from run function
        clf_server2 = openml.runs.initialize_model_from_run(
            run_server.run_id)
        flow_server2 = openml.flows.sklearn_to_flow(clf_server2)
        if check_random_state:
            self.assertEqual(flow_server2.parameters['random_state'],
                             random_state_value)

        _remove_random_state(flow_server2)
        openml.flows.assert_flows_equal(flow_local, flow_server2)

    # Use a unittest assertion rather than a bare ``assert`` so the check
    # is not stripped under ``python -O`` and reports the tags on failure.
    downloaded = openml.runs.get_run(run_.run_id)
    self.assertIn('openml-python', downloaded.tags)

    return run
def _perform_run(self, task_id, num_instances, clf,
                 random_state_value=None, check_setup=True):
    """Run ``clf`` on an OpenML task, publish the run, and sanity-check it.

    Parameters
    ----------
    task_id
        Identifier handed to ``openml.tasks.get_task``.
    num_instances : int
        Number of rows expected in ``run.data_content``.
    clf
        Model that is converted to a flow via ``sklearn_to_flow``.
    random_state_value
        Value the ``random_state`` flow parameter is expected to have on
        flows rebuilt after the run (skipped for GridSearchCV and Pipeline
        top-level flows, and when ``check_setup`` is false).
    check_setup : bool
        Whether to rebuild the model from the server (via setup id and via
        run id) and compare the resulting flows against the local flow.

    Returns
    -------
    The published run created by ``openml.runs.run_flow_on_task``.
    """

    def _remove_random_state(flow):
        # Drop 'random_state' from this flow and, recursively, from all
        # of its components.
        if 'random_state' in flow.parameters:
            del flow.parameters['random_state']
        for component in flow.components.values():
            _remove_random_state(component)

    # Flow classes whose top-level 'random_state' parameter is not checked.
    skip_random_state_for = ('sklearn.model_selection._search.GridSearchCV',
                             'sklearn.pipeline.Pipeline')

    flow = sklearn_to_flow(clf)
    flow, _ = self._add_sentinel_to_flow_name(flow, None)
    flow.publish()

    task = openml.tasks.get_task(task_id)
    run = openml.runs.run_flow_on_task(
        task, flow, seed=1,
        avoid_duplicate_runs=openml.config.avoid_duplicate_runs)
    run_ = run.publish()
    self.assertEqual(run_, run)
    self.assertIsInstance(run.dataset_id, int)

    # check arff output
    self.assertEqual(len(run.data_content), num_instances)

    if check_setup:
        verify_random_state = flow.class_name not in skip_random_state_for

        # test the initialize setup function
        run_server = openml.runs.get_run(run_.run_id)
        clf_server = openml.setups.initialize_model(run_server.setup_id)
        flow_local = openml.flows.sklearn_to_flow(clf)
        flow_server = openml.flows.sklearn_to_flow(clf_server)

        if verify_random_state:
            # If the flow is initialized from a model without a random
            # state, the flow is on the server without any random state
            self.assertEqual(flow.parameters['random_state'], 'null')
            # As soon as a flow is run, a random state is set in the
            # model, so flows re-instantiated from the local model and
            # from the server setup should both expose
            # random_state_value.
            self.assertEqual(flow_local.parameters['random_state'],
                             random_state_value)
            self.assertEqual(flow_server.parameters['random_state'],
                             random_state_value)
        _remove_random_state(flow_local)
        _remove_random_state(flow_server)
        openml.flows.assert_flows_equal(flow_local, flow_server)

        # and test the initialize setup from run function
        clf_server2 = openml.runs.initialize_model_from_run(
            run_server.run_id)
        flow_server2 = openml.flows.sklearn_to_flow(clf_server2)
        if verify_random_state:
            self.assertEqual(flow_server2.parameters['random_state'],
                             random_state_value)

        _remove_random_state(flow_server2)
        openml.flows.assert_flows_equal(flow_local, flow_server2)

    # A unittest assertion (instead of a bare ``assert``) keeps this check
    # active under ``python -O`` and prints the tag list when it fails.
    downloaded = openml.runs.get_run(run_.run_id)
    self.assertIn('openml-python', downloaded.tags)

    return run