Example #1
    def test__run_exists(self):
        # It would be better not to sentinel these clfs,
        # so we do not have to perform the actual runs
        # and can just check their status online
        clfs = [sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='mean')),
                                                ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
                                                ('Estimator', DecisionTreeClassifier(max_depth=4))]),
                sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='most_frequent')),
                                                 ('VarianceThreshold', VarianceThreshold(threshold=0.1)),
                                                 ('Estimator', DecisionTreeClassifier(max_depth=4))])]

        task = openml.tasks.get_task(115)

        for clf in clfs:
            try:
                # first populate the server with this run.
                # skip run if it was already performed.
                run = openml.runs.run_model_on_task(task, clf, avoid_duplicate_runs=True)
                run.publish()
            except openml.exceptions.PyOpenMLError:
                # run already existed. Great.
                pass

            flow = openml.flows.sklearn_to_flow(clf)
            flow_exists = openml.flows.flow_exists(flow.name, flow.external_version)
            self.assertGreater(flow_exists, 0)
            downloaded_flow = openml.flows.get_flow(flow_exists)
            setup_exists = openml.setups.setup_exists(downloaded_flow, clf)
            self.assertGreater(setup_exists, 0)
            run_ids = _run_exists(task.task_id, setup_exists)
            self.assertTrue(run_ids, msg=(run_ids, clf))
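
For reference, _run_exists is a helper defined in the same test module and not shown on this page. A minimal sketch of what such a lookup could look like, assuming it simply filters openml.runs.list_runs by task and setup (the filtering logic here is an assumption, not taken from the example above):

import openml

def _run_exists(task_id, setup_id):
    # Hypothetical re-implementation of the helper used in the tests above:
    # ask the server for runs restricted to this task/setup combination and
    # return their ids; an empty set means no such run has been published yet.
    try:
        runs = openml.runs.list_runs(task=[task_id], setup=[setup_id])
        return set(runs.keys())
    except openml.exceptions.OpenMLServerException:
        # the server may answer with an error when the filter matches nothing
        return set()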

Example #2

    def test_get_run_trace(self):
        # get_run_trace is already tested implicitly in test_run_and_publish;
        # this test adds some extra coverage.
        num_iterations = 10
        num_folds = 1
        task_id = 119

        task = openml.tasks.get_task(task_id)
        # IMPORTANT! Do not sentinel this flow; it is faster if we do not wait on the openml server
        clf = RandomizedSearchCV(
            RandomForestClassifier(random_state=42),
            {
                "max_depth": [3, None],
                "max_features": [1, 2, 3, 4],
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"],
            },
            n_iter=num_iterations,
            random_state=42,
        )

        # [SPEED] make unit test faster by exploiting run information from the past
        try:
            # in case the run did not exist yet
            run = openml.runs.run_model_on_task(task,
                                                clf,
                                                avoid_duplicate_runs=True)
            trace = openml.runs.functions._create_trace_from_arff(
                run._generate_trace_arff_dict())
            self.assertEqual(
                len(trace.trace_iterations),
                num_iterations * num_folds,
            )
            run = run.publish()
            self._wait_for_processed_run(run.run_id, 200)
            run_id = run.run_id
        except openml.exceptions.PyOpenMLError as e:
            if 'Run already exists in server' not in e.message:
                # in this case the error was not the one we expected
                raise e
            # the run already existed on the server; look up its id instead
            flow = openml.flows.sklearn_to_flow(clf)
            flow_exists = openml.flows.flow_exists(flow.name,
                                                   flow.external_version)
            self.assertIsInstance(flow_exists, int)
            self.assertGreater(flow_exists, 0)
            downloaded_flow = openml.flows.get_flow(flow_exists)
            setup_exists = openml.setups.setup_exists(downloaded_flow)
            self.assertIsInstance(setup_exists, int)
            self.assertGreater(setup_exists, 0)
            run_ids = _run_exists(task.task_id, setup_exists)
            self.assertGreater(len(run_ids), 0)
            run_id = random.choice(list(run_ids))

        # now the actual unit test ...
        run_trace = openml.runs.get_run_trace(run_id)
        self.assertEqual(len(run_trace.trace_iterations),
                         num_iterations * num_folds)
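
Beyond checking the number of iterations, the downloaded trace can be inspected directly. A rough sketch, assuming each entry of trace_iterations carries an evaluation score on the OpenMLTraceIteration object (an assumed attribute, not used in the test above):

import openml

def best_trace_iteration(run_id):
    # Fetch the hyperparameter-search trace that the server stored for this run.
    trace = openml.runs.get_run_trace(run_id)
    # trace_iterations maps (repeat, fold, iteration) keys to trace entries;
    # pick the entry with the highest evaluation value (assumed attribute).
    best_key = max(trace.trace_iterations,
                   key=lambda key: trace.trace_iterations[key].evaluation)
    return best_key, trace.trace_iterations[best_key]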

Example #3

    def test__run_exists(self):
        # It would be better not to sentinel these clfs,
        # so we do not have to perform the actual runs
        # and can just check their status online
        clfs = [
            sklearn.pipeline.Pipeline(steps=[
                ('Imputer', Imputer(strategy='mean')),
                ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
                ('Estimator', DecisionTreeClassifier(max_depth=4)),
            ]),
            sklearn.pipeline.Pipeline(steps=[
                ('Imputer', Imputer(strategy='most_frequent')),
                ('VarianceThreshold', VarianceThreshold(threshold=0.1)),
                ('Estimator', DecisionTreeClassifier(max_depth=4)),
            ]),
        ]

        task = openml.tasks.get_task(115)

        for clf in clfs:
            try:
                # first populate the server with this run.
                # skip run if it was already performed.
                run = openml.runs.run_model_on_task(task,
                                                    clf,
                                                    avoid_duplicate_runs=True)
                run.publish()
            except openml.exceptions.PyOpenMLError:
                # run already existed. Great.
                pass

            flow = openml.flows.sklearn_to_flow(clf)
            flow_exists = openml.flows.flow_exists(flow.name,
                                                   flow.external_version)
            self.assertGreater(flow_exists, 0)
            downloaded_flow = openml.flows.get_flow(flow_exists)
            setup_exists = openml.setups.setup_exists(downloaded_flow, clf)
            self.assertGreater(setup_exists, 0)
            run_ids = _run_exists(task.task_id, setup_exists)
            self.assertTrue(run_ids, msg=(run_ids, clf))
Example #4
    def test_get_run_trace(self):
        # get_run_trace is already tested implicitly in test_run_and_publish;
        # this test adds some extra coverage.
        num_iterations = 10
        num_folds = 1
        task_id = 119

        task = openml.tasks.get_task(task_id)
        # IMPORTANT! Do not sentinel this flow; it is faster if we do not wait on the openml server
        clf = RandomizedSearchCV(
            RandomForestClassifier(random_state=42),
            {
                "max_depth": [3, None],
                "max_features": [1, 2, 3, 4],
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"],
            },
            n_iter=num_iterations,
            random_state=42,
        )

        # [SPEED] make unit test faster by exploiting run information from the past
        try:
            # in case the run did not exist yet
            run = openml.runs.run_model_on_task(task, clf, avoid_duplicate_runs=True)
            run = run.publish()
            self._wait_for_processed_run(run.run_id, 200)
            run_id = run.run_id
        except openml.exceptions.PyOpenMLError:
            # the run already existed on the server; look up its id instead
            flow = openml.flows.sklearn_to_flow(clf)
            flow_exists = openml.flows.flow_exists(flow.name, flow.external_version)
            self.assertIsInstance(flow_exists, int)
            downloaded_flow = openml.flows.get_flow(flow_exists)
            setup_exists = openml.setups.setup_exists(downloaded_flow)
            self.assertIsInstance(setup_exists, int)
            run_ids = _run_exists(task.task_id, setup_exists)
            run_id = random.choice(list(run_ids))

        # now the actual unit test ...
        run_trace = openml.runs.get_run_trace(run_id)
        self.assertEqual(len(run_trace.trace_iterations), num_iterations * num_folds)
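
The _wait_for_processed_run helper used in these tests is part of the test class and not shown on this page. A simple polling loop along these lines could stand in for it, assuming that a run downloaded with openml.runs.get_run exposes its server-side evaluations once processing has finished (an assumption about the OpenMLRun object):

import time
import openml

def wait_for_processed_run(run_id, timeout_seconds=200, poll_interval=10):
    # Poll the server until evaluations appear for the run or we time out.
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        run = openml.runs.get_run(run_id)
        if run.evaluations:  # assumed to be filled once the server has processed the run
            return run
        time.sleep(poll_interval)
    raise TimeoutError('run %d was not processed within %d seconds'
                       % (run_id, timeout_seconds))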