Example #1
    def test_submit_scoring_jobs_multiple(self):
        """ Test that scoring multiple pipelines using the parallel engine produces the
        same results as the sequential engine. """
        X, y = self.X_y_binary
        pipelines = [BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"],
                                                  parameters={"Logistic Regression Classifier": {"n_jobs": 1}}),
                     BinaryClassificationPipeline(component_graph=["Baseline Classifier"]),
                     BinaryClassificationPipeline(component_graph=["SVM Classifier"])]

        def score_pipelines(pipelines, engine):
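            # Train each pipeline first; scoring requires fitted pipelines.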
            futures = []
            for pipeline in pipelines:
                futures.append(engine.submit_training_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                          automl_config=automl_data, pipeline=pipeline))
            pipelines = [f.get_result() for f in futures]
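            # Submit scoring jobs for the now-fitted pipelines against the AutoML objective.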
            futures = []
            for pipeline in pipelines:
                futures.append(engine.submit_scoring_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                         automl_config=automl_data, pipeline=pipeline,
                                                         objectives=[automl_data.objective]))
            results = [f.get_result() for f in futures]
            return results

        par_eval_results = score_pipelines(pipelines, DaskEngine(client=self.client))
        par_scores = [s["Log Loss Binary"] for s in par_eval_results]

        seq_eval_results = score_pipelines(pipelines, SequentialEngine())
        seq_scores = [s["Log Loss Binary"] for s in seq_eval_results]

        # Check that there are the proper number of pipelines and that all their scores are the same.
        assert len(par_eval_results) == len(pipelines)
        assert set(par_scores) == set(seq_scores)
Example #2
    def test_submit_training_jobs_multiple(self):
        """ Test that training multiple pipelines using the parallel engine produces the
        same results as the sequential engine. """
        X, y = self.X_y_binary
        pipelines = [BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"],
                                                  parameters={"Logistic Regression Classifier": {"n_jobs": 1}}),
                     BinaryClassificationPipeline(component_graph=["Baseline Classifier"]),
                     BinaryClassificationPipeline(component_graph=["SVM Classifier"])]

        def fit_pipelines(pipelines, engine):
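            # Submit all training jobs up front, then block on each future for the fitted pipeline.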
            futures = []
            for pipeline in pipelines:
                futures.append(engine.submit_training_job(X=X, y=y, automl_config=automl_data, pipeline=pipeline))
            results = [f.get_result() for f in futures]
            return results

        # Verify all pipelines are trained and fitted by the sequential engine.
        seq_pipelines = fit_pipelines(pipelines, SequentialEngine())
        for pipeline in seq_pipelines:
            assert pipeline._is_fitted

        # Verify all pipelines are trained and fitted by the parallel engine.
        par_pipelines = fit_pipelines(pipelines, DaskEngine(client=self.client))
        for pipeline in par_pipelines:
            assert pipeline._is_fitted

        # Ensure the sequential and parallel pipelines are equivalent.
        assert len(par_pipelines) == len(seq_pipelines)
        for par_pipeline in par_pipelines:
            assert par_pipeline in seq_pipelines
Example #3
    def test_submit_evaluate_jobs_multiple(self):
        """ Test that evaluating multiple pipelines using the parallel engine produces the
        same results as the sequential engine. """
        X, y = self.X_y_binary
        pipelines = [
            TestLRCPipeline({"Logistic Regression Classifier": {
                "n_jobs": 1
            }}),
            TestBaselinePipeline({}),
            TestSVMPipeline({})
        ]

        def eval_pipelines(pipelines, engine):
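            # Each evaluation result includes the fitted pipeline and its cross-validation scores.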
            futures = []
            for pipeline in pipelines:
                futures.append(
                    engine.submit_evaluation_job(X=ww.DataTable(X),
                                                 y=ww.DataColumn(y),
                                                 automl_config=automl_data,
                                                 pipeline=pipeline))
            results = [f.get_result() for f in futures]
            return results

        par_eval_results = eval_pipelines(pipelines,
                                          DaskEngine(client=self.client))
        par_dicts = [s.get("scores") for s in par_eval_results]
        par_scores = [s["cv_data"][0]["mean_cv_score"] for s in par_dicts]
        par_pipelines = [s.get("pipeline") for s in par_eval_results]

        seq_eval_results = eval_pipelines(pipelines, SequentialEngine())
        seq_dicts = [s.get("scores") for s in seq_eval_results]
        seq_scores = [s["cv_data"][0]["mean_cv_score"] for s in seq_dicts]
        seq_pipelines = [s.get("pipeline") for s in seq_eval_results]

        # Ensure all pipelines are fitted.
        assert all([s._is_fitted for s in par_pipelines])

        # Ensure the parallel and sequential scores are the same.
        assert set(par_scores) == set(seq_scores)
        assert not any([np.isnan(s) for s in par_scores])

        # Ensure the parallel and sequential pipelines match.
        assert len(par_pipelines) == len(seq_pipelines)
        for par_pipeline in par_pipelines:
            assert par_pipeline in seq_pipelines