def test_submit_scoring_jobs_multiple(self):
    """Test that scoring multiple pipelines using the parallel engine produces
    the same results as the sequential engine.
    """
    X, y = self.X_y_binary
    pipelines = [
        BinaryClassificationPipeline(
            component_graph=["Logistic Regression Classifier"],
            parameters={"Logistic Regression Classifier": {"n_jobs": 1}},
        ),
        BinaryClassificationPipeline(component_graph=["Baseline Classifier"]),
        BinaryClassificationPipeline(component_graph=["SVM Classifier"]),
    ]

    def score_pipelines(pipelines, engine):
        futures = []
        for pipeline in pipelines:
            futures.append(
                engine.submit_training_job(
                    X=ww.DataTable(X),
                    y=ww.DataColumn(y),
                    automl_config=automl_data,
                    pipeline=pipeline,
                )
            )
        pipelines = [f.get_result() for f in futures]
        futures = []
        for pipeline in pipelines:
            futures.append(
                engine.submit_scoring_job(
                    X=ww.DataTable(X),
                    y=ww.DataColumn(y),
                    automl_config=automl_data,
                    pipeline=pipeline,
                    objectives=[automl_data.objective],
                )
            )
        results = [f.get_result() for f in futures]
        return results

    par_eval_results = score_pipelines(pipelines, DaskEngine(client=self.client))
    par_scores = [s["Log Loss Binary"] for s in par_eval_results]

    seq_eval_results = score_pipelines(pipelines, SequentialEngine())
    seq_scores = [s["Log Loss Binary"] for s in seq_eval_results]

    # Check there are the proper number of pipelines and that all their scores are the same.
    assert len(par_eval_results) == len(pipelines)
    assert set(par_scores) == set(seq_scores)
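    # A note on the final assertion: set equality only passes when the parallel
    # and sequential log loss values are bitwise identical. A tolerance-based
    # comparison is a possible alternative; a minimal sketch using numpy,
    # assuming both helpers return scores in the same pipeline order:
    #
    #     np.testing.assert_allclose(par_scores, seq_scores, rtol=1e-10)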
def test_submit_training_jobs_multiple(self):
    """Test that training multiple pipelines using the parallel engine produces
    the same results as the sequential engine.
    """
    X, y = self.X_y_binary
    pipelines = [
        BinaryClassificationPipeline(
            component_graph=["Logistic Regression Classifier"],
            parameters={"Logistic Regression Classifier": {"n_jobs": 1}},
        ),
        BinaryClassificationPipeline(component_graph=["Baseline Classifier"]),
        BinaryClassificationPipeline(component_graph=["SVM Classifier"]),
    ]

    def fit_pipelines(pipelines, engine):
        futures = []
        for pipeline in pipelines:
            futures.append(
                engine.submit_training_job(
                    X=X, y=y, automl_config=automl_data, pipeline=pipeline
                )
            )
        results = [f.get_result() for f in futures]
        return results

    # Verify all pipelines are trained and fitted.
    seq_pipelines = fit_pipelines(pipelines, SequentialEngine())
    for pipeline in seq_pipelines:
        assert pipeline._is_fitted

    # Verify all pipelines are trained and fitted.
    par_pipelines = fit_pipelines(pipelines, DaskEngine(client=self.client))
    for pipeline in par_pipelines:
        assert pipeline._is_fitted

    # Ensure sequential and parallel pipelines are equivalent.
    assert len(par_pipelines) == len(seq_pipelines)
    for par_pipeline in par_pipelines:
        assert par_pipeline in seq_pipelines
def test_submit_evaluate_jobs_multiple(self):
    """Test that evaluating multiple pipelines using the parallel engine produces
    the same results as the sequential engine.
    """
    X, y = self.X_y_binary
    pipelines = [
        TestLRCPipeline({"Logistic Regression Classifier": {"n_jobs": 1}}),
        TestBaselinePipeline({}),
        TestSVMPipeline({}),
    ]

    def eval_pipelines(pipelines, engine):
        futures = []
        for pipeline in pipelines:
            futures.append(
                engine.submit_evaluation_job(
                    X=ww.DataTable(X),
                    y=ww.DataColumn(y),
                    automl_config=automl_data,
                    pipeline=pipeline,
                )
            )
        results = [f.get_result() for f in futures]
        return results

    par_eval_results = eval_pipelines(pipelines, DaskEngine(client=self.client))
    par_dicts = [s.get("scores") for s in par_eval_results]
    par_scores = [s["cv_data"][0]["mean_cv_score"] for s in par_dicts]
    par_pipelines = [s.get("pipeline") for s in par_eval_results]

    seq_eval_results = eval_pipelines(pipelines, SequentialEngine())
    seq_dicts = [s.get("scores") for s in seq_eval_results]
    seq_scores = [s["cv_data"][0]["mean_cv_score"] for s in seq_dicts]
    seq_pipelines = [s.get("pipeline") for s in seq_eval_results]

    # Ensure all pipelines are fitted.
    assert all([s._is_fitted for s in par_pipelines])

    # Ensure the parallel and sequential scores are the same.
    assert set(par_scores) == set(seq_scores)
    assert not any([np.isnan(s) for s in par_scores])

    # Ensure the parallel and sequential pipelines match.
    assert len(par_pipelines) == len(seq_pipelines)
    for par_pipeline in par_pipelines:
        assert par_pipeline in seq_pipelines
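# A minimal sketch of the scaffolding these tests assume: a dask client shared
# by the test class and a small binary-classification dataset for X_y_binary.
# The class name, LocalCluster sizing, and make_classification parameters below
# are illustrative assumptions, not the test suite's actual fixtures, and
# `automl_data` is assumed to be an AutoML configuration object defined
# elsewhere in the test module.
import unittest

from dask.distributed import Client, LocalCluster
from sklearn.datasets import make_classification


class TestDaskEngineSetup(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # One local, in-process cluster shared by every test in the class.
        cls.client = Client(LocalCluster(n_workers=2, processes=False))
        # make_classification returns an (X, y) tuple, matching the
        # `X, y = self.X_y_binary` unpacking used in the tests above.
        cls.X_y_binary = make_classification(n_samples=100, random_state=0)

    @classmethod
    def tearDownClass(cls):
        # Shut down the shared client once all tests have run.
        cls.client.close()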