def train_pipeline(pipeline, X, y, optimize_thresholds, objective):
    """Train a pipeline and tune the threshold if necessary.

    Arguments:
        pipeline (PipelineBase): Pipeline to train.
        X (ww.DataTable, pd.DataFrame): Features to train on.
        y (ww.DataColumn, pd.Series): Target to train on.
        optimize_thresholds (bool): Whether to tune the threshold (if the pipeline supports it).
        objective (ObjectiveBase): Objective used in threshold tuning.

    Returns:
        pipeline (PipelineBase): Trained pipeline.
    """
    X_threshold_tuning = None
    y_threshold_tuning = None
    if optimize_thresholds and pipeline.can_tune_threshold_with_objective(objective):
        X, X_threshold_tuning, y, y_threshold_tuning = split_data(X, y, pipeline.problem_type,
                                                                  test_size=0.2,
                                                                  random_seed=pipeline.random_seed)
    cv_pipeline = pipeline.clone()
    cv_pipeline.fit(X, y)
    tune_binary_threshold(cv_pipeline, objective, cv_pipeline.problem_type,
                          X_threshold_tuning, y_threshold_tuning)
    return cv_pipeline
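
# Hedged usage sketch (illustrative only, not part of the original module): train_pipeline fits a
# *clone* of the pipeline and returns it, leaving the caller's pipeline object unfitted. The
# function below and its arguments are hypothetical placeholders; `pipeline` is assumed to be a
# concrete binary classification pipeline and X/y an already-loaded dataset.
def _example_train_pipeline_usage(pipeline, X, y):
    from evalml.objectives import F1

    # With optimize_thresholds=True, 20% of the data is held out for threshold tuning; for a
    # binary pipeline with no tuning data, the threshold falls back to the default of 0.5.
    trained = train_pipeline(pipeline, X, y, optimize_thresholds=True, objective=F1())
    return trained, trained.threshold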
def _find_best_pipeline(self):
    """Finds the best pipeline in the rankings.

    If self._best_pipeline already exists, check to make sure it is different from the current
    best pipeline before training and thresholding.
    """
    if len(self.rankings) == 0:
        return
    best_pipeline = self.rankings.iloc[0]
    if not (self._best_pipeline and self._best_pipeline == self.get_pipeline(best_pipeline['id'])):
        self._best_pipeline = self.get_pipeline(best_pipeline['id'])
        if self._train_best_pipeline:
            X_threshold_tuning = None
            y_threshold_tuning = None
            X_train, y_train = self.X_train, self.y_train
            if is_binary(self.problem_type) and self.objective.is_defined_for_problem_type(self.problem_type) \
                    and self.optimize_thresholds and self.objective.can_optimize_threshold:
                X_train, X_threshold_tuning, y_train, y_threshold_tuning = split_data(
                    X_train, y_train, self.problem_type, test_size=0.2, random_seed=self.random_seed)
            self._best_pipeline.fit(X_train, y_train)
            tune_binary_threshold(self._best_pipeline, self.objective, self.problem_type,
                                  X_threshold_tuning, y_threshold_tuning)
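
# Hedged sketch of how _find_best_pipeline surfaces through the public API. The AutoMLSearch
# constructor arguments shown are assumptions about a typical call, not taken from this file;
# X, y, and X_holdout are illustrative inputs.
def _example_best_pipeline_usage(X, y, X_holdout):
    from evalml.automl import AutoMLSearch

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='f1', optimize_thresholds=True)
    automl.search()
    # After search, the top-ranked pipeline is refit on the training data and, for binary
    # problems with an optimizable objective, its threshold is tuned on a 20% holdout split.
    best = automl.best_pipeline
    return best.predict(X_holdout)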
def test_tune_binary_threshold(mock_fit, mock_score, mock_predict_proba, mock_optimize_threshold,
                               dummy_binary_pipeline_class, X_y_binary):
    mock_optimize_threshold.return_value = 0.42
    mock_score.return_value = {'F1': 1.0}
    X, y = X_y_binary
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    # Binary problem with tuning data: the optimized threshold is applied.
    pipeline = dummy_binary_pipeline_class({})
    tune_binary_threshold(pipeline, F1(), 'binary', X, y)
    assert pipeline.threshold == 0.42

    # Binary problem without tuning data: fall back to the default threshold of 0.5.
    pipeline = dummy_binary_pipeline_class({})
    tune_binary_threshold(pipeline, F1(), 'binary', None, None)
    assert pipeline.threshold == 0.5

    # Non-binary problem type: no threshold is set.
    pipeline = dummy_binary_pipeline_class({})
    tune_binary_threshold(pipeline, F1(), 'multiclass', X, y)
    assert pipeline.threshold is None
def train_and_score_pipeline(pipeline, automl, full_X_train, full_y_train):
    """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores.

    Arguments:
        pipeline (PipelineBase): The pipeline to score.
        automl (AutoMLSearch): The AutoML search, used to access config and for the error callback.
        full_X_train (ww.DataTable): Training features.
        full_y_train (ww.DataColumn): Training target.

    Returns:
        dict: A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details.
    """
    start = time.time()
    cv_data = []
    logger.info("\tStarting cross validation")
    X_pd = _convert_woodwork_types_wrapper(full_X_train.to_dataframe())
    y_pd = _convert_woodwork_types_wrapper(full_y_train.to_series())
    for i, (train, valid) in enumerate(automl.data_splitter.split(X_pd, y_pd)):
        if pipeline.model_family == ModelFamily.ENSEMBLE and i > 0:
            # Stacked ensembles do CV internally, so we do not run CV here for performance reasons.
            logger.debug(f"Skipping fold {i} because CV for stacked ensembles is not supported.")
            break
        logger.debug(f"\t\tTraining and scoring on fold {i}")
        X_train, X_valid = full_X_train.iloc[train], full_X_train.iloc[valid]
        y_train, y_valid = full_y_train.iloc[train], full_y_train.iloc[valid]
        if is_binary(automl.problem_type) or is_multiclass(automl.problem_type):
            # Guard against folds that are missing one or more target classes.
            diff_train = set(np.setdiff1d(full_y_train.to_series(), y_train.to_series()))
            diff_valid = set(np.setdiff1d(full_y_train.to_series(), y_valid.to_series()))
            diff_string = f"Missing target values in the training set after data split: {diff_train}. " if diff_train else ""
            diff_string += f"Missing target values in the validation set after data split: {diff_valid}." if diff_valid else ""
            if diff_string:
                raise Exception(diff_string)
        objectives_to_score = [automl.objective] + automl.additional_objectives
        cv_pipeline = None
        try:
            X_threshold_tuning = None
            y_threshold_tuning = None
            if automl.optimize_thresholds and automl.objective.is_defined_for_problem_type(automl.problem_type) and \
                    automl.objective.can_optimize_threshold and is_binary(automl.problem_type):
                # Hold out part of this fold's training data for threshold tuning.
                X_train, X_threshold_tuning, y_train, y_threshold_tuning = split_data(
                    X_train, y_train, automl.problem_type, test_size=0.2, random_seed=automl.random_seed)
            cv_pipeline = pipeline.clone()
            logger.debug(f"\t\t\tFold {i}: starting training")
            cv_pipeline.fit(X_train, y_train)
            logger.debug(f"\t\t\tFold {i}: finished training")
            tune_binary_threshold(cv_pipeline, automl.objective, automl.problem_type,
                                  X_threshold_tuning, y_threshold_tuning)
            if X_threshold_tuning:
                logger.debug(f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})")
            logger.debug(f"\t\t\tFold {i}: Scoring trained pipeline")
            scores = cv_pipeline.score(X_valid, y_valid, objectives=objectives_to_score)
            logger.debug(f"\t\t\tFold {i}: {automl.objective.name} score: {scores[automl.objective.name]:.3f}")
            score = scores[automl.objective.name]
        except Exception as e:
            if automl.error_callback is not None:
                automl.error_callback(exception=e, traceback=traceback.format_tb(sys.exc_info()[2]),
                                      automl=automl, fold_num=i, pipeline=pipeline)
            if isinstance(e, PipelineScoreError):
                # Some objectives scored successfully; fill the failed ones with NaN.
                nan_scores = {objective: np.nan for objective in e.exceptions}
                scores = {**nan_scores, **e.scored_successfully}
                scores = OrderedDict({o.name: scores[o.name]
                                      for o in [automl.objective] + automl.additional_objectives})
                score = scores[automl.objective.name]
            else:
                score = np.nan
                scores = OrderedDict(zip([n.name for n in automl.additional_objectives],
                                         [np.nan] * len(automl.additional_objectives)))

        ordered_scores = OrderedDict()
        ordered_scores.update({automl.objective.name: score})
        ordered_scores.update(scores)
        ordered_scores.update({"# Training": y_train.shape[0]})
        ordered_scores.update({"# Validation": y_valid.shape[0]})
        evaluation_entry = {"all_objective_scores": ordered_scores,
                            "score": score,
                            'binary_classification_threshold': None}
        if is_binary(automl.problem_type) and cv_pipeline is not None and cv_pipeline.threshold is not None:
            evaluation_entry['binary_classification_threshold'] = cv_pipeline.threshold
        cv_data.append(evaluation_entry)
    training_time = time.time() - start
    cv_scores = pd.Series([fold['score'] for fold in cv_data])
    cv_score_mean = cv_scores.mean()
    logger.info(f"\tFinished cross validation - mean {automl.objective.name}: {cv_score_mean:.3f}")
    return {'cv_data': cv_data,
            'training_time': training_time,
            'cv_scores': cv_scores,
            'cv_score_mean': cv_score_mean}
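
# Hedged sketch of consuming the dict returned by train_and_score_pipeline; `pipeline`, `automl`,
# X, and y are assumed to already exist and are illustrative only, not part of the original module.
def _example_consume_cv_results(pipeline, automl, X, y):
    results = train_and_score_pipeline(pipeline, automl, X, y)
    print(f"mean {automl.objective.name}: {results['cv_score_mean']:.3f}")
    for fold in results['cv_data']:
        all_scores = fold['all_objective_scores']
        # Per-fold primary score, tuned threshold (None unless binary and tuned), and split sizes.
        print(fold['score'], fold['binary_classification_threshold'],
              all_scores['# Training'], all_scores['# Validation'])
    return results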