Exemplo n.º 1
0
    def train_pipeline(pipeline, X, y, optimize_thresholds, objective):
        """Fit a clone of the given pipeline, then run threshold tuning on it.

        When ``optimize_thresholds`` is set and the pipeline reports that it can
        tune its threshold with ``objective``, 20% of the data is split off
        (seeded with the pipeline's ``random_seed``) and handed to
        ``tune_binary_threshold``; the clone is then fit on the remaining data.
        Otherwise the clone is fit on all of ``X``/``y`` and
        ``tune_binary_threshold`` receives ``None`` for both tuning sets.

        Arguments:
            pipeline (PipelineBase): Pipeline to train. It is cloned, not fit in place.
            X (ww.DataTable, pd.DataFrame): Features to train on.
            y (ww.DataColumn, pd.Series): Target to train on.
            optimize_thresholds (bool): Whether to tune the threshold (if pipeline supports it).
            objective (ObjectiveBase): Objective used in threshold tuning.

        Returns:
            pipeline (PipelineBase) - the fitted clone.
        """
        should_hold_out = optimize_thresholds and pipeline.can_tune_threshold_with_objective(objective)
        if should_hold_out:
            # Carve out a hold-out set that is used only for threshold tuning.
            X, X_holdout, y, y_holdout = split_data(
                X, y, pipeline.problem_type,
                test_size=0.2, random_seed=pipeline.random_seed)
        else:
            X_holdout, y_holdout = None, None

        fitted = pipeline.clone()
        fitted.fit(X, y)
        tune_binary_threshold(fitted, objective, fitted.problem_type,
                              X_holdout, y_holdout)
        return fitted
Exemplo n.º 2
0
 def _find_best_pipeline(self):
     """Pick the top-ranked pipeline and, if configured, train and threshold it.

     Does nothing when the rankings are empty. If ``self._best_pipeline`` is
     already set and compares equal to the pipeline at the top of the
     rankings, it is left untouched so it is not trained or thresholded again.
     Otherwise ``self._best_pipeline`` is replaced and, when
     ``self._train_best_pipeline`` is set, fit on the training data. For a
     binary problem with threshold optimization enabled and an objective that
     supports it, 20% of the training data is held out for threshold tuning.
     """
     if len(self.rankings) == 0:
         return

     top_ranked = self.rankings.iloc[0]
     if self._best_pipeline and self._best_pipeline == self.get_pipeline(top_ranked['id']):
         # Current best is still on top of the rankings: nothing to redo.
         return

     self._best_pipeline = self.get_pipeline(top_ranked['id'])
     if not self._train_best_pipeline:
         return

     X_fit, y_fit = self.X_train, self.y_train
     X_holdout, y_holdout = None, None
     wants_threshold_tuning = (
         is_binary(self.problem_type)
         and self.objective.is_defined_for_problem_type(self.problem_type)
         and self.optimize_thresholds
         and self.objective.can_optimize_threshold
     )
     if wants_threshold_tuning:
         X_fit, X_holdout, y_fit, y_holdout = split_data(
             X_fit, y_fit, self.problem_type,
             test_size=0.2, random_seed=self.random_seed)

     self._best_pipeline.fit(X_fit, y_fit)
     tune_binary_threshold(self._best_pipeline, self.objective,
                           self.problem_type, X_holdout, y_holdout)
Exemplo n.º 3
0
def test_tune_binary_threshold(mock_fit, mock_score, mock_predict_proba, mock_optimize_threshold,
                               dummy_binary_pipeline_class, X_y_binary):
    """Check the threshold ``tune_binary_threshold`` leaves on a fresh pipeline.

    With the threshold optimizer stubbed to return 0.42, three scenarios are
    exercised in order:
      * binary problem with tuning data   -> threshold is 0.42
      * binary problem without tuning data -> threshold is 0.5
      * multiclass problem with tuning data -> threshold stays None
    """
    mock_optimize_threshold.return_value = 0.42
    mock_score.return_value = {'F1': 1.0}
    features, target = X_y_binary
    features = infer_feature_types(features)
    target = infer_feature_types(target)

    # (problem type, X for tuning, y for tuning, expected threshold)
    scenarios = [
        ('binary', features, target, 0.42),
        ('binary', None, None, 0.5),
        ('multiclass', features, target, None),
    ]
    for problem_type, X_tuning, y_tuning, expected in scenarios:
        pipeline = dummy_binary_pipeline_class({})
        tune_binary_threshold(pipeline, F1(), problem_type, X_tuning, y_tuning)
        if expected is None:
            assert pipeline.threshold is None
        else:
            assert pipeline.threshold == expected
Exemplo n.º 4
0
    def train_and_score_pipeline(pipeline, automl, full_X_train, full_y_train):
        """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores

        For each split produced by ``automl.data_splitter`` a clone of
        ``pipeline`` is fit on the training part and scored on the validation
        part against ``automl.objective`` and ``automl.additional_objectives``.
        For binary problems with threshold optimization enabled (and an
        objective that supports it), 20% of the fold's training data is held
        out and passed to ``tune_binary_threshold``.

        Errors raised while splitting off tuning data, fitting, tuning or
        scoring a fold are reported through ``automl.error_callback`` (if set)
        and recorded as NaN scores for that fold instead of aborting the search.

        Arguments:
            pipeline (PipelineBase): The pipeline to score
            automl (AutoMLSearch): The AutoML search, used to access config and for the error callback
            full_X_train (ww.DataTable): Training features
            full_y_train (ww.DataColumn): Training target

        Returns:
            dict: A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details.

        Raises:
            Exception: For binary/multiclass problems, if a fold's training or
                validation target is missing values present in the full target.
        """
        start = time.time()
        cv_data = []
        logger.info("\tStarting cross validation")
        X_pd = _convert_woodwork_types_wrapper(full_X_train.to_dataframe())
        y_pd = _convert_woodwork_types_wrapper(full_y_train.to_series())
        for i, (train,
                valid) in enumerate(automl.data_splitter.split(X_pd, y_pd)):
            if pipeline.model_family == ModelFamily.ENSEMBLE and i > 0:
                # Stacked ensembles do CV internally, so we do not run CV here for performance reasons.
                logger.debug(
                    f"Skipping fold {i} because CV for stacked ensembles is not supported."
                )
                break
            logger.debug(f"\t\tTraining and scoring on fold {i}")
            X_train, X_valid = full_X_train.iloc[train], full_X_train.iloc[
                valid]
            y_train, y_valid = full_y_train.iloc[train], full_y_train.iloc[
                valid]
            if is_binary(automl.problem_type) or is_multiclass(
                    automl.problem_type):
                # Every target value must appear on both sides of the split.
                full_targets = full_y_train.to_series()
                diff_train = set(
                    np.setdiff1d(full_targets, y_train.to_series()))
                diff_valid = set(
                    np.setdiff1d(full_targets, y_valid.to_series()))
                diff_string = f"Missing target values in the training set after data split: {diff_train}. " if diff_train else ""
                diff_string += f"Missing target values in the validation set after data split: {diff_valid}." if diff_valid else ""
                if diff_string:
                    raise Exception(diff_string)
            objectives_to_score = [automl.objective
                                   ] + automl.additional_objectives
            cv_pipeline = None
            try:
                X_threshold_tuning = None
                y_threshold_tuning = None
                if automl.optimize_thresholds and automl.objective.is_defined_for_problem_type(automl.problem_type) and \
                   automl.objective.can_optimize_threshold and is_binary(automl.problem_type):
                    # Hold out part of this fold's training data for threshold tuning only.
                    X_train, X_threshold_tuning, y_train, y_threshold_tuning = split_data(
                        X_train,
                        y_train,
                        automl.problem_type,
                        test_size=0.2,
                        random_seed=automl.random_seed)
                cv_pipeline = pipeline.clone()
                logger.debug(f"\t\t\tFold {i}: starting training")
                cv_pipeline.fit(X_train, y_train)
                logger.debug(f"\t\t\tFold {i}: finished training")
                tune_binary_threshold(cv_pipeline, automl.objective,
                                      automl.problem_type, X_threshold_tuning,
                                      y_threshold_tuning)
                # Compare against None explicitly: truth-testing a tabular
                # object is ambiguous (pandas raises ValueError), and that
                # error would be swallowed below and turn the fold into NaN.
                if X_threshold_tuning is not None:
                    logger.debug(
                        f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})"
                    )
                logger.debug(f"\t\t\tFold {i}: Scoring trained pipeline")
                scores = cv_pipeline.score(X_valid,
                                           y_valid,
                                           objectives=objectives_to_score)
                logger.debug(
                    f"\t\t\tFold {i}: {automl.objective.name} score: {scores[automl.objective.name]:.3f}"
                )
                score = scores[automl.objective.name]
            except Exception as e:
                # Deliberate best-effort: a failing fold must not abort the
                # search. Report it and fall back to NaN scores.
                if automl.error_callback is not None:
                    automl.error_callback(exception=e,
                                          traceback=traceback.format_tb(
                                              sys.exc_info()[2]),
                                          automl=automl,
                                          fold_num=i,
                                          pipeline=pipeline)
                if isinstance(e, PipelineScoreError):
                    # Keep the objectives that scored; NaN for the ones that failed.
                    nan_scores = {
                        objective: np.nan
                        for objective in e.exceptions
                    }
                    scores = {**nan_scores, **e.scored_successfully}
                    scores = OrderedDict({
                        o.name: scores[o.name]
                        for o in objectives_to_score
                    })
                    score = scores[automl.objective.name]
                else:
                    score = np.nan
                    scores = OrderedDict(
                        zip([n.name for n in automl.additional_objectives],
                            [np.nan] * len(automl.additional_objectives)))

            # Primary objective first, then the rest, then the fold sizes.
            ordered_scores = OrderedDict()
            ordered_scores.update({automl.objective.name: score})
            ordered_scores.update(scores)
            ordered_scores.update({"# Training": y_train.shape[0]})
            ordered_scores.update({"# Validation": y_valid.shape[0]})

            evaluation_entry = {
                "all_objective_scores": ordered_scores,
                "score": score,
                'binary_classification_threshold': None
            }
            if is_binary(
                    automl.problem_type
            ) and cv_pipeline is not None and cv_pipeline.threshold is not None:
                evaluation_entry[
                    'binary_classification_threshold'] = cv_pipeline.threshold
            cv_data.append(evaluation_entry)
        training_time = time.time() - start
        cv_scores = pd.Series([fold['score'] for fold in cv_data])
        cv_score_mean = cv_scores.mean()
        logger.info(
            f"\tFinished cross validation - mean {automl.objective.name}: {cv_score_mean:.3f}"
        )
        return {
            'cv_data': cv_data,
            'training_time': training_time,
            'cv_scores': cv_scores,
            'cv_score_mean': cv_score_mean
        }