def _make_stacked_ensemble_pipeline(input_pipelines, problem_type, n_jobs=-1, random_seed=0):
    """Creates a pipeline with a stacked ensemble estimator.

    Arguments:
        input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators
            for the stacked ensemble. This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised.
        problem_type (ProblemType): problem type of pipeline
        n_jobs (int or None): Integer describing level of parallelism used for pipelines.
            None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs)
            are used. Defaults to -1.

    Returns:
        Pipeline with appropriate stacked ensemble estimator.
    """
    # Choose the ensemble estimator (and the component name used as the parameter key)
    # based on whether this is a classification or regression problem.
    if is_classification(problem_type):
        estimator = StackedEnsembleClassifier
        component_name = "Stacked Ensemble Classifier"
    else:
        estimator = StackedEnsembleRegressor
        component_name = "Stacked Ensemble Regressor"
    parameters = {component_name: {"input_pipelines": input_pipelines, "n_jobs": n_jobs}}

    # Map the problem type to the pipeline class and display name.
    pipeline_class_and_name = {
        ProblemTypes.BINARY: (BinaryClassificationPipeline, "Stacked Ensemble Classification Pipeline"),
        ProblemTypes.MULTICLASS: (MulticlassClassificationPipeline, "Stacked Ensemble Classification Pipeline"),
        ProblemTypes.REGRESSION: (RegressionPipeline, "Stacked Ensemble Regression Pipeline"),
    }
    pipeline_class, pipeline_name = pipeline_class_and_name[problem_type]
    return pipeline_class([estimator], parameters=parameters, custom_name=pipeline_name, random_seed=random_seed)
def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Splits data into train and test sets.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data each split into train and test sets
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    # Time series data must keep its ordering, so it is split without shuffling;
    # classification is stratified to preserve class proportions in both splits.
    data_splitter = None
    if is_time_series(problem_type):
        data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed)
    elif is_regression(problem_type):
        data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)

    train_indices, test_indices = next(data_splitter.split(X.to_dataframe(), y.to_series()))
    return X.iloc[train_indices], X.iloc[test_indices], y.iloc[train_indices], y.iloc[test_indices]
def test_type_checks(problem_type):
    """Verify each problem-type predicate returns True exactly for its expected set of ProblemTypes."""
    # Each predicate paired with the exact set of problem types it should accept.
    expected_memberships = [
        (is_regression, [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]),
        (is_binary, [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]),
        (is_multiclass, [ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS]),
        (is_classification, [ProblemTypes.BINARY, ProblemTypes.MULTICLASS,
                             ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS]),
        (is_time_series, [ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS,
                          ProblemTypes.TIME_SERIES_REGRESSION]),
    ]
    for predicate, accepted_types in expected_memberships:
        assert predicate(problem_type) == (problem_type in accepted_types)
def make_pipeline(X, y, estimator, problem_type, parameters=None, custom_hyperparameters=None, sampler_name=None):
    """Given input data, target data, an estimator class and the problem type,
    generates a pipeline class with a preprocessing chain which was recommended based on the inputs.
    The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type.

    Arguments:
        X (pd.DataFrame, ww.DataTable): The input data of shape [n_samples, n_features]
        y (pd.Series, ww.DataColumn): The target data of length [n_samples]
        estimator (Estimator): Estimator for pipeline
        problem_type (ProblemTypes or str): Problem type for pipeline to generate
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
            An empty dictionary or None implies using all default values for component parameters.
        custom_hyperparameters (dictionary): Dictionary of custom hyperparameters,
            with component name as key and dictionary of parameters as the value
        sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems.
            Defaults to None

    Returns:
        PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)
    problem_type = handle_problem_types(problem_type)

    # Validate inputs before building anything.
    if estimator not in get_estimators(problem_type):
        raise ValueError(f"{estimator.name} is not a valid estimator for problem type")
    if sampler_name is not None and not is_classification(problem_type):
        raise ValueError(f"Sampling is unsupported for problem_type {str(problem_type)}")

    # Build the component graph: recommended preprocessing chain followed by the estimator.
    preprocessing_components = _get_preprocessing_components(X, y, problem_type, estimator, sampler_name)
    complete_component_graph = preprocessing_components + [estimator]

    if custom_hyperparameters and not isinstance(custom_hyperparameters, dict):
        raise ValueError(f"if custom_hyperparameters provided, must be dictionary. Received {type(custom_hyperparameters)}")

    base_class = _get_pipeline_base_class(problem_type)
    return base_class(complete_component_graph, parameters=parameters, custom_hyperparameters=custom_hyperparameters)
def train_and_score_pipeline(pipeline, automl, full_X_train, full_y_train):
    """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores

    Arguments:
        pipeline (PipelineBase): The pipeline to score
        automl (AutoMLSearch): The AutoML search, used to access config and for the error callback
        full_X_train (ww.DataTable): Training features
        full_y_train (ww.DataColumn): Training target

    Returns:
        dict: A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details.
    """
    start = time.time()
    cv_data = []
    logger.info("\tStarting cross validation")
    # Convert woodwork structures to plain pandas for the splitter.
    X_pd = _convert_woodwork_types_wrapper(full_X_train.to_dataframe())
    y_pd = _convert_woodwork_types_wrapper(full_y_train.to_series())
    y_pd_encoded = y_pd
    # Encode target for classification problems so that we can support float targets. This is okay because we only use split to get the indices to split on
    if is_classification(automl.problem_type):
        y_mapping = {original_target: encoded_target for (encoded_target, original_target) in enumerate(y_pd.value_counts().index)}
        y_pd_encoded = y_pd.map(y_mapping)
    for i, (train, valid) in enumerate(automl.data_splitter.split(X_pd, y_pd_encoded)):
        if pipeline.model_family == ModelFamily.ENSEMBLE and i > 0:
            # Stacked ensembles do CV internally, so we do not run CV here for performance reasons.
            logger.debug(f"Skipping fold {i} because CV for stacked ensembles is not supported.")
            break
        logger.debug(f"\t\tTraining and scoring on fold {i}")
        # Split the original woodwork data by the indices from the splitter.
        X_train, X_valid = full_X_train.iloc[train], full_X_train.iloc[valid]
        y_train, y_valid = full_y_train.iloc[train], full_y_train.iloc[valid]
        # Fail loudly if the split dropped any target class from either side,
        # since training or scoring on an incomplete class set would be misleading.
        if is_binary(automl.problem_type) or is_multiclass(automl.problem_type):
            diff_train = set(np.setdiff1d(full_y_train.to_series(), y_train.to_series()))
            diff_valid = set(np.setdiff1d(full_y_train.to_series(), y_valid.to_series()))
            diff_string = f"Missing target values in the training set after data split: {diff_train}. " if diff_train else ""
            diff_string += f"Missing target values in the validation set after data split: {diff_valid}." if diff_valid else ""
            if diff_string:
                raise Exception(diff_string)
        objectives_to_score = [automl.objective] + automl.additional_objectives
        cv_pipeline = None
        try:
            logger.debug(f"\t\t\tFold {i}: starting training")
            cv_pipeline = EngineBase.train_pipeline(pipeline, X_train, y_train, automl.optimize_thresholds, automl.objective)
            logger.debug(f"\t\t\tFold {i}: finished training")
            if automl.optimize_thresholds and pipeline.can_tune_threshold_with_objective(automl.objective) and automl.objective.can_optimize_threshold:
                logger.debug(f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})")
            logger.debug(f"\t\t\tFold {i}: Scoring trained pipeline")
            scores = cv_pipeline.score(X_valid, y_valid, objectives=objectives_to_score)
            logger.debug(f"\t\t\tFold {i}: {automl.objective.name} score: {scores[automl.objective.name]:.3f}")
            score = scores[automl.objective.name]
        except Exception as e:
            # Report the failure via the configured callback (if any), then record
            # NaN scores so the fold still appears in the results.
            if automl.error_callback is not None:
                automl.error_callback(exception=e, traceback=traceback.format_tb(sys.exc_info()[2]), automl=automl, fold_num=i, pipeline=pipeline)
            if isinstance(e, PipelineScoreError):
                # Partial failure: keep objectives that scored successfully, NaN the rest.
                nan_scores = {objective: np.nan for objective in e.exceptions}
                scores = {**nan_scores, **e.scored_successfully}
                scores = OrderedDict({o.name: scores[o.name] for o in [automl.objective] + automl.additional_objectives})
                score = scores[automl.objective.name]
            else:
                score = np.nan
                scores = OrderedDict(zip([n.name for n in automl.additional_objectives], [np.nan] * len(automl.additional_objectives)))
        # Assemble per-fold results: primary objective first, then the rest, then fold sizes.
        ordered_scores = OrderedDict()
        ordered_scores.update({automl.objective.name: score})
        ordered_scores.update(scores)
        ordered_scores.update({"# Training": y_train.shape[0]})
        ordered_scores.update({"# Validation": y_valid.shape[0]})
        evaluation_entry = {"all_objective_scores": ordered_scores, "score": score, 'binary_classification_threshold': None}
        if is_binary(automl.problem_type) and cv_pipeline is not None and cv_pipeline.threshold is not None:
            evaluation_entry['binary_classification_threshold'] = cv_pipeline.threshold
        cv_data.append(evaluation_entry)
    training_time = time.time() - start
    cv_scores = pd.Series([fold['score'] for fold in cv_data])
    cv_score_mean = cv_scores.mean()
    logger.info(f"\tFinished cross validation - mean {automl.objective.name}: {cv_score_mean:.3f}")
    return {'cv_data': cv_data, 'training_time': training_time, 'cv_scores': cv_scores, 'cv_score_mean': cv_score_mean}