def _fit_transform_features_helper(self, needs_fitting, X, y=None):
    """Build the input features for the final component of the graph.

    Runs every component except the last one and concatenates the outputs
    of the last component's parents column-wise.

    Arguments:
        needs_fitting (boolean): Determines if components should be fit.
        X (ww.DataTable, pd.DataFrame): Data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series): The target training data of length [n_samples]. Defaults to None.

    Returns:
        ww.DataTable: Transformed values.
    """
    # With at most one component there is nothing upstream to evaluate.
    if len(self.compute_order) <= 1:
        return infer_feature_types(X)

    outputs = self._compute_features(self.compute_order[:-1], X, y=y, fit=needs_fitting)
    last_component = self.compute_order[-1]

    parent_tables = []
    for parent in self.get_parents(last_component):
        # Look the parent up by name, falling back to its "<parent>.x" key.
        fallback = outputs.get(f'{parent}.x')
        output = outputs.get(parent, fallback)
        if isinstance(output, ww.DataColumn):
            # Promote a single column to a one-column table labelled with the parent name.
            single_column = pd.DataFrame(output.to_series(), columns=[parent])
            output = infer_feature_types(single_column)
        parent_tables.append(output)

    frames = [table.to_dataframe() for table in parent_tables]
    concatted = pd.concat(frames, axis=1)

    if needs_fitting:
        # Record which columns feed the final component.
        self.input_feature_names.update({last_component: list(concatted.columns)})
    return infer_feature_types(concatted)
def validate(self, X, y):
    """Check if the target or any of the features have no variance (1 unique value).

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
        y (ww.DataColumn, pd.Series, np.ndarray): The target data.

    Returns:
        dict: dict of warnings/errors corresponding to features or target with no variance.
    """
    results = {"warnings": [], "errors": [], "actions": []}

    X = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())
    y = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())

    # Per-feature unique counts and null flags, keyed by column name.
    unique_counts = X.nunique(dropna=self._dropnan).to_dict()
    any_nulls = X.isnull().any().to_dict()
    for name, n_unique in unique_counts.items():
        message = self._check_for_errors(name, n_unique, any_nulls[name])
        if message:
            DataCheck._add_message(message, results)

    # A target without a (truthy) name is reported as "Y".
    y_name = y.name or "Y"
    target_message = self._check_for_errors(
        y_name, y.nunique(dropna=self._dropnan), y.isnull().any())
    if target_message:
        DataCheck._add_message(target_message, results)
    return results
def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwargs):
    """Load features and target from file.

    Arguments:
        path (str): Path to file or a http/ftp/s3 URL
        index (str): Column for index
        target (str): Column for target
        n_rows (int): Number of rows to return
        drop (list, tuple, str): Column name, or iterable of column names, to drop
            in addition to the target. Defaults to None (drop nothing extra).
        verbose (bool): If True, prints information about features and target
        **kwargs: Additional keyword arguments forwarded to pd.read_csv.

    Returns:
        ww.DataTable, ww.DataColumn: Features matrix and target
    """
    feature_matrix = pd.read_csv(path, index_col=index, nrows=n_rows, **kwargs)

    # Normalise `drop` so that a single name or any iterable (tuple, Index, ...)
    # works, not only a list; list concatenation would raise for those.
    if drop is None or isinstance(drop, str):
        extra_columns = [drop] if drop else []
    else:
        extra_columns = list(drop)
    targets = [target, *extra_columns]

    y = feature_matrix[target]
    X = feature_matrix.drop(columns=targets)

    if verbose:
        # number of features
        print(number_of_features(X.dtypes), end='\n\n')

        # number of total training examples
        info = 'Number of training examples: {}'
        print(info.format(len(X)), end='\n')

        # target distribution
        print(target_distribution(y))

    X = infer_feature_types(X)
    y = infer_feature_types(y)
    return X, y
def transform(self, X, y):
    """Impute missing values in the target.

    'None' and np.nan values are treated as the same.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Features. Ignored.
        y (ww.DataColumn, pd.Series): Target data to impute.

    Returns:
        (ww.DataTable, ww.DataColumn): The original X, transformed y
    """
    if X is not None:
        X = infer_feature_types(X)
    if y is None:
        return X, None

    target_ww = infer_feature_types(y)
    target = _convert_woodwork_types_wrapper(target_ww.to_series())
    target_frame = target.to_frame()

    # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
    all_bool = (target_frame.dtypes == bool).all()
    if all_bool:
        return X, _retain_custom_types_and_initalize_woodwork(target_ww, target)

    imputed = self._component_obj.transform(target_frame)
    if imputed.shape[1] == 0:
        raise RuntimeError("Transformed data is empty")

    # Re-attach the original index to the first (only) imputed column.
    imputed_target = pd.Series(imputed[:, 0], index=target.index)
    return X, _retain_custom_types_and_initalize_woodwork(target_ww, imputed_target)
def transform(self, X, y=None):
    """Impute missing values in the input features.

    'None' and np.nan values are treated as the same.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
    if (X.dtypes == bool).all():
        return infer_feature_types(X)

    # Frame without the all-null columns; only its labels and index are used below.
    kept = X.drop(self._all_null_cols, axis=1, errors='ignore')

    imputed = self._component_obj.transform(X)
    imputed_frame = pd.DataFrame(imputed, columns=kept.columns)
    if kept.empty:
        return infer_feature_types(imputed_frame)

    imputed_frame.index = kept.index
    return _retain_custom_types_and_initalize_woodwork(X_ww, imputed_frame)
def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Splits data into train and test sets.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example, in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
            Accepted for interface compatibility; not read by this function.
        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data each split into train and test sets

    Raises:
        ValueError: If problem_type is not a time series, regression or classification problem.
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    # Time series is checked first so that time series problems are split without shuffling.
    if is_time_series(problem_type):
        data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed)
    elif is_regression(problem_type):
        data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    else:
        # Fail loudly instead of calling .split() on None further down.
        raise ValueError(f"Cannot split data for unsupported problem type: {problem_type}")

    train, test = next(data_splitter.split(X.to_dataframe(), y.to_series()))
    return X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
def predict(self, X, y=None, objective=None):
    """Make predictions using selected features.

    Arguments:
        X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, np.ndarray, None): The target training targets of length [n_samples]
        objective (Object or string): The objective to use to make predictions

    Returns:
        ww.DataColumn: Predicted values.
    """
    if X is None:
        X = pd.DataFrame()
    X = infer_feature_types(X)
    y = infer_feature_types(y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())

    estimator_features = _convert_woodwork_types_wrapper(
        self.compute_estimator_features(X, y).to_dataframe())
    clean_features, y = drop_rows_with_nans(estimator_features, y)

    # Only hand the target to estimators that declare they use it at predict time.
    target_for_estimator = y if self.estimator.predict_uses_y else None
    raw_predictions = self.estimator.predict(clean_features, target_for_estimator)
    predictions = raw_predictions.to_series().rename(self.input_target_name)

    # Pad so the output has as many rows as the computed feature frame.
    n_missing = max(0, estimator_features.shape[0] - predictions.shape[0])
    return infer_feature_types(pad_with_nans(predictions, n_missing))
def transform(self, X, y=None):
    """Create new features from the DateTime columns of X, then drop those DateTime columns.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = infer_feature_types(X)
    X_t = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    requested = self.parameters["features_to_extract"]
    if len(requested) == 0:
        # Nothing to extract: hand the data back unchanged.
        return infer_feature_types(X_t)

    for date_col in self._date_time_col_names:
        for feature in requested:
            extractor = self._function_mappings[feature]
            new_name = f"{date_col}_{feature}"
            values, categories = extractor(X_t[date_col], self.encode_as_categories)
            X_t[new_name] = values
            if categories:
                # Remember the category mapping reported by the extractor.
                self._categories[new_name] = categories

    X_t = X_t.drop(self._date_time_col_names, axis=1)
    return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
def score(self, X, y, objectives):
    """Evaluate model performance on current and additional objectives.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
        y (pd.Series, ww.DataColumn): True labels of length [n_samples]
        objectives (list): Non-empty list of objectives to score on

    Returns:
        dict: Ordered dictionary of objective scores
    """
    # Only converting X for the call to _score_all_objectives
    if X is None:
        X = pd.DataFrame()
    X = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())
    y = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())

    predicted = _convert_woodwork_types_wrapper(self.predict(X, y).to_series())

    # Shift the target by -gap, then drop rows where either series is NaN.
    shifted_target = y.shift(-self.gap)
    objectives = self.create_objectives(objectives)
    shifted_target, predicted = drop_rows_with_nans(shifted_target, predicted)

    return self._score_all_objectives(
        X,
        shifted_target,
        predicted,
        y_pred_proba=None,
        objectives=objectives,
    )
def fit(self, X, y):
    """Fit a time series regression pipeline.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, np.ndarray): The target training targets of length [n_samples]

    Returns:
        self
    """
    if X is None:
        X = pd.DataFrame()
    X = infer_feature_types(X)
    y = infer_feature_types(y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())

    features = self._compute_features_during_fit(X, y).to_dataframe()

    # Shift the target by -gap and discard rows with NaNs on either side.
    shifted_target = y.shift(-self.gap)
    features, shifted_target = drop_rows_with_nans(features, shifted_target)

    self.estimator.fit(features, shifted_target)
    self.input_feature_names = self._component_graph.input_feature_names
    return self
def test_infer_feature_types_dataframe():
    """infer_feature_types round-trips DataFrames and applies explicit feature types."""
    # Default integer columns: values must match, dtype is allowed to differ.
    plain_ints = pd.DataFrame({0: pd.Series([1, 2]), 1: pd.Series([3, 4])})
    round_tripped = infer_feature_types(plain_ints).to_dataframe()
    pd.testing.assert_frame_equal(plain_ints, round_tripped, check_dtype=False)

    # Nullable integer columns must come back exactly as given.
    X_pd = pd.DataFrame({
        0: pd.Series([1, 2], dtype="Int64"),
        1: pd.Series([3, 4], dtype="Int64"),
    })
    pd.testing.assert_frame_equal(X_pd, infer_feature_types(X_pd).to_dataframe())

    # Column 0 forced to categorical, by string name and by logical type class.
    X_expected = X_pd.copy()
    X_expected[0] = X_expected[0].astype("category")
    for categorical_spec in ("categorical", ww.logical_types.Categorical):
        converted = infer_feature_types(X_pd, {0: categorical_spec}).to_dataframe()
        pd.testing.assert_frame_equal(X_expected, converted)
def validate(self, X, y=None):
    """Run every data check on the input data and merge their results.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, np.ndarray): The target data of length [n_samples]

    Returns:
        dict: Dictionary containing DataCheckMessage objects
    """
    warnings, errors, actions = [], [], []

    X = infer_feature_types(X)
    # Columns selected as 'index' are removed before running the checks.
    index_columns = list(X.select('index').columns)
    X = X.drop(index_columns)
    if y is not None:
        y = infer_feature_types(y)

    for data_check in self.data_checks:
        check_result = data_check.validate(X, y)
        warnings.extend(check_result["warnings"])
        errors.extend(check_result["errors"])
        # Actions are de-duplicated while preserving first-seen order.
        for action in check_result["actions"]:
            if action not in actions:
                actions.append(action)

    return {"warnings": warnings, "errors": errors, "actions": actions}
def _manage_woodwork(self, X, y=None):
    """Function to convert the input and target data to Pandas data structures.

    Arguments:
        X: Input features; always converted.
        y: Target data; converted only when it is not None.

    Returns:
        tuple: The converted X and the converted y (or None).
    """
    features = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())
    if y is None:
        return features, y
    target = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())
    return features, target
def predict_proba(self, X, y=None):
    """Return one-hot "probabilities" built from the baseline predictions.

    Each non-NaN prediction gets probability 1 in the column equal to its
    integer value; the result is padded with NaN rows up to the length of y.

    Arguments:
        X: Features, forwarded to predict().
        y: Target data. Required.

    Returns:
        The padded probability table, passed through infer_feature_types.

    Raises:
        ValueError: If y is None.
    """
    if y is None:
        raise ValueError("Cannot predict Time Series Baseline Estimator if y is None")
    y = infer_feature_types(y)
    y = _convert_woodwork_types_wrapper(y.to_series())

    preds = self.predict(X, y).to_series().dropna(axis=0, how='any').astype('int')

    # One column per value 0..max(y). The cast matters: a float-typed target
    # yields a float maximum, and np.zeros rejects non-integer dimensions.
    n_columns = int(y.max()) + 1
    proba_arr = np.zeros((len(preds), n_columns))
    proba_arr[np.arange(len(preds)), preds] = 1

    padded = pad_with_nans(pd.DataFrame(proba_arr), len(y) - len(preds))
    return infer_feature_types(padded)
def predict(self, X, y=None):
    """Predict using the target itself.

    When gap is 0 the target is shifted forward by one row; otherwise it is
    returned unshifted.

    Arguments:
        X: Features. Not used.
        y: Target data. Required.

    Returns:
        The (possibly shifted) target, passed through infer_feature_types.

    Raises:
        ValueError: If y is None.
    """
    if y is None:
        raise ValueError("Cannot predict Time Series Baseline Estimator if y is None")

    target = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())
    prediction = target.shift(periods=1) if self.gap == 0 else target
    return infer_feature_types(prediction)
def predict(self, X):
    """Predict with the wrapped component and decode labels when an encoder is set.

    Arguments:
        X: Features to predict on.

    Returns:
        Predictions, passed through infer_feature_types.
    """
    features = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())
    predictions = self._component_obj.predict(features)

    # Collapse an (n, 1) result to one dimension.
    is_single_column = predictions.ndim == 2 and predictions.shape[1] == 1
    if is_single_column:
        predictions = predictions.flatten()

    if self._label_encoder:
        # Map integer-coded predictions back through the label encoder.
        encoded = predictions.astype(np.int64)
        predictions = self._label_encoder.inverse_transform(encoded)
    return infer_feature_types(predictions)
def transform(self, X, y=None):
    """Apply the wrapped component's transform, keeping X's columns and index.

    Arguments:
        X: Features to transform.
        y: Optional target; converted and forwarded to the wrapped component when given.

    Returns:
        Transformed data with custom types retained (Categorical is ignored).
    """
    X_ww = infer_feature_types(X)
    features = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    if y is not None:
        y = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())

    transformed = self._component_obj.transform(features, y)
    transformed_frame = pd.DataFrame(
        transformed, columns=features.columns, index=features.index)
    return _retain_custom_types_and_initalize_woodwork(
        X_ww, transformed_frame, ltypes_to_ignore=[Categorical])
def fit_transform(self, X, y=None):
    """Fit the wrapped component on X and y and return the transformed X.

    Output columns are named "component_0", "component_1", ...

    Arguments:
        X: Input features; must be all numeric.
        y: Target data.

    Returns:
        Transformed data, indexed like X.

    Raises:
        ValueError: If X is not all numeric.
    """
    X_ww = infer_feature_types(X)
    if not is_all_numeric(X_ww):
        raise ValueError("LDA input must be all numeric")
    y = infer_feature_types(y)

    features = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    target = _convert_woodwork_types_wrapper(y.to_series())

    projected = self._component_obj.fit_transform(features, target)
    component_names = [f"component_{i}" for i in range(projected.shape[1])]
    projected_frame = pd.DataFrame(projected, index=features.index, columns=component_names)
    return _retain_custom_types_and_initalize_woodwork(X_ww, projected_frame)
def predict(self, X):
    """Produce baseline predictions according to the configured strategy.

    "mode" repeats the stored mode; "random" samples the stored classes;
    any other strategy samples the classes weighted by the stored frequencies.

    Arguments:
        X: Features; only the number of rows is used.

    Returns:
        Predictions, passed through infer_feature_types.
    """
    X = infer_feature_types(X)
    strategy = self.parameters["strategy"]

    if strategy == "mode":
        predictions = pd.Series([self._mode] * len(X))
    else:
        rng = get_random_state(self.random_seed)
        if strategy == "random":
            predictions = rng.choice(self._classes, len(X))
        else:
            predictions = rng.choice(self._classes, len(X), p=self._percentage_freq)
    return infer_feature_types(predictions)
def transform(self, X, y=None):
    """Computes the delayed features for all features in X and y.

    For each feature in X, a column is added to the output for each delay in
    the (inclusive) range [1, max_delay]. Each delayed feature is the original
    feature shifted forward in time by the delay amount: with a delay of 3,
    the value at row n is taken from row n-3 of that feature.

    If y is not None, delayed values of the target are added as well.

    Arguments:
        X (ww.DataTable, pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
        y (ww.DataColumn, pd.Series, or None): Target.

    Returns:
        ww.DataTable: Transformed X.
    """
    if X is None:
        X = pd.DataFrame()

    # Normalize the data into pandas objects
    X_ww = infer_feature_types(X)
    categorical_columns = self._get_categorical_columns(X_ww)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    if self.delay_features and len(X) > 0:
        encoded_categoricals = self._encode_X_while_preserving_index(X[categorical_columns])
        # Snapshot the column labels up front; X is rebound on every iteration.
        for col_name in list(X.columns):
            if col_name in categorical_columns:
                source_col = encoded_categoricals[col_name]
            else:
                source_col = X[col_name]
            delayed = {
                f"{col_name}_delay_{t}": source_col.shift(t)
                for t in range(1, self.max_delay + 1)
            }
            X = X.assign(**delayed)

    # Handle cases where the target was passed in
    if self.delay_target and y is not None:
        y = infer_feature_types(y)
        if y.logical_type == logical_types.Categorical:
            y = self._encode_y_while_preserving_index(y)
        else:
            y = _convert_woodwork_types_wrapper(y.to_series())
        delayed_target = {
            f"target_delay_{t}": y.shift(t)
            for t in range(self.start_delay_for_target, self.max_delay + 1)
        }
        X = X.assign(**delayed_target)

    return _retain_custom_types_and_initalize_woodwork(X_ww, X)
def fit(self, X, y=None):
    """Store the baseline prediction value and the number of input features.

    The prediction value is the mean or median of y, depending on the
    "strategy" parameter.

    Arguments:
        X: Input features; only the number of columns is recorded.
        y: Target data. Required.

    Returns:
        self

    Raises:
        ValueError: If y is None.
    """
    if y is None:
        raise ValueError("Cannot fit Baseline regressor if y is None")

    X = infer_feature_types(X)
    target = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())

    strategy = self.parameters["strategy"]
    if strategy == "mean":
        self._prediction_value = target.mean()
    elif strategy == "median":
        self._prediction_value = target.median()

    self._num_features = X.shape[1]
    return self
def predict(self, X):
    """Make predictions using selected features.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data of shape [n_samples, n_features]

    Returns:
        ww.DataColumn: Predicted values.
    """
    # An empty graph has nothing to run; hand the input back.
    if len(self.compute_order) == 0:
        return infer_feature_types(X)

    last_component = self.compute_order[-1]
    outputs = self._compute_features(self.compute_order, X)

    # Look the final output up by name, falling back to its "<name>.x" key.
    fallback = outputs.get(f'{last_component}.x')
    prediction = outputs.get(last_component, fallback)
    return infer_feature_types(prediction)
def test_search(mock_automl_search, mock_data_checks_validate, X_y_binary):
    """search() returns the AutoMLSearch object and the data check results when only warnings are present."""
    X, y = X_y_binary

    # Simplified payload: not the full data check schema, but sufficient for search().
    data_check_results_expected = {'warnings': ['Warning 1', 'Warning 2']}
    mock_data_checks_validate.return_value = data_check_results_expected

    automl, data_check_results = search(X_train=X, y_train=y, problem_type='binary')

    assert isinstance(automl, AutoMLSearch)
    assert data_check_results is data_check_results_expected

    # Data checks ran exactly once, on the woodwork-converted inputs.
    mock_data_checks_validate.assert_called_once()
    expected_X = infer_feature_types(X)
    expected_y = infer_feature_types(y)
    mock_data_checks_validate.assert_called_with(expected_X, y=expected_y)
    mock_automl_search.assert_called_once()
def fit(self, X, y):
    """Fit the wrapped component after validating the input and n_components.

    Arguments:
        X: Input features; must be all numeric.
        y: Target data.

    Returns:
        self

    Raises:
        ValueError: If X is not all numeric, or if n_components exceeds
            min(number of classes, number of features).
    """
    X = infer_feature_types(X)
    if not is_all_numeric(X):
        raise ValueError("LDA input must be all numeric")
    y = infer_feature_types(y)

    features = _convert_woodwork_types_wrapper(X.to_dataframe())
    target = _convert_woodwork_types_wrapper(y.to_series())

    # n_components may not exceed the smaller of the class count and feature count.
    upper_bound = min(target.nunique(), features.shape[1])
    n_components = self.parameters['n_components']
    if n_components is not None and n_components > upper_bound:
        raise ValueError(f"n_components value {n_components} is too large")

    self._component_obj.fit(features, target)
    return self
def predict(self, X, objective=None):
    """Make predictions using selected features.

    Arguments:
        X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
        objective (Object or string): The objective to use to make predictions

    Returns:
        ww.DataColumn: Predicted values.
    """
    features = infer_feature_types(X)
    predicted = self._component_graph.predict(features).to_series()
    # Label the predictions with the stored target name.
    predicted.name = self.input_target_name
    return infer_feature_types(predicted)
def test_infer_feature_types_series():
    """infer_feature_types round-trips Series and applies an explicit feature type."""
    # Plain integers are expected to come back as nullable Int64.
    plain_ints = pd.Series([1, 2, 3, 4])
    pd.testing.assert_series_equal(
        plain_ints.astype("Int64"), infer_feature_types(plain_ints).to_series())

    # Nullable integers are expected to come back unchanged.
    nullable_ints = pd.Series([1, 2, 3, 4], dtype="Int64")
    pd.testing.assert_series_equal(
        nullable_ints, infer_feature_types(nullable_ints).to_series())

    # Categorical requested by string name and by logical type class.
    for categorical_spec in ("categorical", ww.logical_types.Categorical):
        X_pd = pd.Series([1, 2, 3, 4], dtype="Int64")
        X_expected = X_pd.astype("category")
        converted = infer_feature_types(X_pd, categorical_spec).to_series()
        pd.testing.assert_series_equal(X_expected, converted)
def test_search_data_check_error(mock_automl_search, mock_data_checks_validate, X_y_binary):
    """search() returns no AutoMLSearch object and skips the search when data checks report errors."""
    X, y = X_y_binary

    # Simplified payload: not the full data check schema, but enough to trigger the error path in search().
    data_check_results_expected = {'errors': ['Error 1', 'Error 2']}
    mock_data_checks_validate.return_value = data_check_results_expected

    automl, data_check_results = search(X_train=X, y_train=y, problem_type='binary')

    assert automl is None
    assert data_check_results == data_check_results_expected

    # Data checks ran exactly once on the converted inputs; the search itself never started.
    mock_data_checks_validate.assert_called_once()
    expected_X = infer_feature_types(X)
    expected_y = infer_feature_types(y)
    mock_data_checks_validate.assert_called_with(expected_X, y=expected_y)
    mock_automl_search.assert_not_called()
def transform(self, X, y=None):
    """Transforms data X by imputing missing values.

    'None' values are converted to np.nan before imputation and are treated as the same.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = infer_feature_types(X)
    frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    # Remove the all-null columns; labels that are not present are ignored.
    frame.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore')

    if not frame.empty:
        numeric_cols = self._numeric_cols
        if numeric_cols is not None and len(numeric_cols) > 0:
            numeric_part = frame[numeric_cols]
            numeric_imputed = self._numeric_imputer.transform(numeric_part).to_dataframe()
            frame[numeric_part.columns] = numeric_imputed

        categorical_cols = self._categorical_cols
        if categorical_cols is not None and len(categorical_cols) > 0:
            categorical_part = frame[categorical_cols]
            categorical_imputed = self._categorical_imputer.transform(categorical_part).to_dataframe()
            frame[categorical_part.columns] = categorical_imputed

    return _retain_custom_types_and_initalize_woodwork(X_ww, frame)
def test_more_top_n_unique_values_large():
    """With top_n=3, a column with more than three unique values keeps only three encoded categories."""
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
        "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
        "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1],
    })
    random_seed = 2

    encoder = OneHotEncoder(top_n=3, random_seed=random_seed)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need make the conversion here too
    X = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())

    # Reproduce the selection for col_1: shuffle the counts, stable-sort them descending, take top_n.
    counts = X["col_1"].value_counts(dropna=False).to_frame()
    counts = counts.sample(frac=1, random_state=random_seed)
    counts = counts.sort_values(["col_1"], ascending=False, kind='mergesort')
    col_1_samples = counts.head(encoder.parameters['top_n']).index.tolist()

    expected_col_names = {
        "col_2_a", "col_2_b", "col_2_c",
        "col_3_a", "col_3_b", "col_3_c",
        "col_4",
    }
    expected_col_names.update("col_1_" + val for val in col_1_samples)

    assert set(X_t.columns) == expected_col_names
def _predict(self, X, y, objective=None, pad=False):
    """Compute predictions, thresholding the second probability column when a threshold is set.

    Arguments:
        X: Input features.
        y: Target data, used when computing the estimator features.
        objective: Optional objective; when a threshold is set, its decision function is used.
        pad (bool): If True, pad the predictions with NaNs up to the number of feature rows.

    Returns:
        Predictions, passed through infer_feature_types.

    Raises:
        ValueError: If the objective is not defined for this pipeline's problem type.
    """
    features = _convert_woodwork_types_wrapper(
        self.compute_estimator_features(X, y).to_dataframe())
    clean_features, clean_target = drop_rows_with_nans(features, y)

    if objective is not None:
        objective = get_objective(objective, return_instance=True)
        if not objective.is_defined_for_problem_type(self.problem_type):
            raise ValueError(
                f"Objective {objective.name} is not defined for time series binary classification."
            )

    threshold = self.threshold
    if threshold is None:
        predictions = self._estimator_predict(clean_features, clean_target).to_series()
    else:
        # Threshold the probabilities in the second column.
        proba = self._estimator_predict_proba(clean_features, clean_target).to_dataframe().iloc[:, 1]
        if objective is None:
            predictions = proba > threshold
        else:
            predictions = objective.decision_function(
                proba, threshold=threshold, X=clean_features)

    if pad:
        n_missing = max(0, features.shape[0] - predictions.shape[0])
        predictions = pad_with_nans(predictions, n_missing)
    return infer_feature_types(predictions)