def predict(self, X, y=None, objective=None):
    """Make predictions using selected features.

    Arguments:
        X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features].
            ``None`` is replaced by an empty DataFrame.
        y (ww.DataColumn, pd.Series, np.ndarray, None): The target training targets of length [n_samples].
            When ``None`` it is passed through unconverted.
        objective (Object or string): The objective to use to make predictions. Not read by this method.

    Returns:
        pd.Series: Predicted values, renamed to ``self.input_target_name`` and passed through
            ``pad_with_nans`` with the number of rows by which the feature matrix exceeds the predictions.
    """
    if X is None:
        X = pd.DataFrame()
    X = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())

    # y is optional (default None); converting it unconditionally would fail on
    # ``.to_series()``, so only convert when a target was actually supplied.
    if y is not None:
        y = _convert_to_woodwork_structure(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

    features = self.compute_estimator_features(X, y)
    features_no_nan, y = drop_rows_with_nans(features, y)

    # Only hand the target to the estimator when it declares that it needs it
    y_arg = y if self.estimator.predict_uses_y else None
    predictions = self.estimator.predict(features_no_nan, y_arg)
    predictions = predictions.rename(self.input_target_name)

    n_missing_rows = max(0, features.shape[0] - predictions.shape[0])
    return pad_with_nans(predictions, n_missing_rows)
def score(self, X, y, objectives):
    """Score the pipeline's predictions on each of the requested objectives.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
            ``None`` is replaced by an empty DataFrame.
        y (pd.Series, ww.DataColumn): True labels of length [n_samples]
        objectives (list): Non-empty list of objectives to score on

    Returns:
        dict: Ordered dictionary of objective scores
    """
    if X is None:
        X = pd.DataFrame()

    # Normalize both inputs to pandas objects before predicting and scoring
    X_ww = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    y_ww = _convert_to_woodwork_structure(y)
    y = _convert_woodwork_types_wrapper(y_ww.to_series())

    y_predicted = self.predict(X, y)
    # Shift the target by the pipeline's gap before comparing with predictions
    y_shifted = y.shift(-self.gap)
    objective_instances = [
        get_objective(objective, return_instance=True)
        for objective in objectives
    ]
    y_shifted, y_predicted = drop_rows_with_nans(y_shifted, y_predicted)
    return self._score_all_objectives(
        X,
        y_shifted,
        y_predicted,
        y_pred_proba=None,
        objectives=objective_instances,
    )
def score(self, X, y, objectives):
    """Score predictions and predicted probabilities on each of the requested objectives.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series): True labels of length [n_samples]
        objectives (list): Non-empty list of objectives to score on

    Returns:
        dict: Ordered dictionary of objective scores
    """
    X, y = self._convert_to_woodwork(X, y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    objectives = [
        get_objective(objective, return_instance=True)
        for objective in objectives
    ]

    # Encode the target, then shift it by the pipeline's gap
    targets_shifted = self._encode_targets(y).shift(-self.gap)

    labels, probabilities = self._compute_predictions(X, y, objectives, time_series=True)
    # Either output may be None; convert only the ones that were produced
    if labels is not None:
        labels = _convert_woodwork_types_wrapper(labels.to_series())
    if probabilities is not None:
        probabilities = _convert_woodwork_types_wrapper(probabilities.to_dataframe())

    targets_shifted, labels, probabilities = drop_rows_with_nans(targets_shifted, labels, probabilities)
    return self._score_all_objectives(
        X,
        targets_shifted,
        labels,
        y_pred_proba=probabilities,
        objectives=objectives,
    )
def validate(self, X, y):
    """Check if the target or any of the features have no variance (1 unique value).

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
        y (ww.DataColumn, pd.Series, np.ndarray): The target data.

    Returns:
        dict: dict of warnings/errors corresponding to features or target with no variance.
    """
    messages = {"warnings": [], "errors": []}

    X = _convert_woodwork_types_wrapper(_convert_to_woodwork_structure(X).to_dataframe())
    y = _convert_woodwork_types_wrapper(_convert_to_woodwork_structure(y).to_series())

    unique_counts = X.nunique(dropna=self._dropnan).to_dict()
    any_nulls = X.isnull().any().to_dict()

    # Feature columns: one check per column
    for col_name, n_unique in unique_counts.items():
        feature_message = self._check_for_errors(col_name, n_unique, any_nulls[col_name])
        if feature_message:
            DataCheck._add_message(feature_message, messages)

    # Target: fall back to "Y" when the series has no (truthy) name
    y_name = y.name or "Y"
    target_message = self._check_for_errors(y_name,
                                            y.nunique(dropna=self._dropnan),
                                            y.isnull().any())
    if target_message:
        DataCheck._add_message(target_message, messages)
    return messages
def fit_transform(self, X, y=None):
    """Fits on X and transforms X

    Arguments:
        X (pd.DataFrame): Data to fit and transform
        y (pd.DataFrame): Target data

    Returns:
        pd.DataFrame: Transformed X
    """
    try:
        X2 = _convert_to_woodwork_structure(X)
        y2 = _convert_to_woodwork_structure(y)
        X2 = _convert_woodwork_types_wrapper(X2.to_dataframe())
        y2 = _convert_woodwork_types_wrapper(y2.to_series())
        X_t = self._component_obj.fit_transform(X2, y2)
    except AttributeError:
        # Any AttributeError raised above (e.g. a component object without
        # fit_transform) falls back to separate fit and transform calls on the
        # original inputs. Errors raised by the fallback propagate unchanged.
        self.fit(X, y)
        X_t = self.transform(X, y)

    # Re-attach the input's labels when the result came back as a non-DataFrame
    if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame):
        return pd.DataFrame(X_t, columns=X.columns, index=X.index)
    return pd.DataFrame(X_t)
def _manage_woodwork(self, X, y=None):
    """Convert the input features, and the target when given, to pandas data structures.

    Arguments:
        X: Input features; converted to a woodwork structure and then to a DataFrame.
        y: Optional target; converted to a Series when not None, otherwise returned as None.

    Returns:
        tuple: The converted (X, y) pair.
    """
    X_ww = _convert_to_woodwork_structure(X)
    X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    if y is None:
        return X_pd, y
    y_ww = _convert_to_woodwork_structure(y)
    y_pd = _convert_woodwork_types_wrapper(y_ww.to_series())
    return X_pd, y_pd
def _calculate_pearson(self, X, y):
    """Return the labels of numeric/boolean columns whose absolute correlation with y meets the threshold.

    Arguments:
        X: Features exposing ``select`` and ``to_dataframe`` (presumably a ww.DataTable -- confirm against caller).
        y: Target exposing ``logical_type`` and ``to_series`` (presumably a ww.DataColumn -- confirm against caller).

    Returns:
        list: Column labels for which ``abs(y.corr(col)) >= self.pct_corr_threshold``. Empty when the
            target's logical type is not in ``numeric_and_boolean_ww`` or no such columns are selected.
    """
    X_num = X.select(include=numeric_and_boolean_ww)
    if y.logical_type not in numeric_and_boolean_ww or len(X_num.columns) == 0:
        return []

    X_num = _convert_woodwork_types_wrapper(X_num.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    # DataFrame.items() replaces iteritems(), which was deprecated and later removed from pandas
    return [
        label
        for label, col in X_num.items()
        if abs(y.corr(col)) >= self.pct_corr_threshold
    ]
def fit(self, X, y=None):
    """Fit the wrapped component object, passing the 'category' columns as categorical features.

    Arguments:
        X: Input features.
        y: Target data.

    Returns:
        self
    """
    X_ww = _convert_to_woodwork_structure(X)
    # Column names must be read from the woodwork structure before it is turned into pandas
    categorical_names = list(X_ww.select('category').columns)
    X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    y_ww = _convert_to_woodwork_structure(y)
    y_pd = _convert_woodwork_types_wrapper(y_ww.to_series())

    self._component_obj.fit(X_pd, y_pd, silent=True, cat_features=categorical_names)
    return self
def transform(self, X, y=None):
    """Computes the delayed features for all features in X and y.

    Every feature in X gets one extra output column per delay in the inclusive range
    [1, max_delay]; each such column is the feature shifted forward in time by that delay,
    so with a delay of 3 the value at row n comes from row n-3.

    When y is given (and target delaying is enabled), delayed copies of the target are added too.

    Arguments:
        X (ww.DataTable, pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
        y (ww.DataColumn, pd.Series, or None): Target.

    Returns:
        ww.DataTable: Transformed X.
    """
    if X is None:
        X = pd.DataFrame()

    # Work on pandas objects; categorical column names come from the woodwork structure
    X_ww = _convert_to_woodwork_structure(X)
    categorical_columns = self._get_categorical_columns(X_ww)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    if self.delay_features and len(X) > 0:
        X_categorical = self._encode_X_while_preserving_index(X[categorical_columns])
        # Snapshot the column labels: X is rebound inside the loop as delays are added
        for col_name in list(X.columns):
            if col_name in categorical_columns:
                source = X_categorical[col_name]
            else:
                source = X[col_name]
            delayed = {
                f"{col_name}_delay_{t}": source.shift(t)
                for t in range(1, self.max_delay + 1)
            }
            X = X.assign(**delayed)

    # Delayed copies of the target, when one was passed in
    if self.delay_target and y is not None:
        y = _convert_to_woodwork_structure(y)
        if y.logical_type == logical_types.Categorical:
            y = self._encode_y_while_preserving_index(y)
        else:
            y = _convert_woodwork_types_wrapper(y.to_series())
        delayed_target = {
            f"target_delay_{t}": y.shift(t)
            for t in range(self.start_delay_for_target, self.max_delay + 1)
        }
        X = X.assign(**delayed_target)

    return _convert_to_woodwork_structure(X)
def fit(self, X, y=None):
    """Fit the wrapped component object, label-encoding the target first when it has at most two unique values.

    Arguments:
        X: Input features.
        y: Target data.

    Returns:
        self
    """
    X_ww = _convert_to_woodwork_structure(X)
    categorical_names = list(X_ww.select('category').columns)
    X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    y_ww = _convert_to_woodwork_structure(y)
    y_pd = _convert_woodwork_types_wrapper(y_ww.to_series())

    # catboost expects numeric values for binary classification, hence the encoding
    is_binary = y_pd.nunique() <= 2
    if is_binary:
        self._label_encoder = LabelEncoder()
        y_pd = pd.Series(self._label_encoder.fit_transform(y_pd))

    self._component_obj.fit(X_pd, y_pd, silent=True, cat_features=categorical_names)
    return self
def fit_transform(self, X, y=None):
    """Fit the wrapped component object on all-numeric X and return the transformed data.

    Arguments:
        X: Input features; must be all numeric.
        y: Target data.

    Returns:
        pd.DataFrame: Transformed data indexed like X, with columns named ``component_<i>``.

    Raises:
        ValueError: If X is not all numeric.
    """
    X_ww = _convert_to_woodwork_structure(X)
    if not is_all_numeric(X_ww):
        raise ValueError("LDA input must be all numeric")
    y_ww = _convert_to_woodwork_structure(y)

    X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    y_pd = _convert_woodwork_types_wrapper(y_ww.to_series())

    X_t = self._component_obj.fit_transform(X_pd, y_pd)
    component_names = [f"component_{i}" for i in range(X_t.shape[1])]
    return pd.DataFrame(X_t, index=X_pd.index, columns=component_names)
def validate(self, X, y):
    """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation.

    If `method='mutual'`, supports all target and feature types. Otherwise, if `method='pearson'` only supports binary with numeric and boolean dtypes.
    Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1].

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
        y (ww.DataColumn, pd.Series, np.ndarray): The target data

    Returns:
        dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame({
        ...    'leak': [10, 42, 31, 51, 61],
        ...    'x': [42, 54, 12, 64, 12],
        ...    'y': [13, 5, 13, 74, 24],
        ... })
        >>> y = pd.Series([10, 42, 31, 51, 40])
        >>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.95)
        >>> assert target_leakage_check.validate(X, y) == {"warnings": [{"message": "Column 'leak' is 95.0% or more correlated with the target",\
                                                                         "data_check_name": "TargetLeakageDataCheck",\
                                                                         "level": "warning",\
                                                                         "code": "TARGET_LEAKAGE",\
                                                                         "details": {"column": "leak"}}],\
                                                           "errors": []}
    """
    messages = {"warnings": [], "errors": []}

    X = _convert_to_woodwork_structure(X)
    y = _convert_to_woodwork_structure(y)

    if self.method == 'pearson':
        # The Pearson helper works on the woodwork structures directly
        highly_corr_cols = self._calculate_pearson(X, y)
    else:
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        highly_corr_cols = self._calculate_mutual_information(X, y)

    warning_msg = "Column '{}' is {}% or more correlated with the target"
    for col_name in highly_corr_cols:
        warning = DataCheckWarning(
            message=warning_msg.format(col_name, self.pct_corr_threshold * 100),
            data_check_name=self.name,
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={"column": col_name},
        )
        messages["warnings"].append(warning.to_dict())
    return messages
def fit(self, X, y=None):
    """Fit the wrapped component object on the category-encoded features.

    Arguments:
        X: Input features; encoded through ``self._encode_categories`` with ``fit=True``.
        y: Optional target; converted to a pandas Series when given.

    Returns:
        self
    """
    X_encoded = self._encode_categories(X, fit=True)
    target = y
    if target is not None:
        target = _convert_woodwork_types_wrapper(
            _convert_to_woodwork_structure(target).to_series())
    self._component_obj.fit(X_encoded, target)
    return self
def transform(self, X, y=None):
    """One-hot encode the input data.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Features to one-hot encode.
        y (ww.DataColumn, pd.Series): Ignored.

    Returns:
        ww.DataTable: Transformed data, where each categorical feature has been encoded into numerical columns using one-hot encoding.
    """
    X_ww = _convert_to_woodwork_structure(X)
    X_copy = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    X_copy = self._handle_parameter_handle_missing(X_copy)

    # Carry over, untouched, every column that is not being encoded
    X_t = pd.DataFrame()
    passthrough_cols = [col for col in X_copy.columns if col not in self.features_to_encode]
    for col in passthrough_cols:
        X_t = pd.concat([X_t, X_copy[col]], axis=1)
    # pd.concat changes the type of the index, so restore the original one by hand
    if not X_t.empty:
        X_t.index = X_copy.index

    # Encode the categorical columns with the fitted encoder
    if len(self.features_to_encode) > 0:
        encoded = self._encoder.transform(X_copy[self.features_to_encode]).toarray()
        X_cat = pd.DataFrame(encoded, index=X_copy.index)
        X_cat.columns = self.get_feature_names()
        X_t = pd.concat([X_t, X_cat], axis=1)

    return _convert_to_woodwork_structure(X_t)
def transform(self, X, y=None):
    """Replace the text columns of X with features computed from them.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = _convert_to_woodwork_structure(X)
    # Nothing to compute when no features are stored: hand the data back as-is
    if self._features is None or len(self._features) == 0:
        return X_ww

    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    text_columns = self._get_text_columns(X)
    entity_set = self._make_entity_set(X, text_columns)

    X_nlp_primitives = ft.calculate_feature_matrix(features=self._features, entityset=entity_set)
    has_missing = X_nlp_primitives.isnull().any().any()
    if has_missing:
        X_nlp_primitives.fillna(0, inplace=True)

    X_lsa = self._lsa.transform(X[text_columns]).to_dataframe()
    # Align the primitive features with the input rows before concatenating
    X_nlp_primitives.set_index(X.index, inplace=True)

    non_text = X.drop(text_columns, axis=1)
    X_t = pd.concat([non_text, X_nlp_primitives, X_lsa], axis=1)
    return _convert_to_woodwork_structure(X_t)
def transform(self, X, y=None):
    """Add features extracted from the DateTime columns of X and drop those DateTime columns.

    When no features are configured for extraction, the data is returned without dropping anything.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = _convert_to_woodwork_structure(X)
    X_t = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    features_to_extract = self.parameters["features_to_extract"]
    if len(features_to_extract) == 0:
        return _convert_to_woodwork_structure(X_t)

    for date_col in self._date_time_col_names:
        for feature in features_to_extract:
            new_name = f"{date_col}_{feature}"
            extractor = self._function_mappings[feature]
            values, categories = extractor(X_t[date_col], self.encode_as_categories)
            X_t[new_name] = values
            # Remember the category mapping when the extractor produced one
            if categories:
                self._categories[new_name] = categories

    X_t = X_t.drop(self._date_time_col_names, axis=1)
    return _convert_to_woodwork_structure(X_t)
def transform(self, X, y=None):
    """Apply ``self._modify_columns`` to X for the configured columns.

    Arguments:
        X: Input features; converted to a pandas DataFrame and checked by ``_check_input_for_columns``.
        y: Passed through to ``_modify_columns``.

    Returns:
        Whatever ``self._modify_columns`` returns.
    """
    X_ww = _convert_to_woodwork_structure(X)
    X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    self._check_input_for_columns(X_pd)

    # A missing or falsy "columns" parameter means an empty list of columns
    columns = self.parameters.get("columns")
    if not columns:
        columns = []
    return self._modify_columns(columns, X_pd, y)
def transform(self, X, y=None):
    """Impute missing values in X using the fitted numeric and categorical imputers.

    Columns recorded as all-null are dropped first.

    Arguments:
        X (pd.DataFrame): Data to transform
        y (pd.Series, optional): Ignored.

    Returns:
        pd.DataFrame: Transformed X
    """
    X_ww = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    X_null_dropped = X.copy()
    X_null_dropped.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore')
    if X_null_dropped.empty:
        return X_null_dropped

    def _impute_in_place(columns, imputer):
        # Impute one group of columns and write the result back, aligned on the row index
        subset = X_null_dropped[columns]
        imputed = imputer.transform(subset)
        imputed.index = X_null_dropped.index
        X_null_dropped[subset.columns] = imputed

    if self._numeric_cols is not None and len(self._numeric_cols) > 0:
        _impute_in_place(self._numeric_cols, self._numeric_imputer)
    if self._categorical_cols is not None and len(self._categorical_cols) > 0:
        _impute_in_place(self._categorical_cols, self._categorical_imputer)

    return X_null_dropped
def transform(self, X, y=None):
    """Select features from X using the wrapped component object.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform.
        y (ww.DataColumn, pd.Series, optional): Target data. Ignored.

    Returns:
        ww.DataTable: Transformed X

    Raises:
        MethodPropertyNotFoundError: If calling ``transform`` on the component object raises AttributeError.
    """
    X_ww = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    self.input_feature_names = list(X.columns.values)

    try:
        X_t = self._component_obj.transform(X)
    except AttributeError:
        raise MethodPropertyNotFoundError(
            "Feature selector requires a transform method or a component_obj that implements transform"
        )

    # Rebuild a labelled frame and restore each selected column's input dtype
    selected_col_names = self.get_names()
    input_dtypes = X.dtypes.to_dict()
    selected_dtypes = {name: input_dtypes[name] for name in selected_col_names}
    features = pd.DataFrame(X_t, columns=selected_col_names, index=X.index)
    features = features.astype(selected_dtypes)
    return _convert_to_woodwork_structure(features)
def transform(self, X, y=None):
    """Impute missing values in X with the wrapped component object.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    # bool dtype doesn't support nans and sklearn errors if all cols are bool, so skip imputation
    all_bool = (X.dtypes == bool).all()
    if all_bool:
        return _convert_to_woodwork_structure(X)

    # Output labels: the input columns minus those recorded as all-null
    kept = X.drop(self._all_null_cols, axis=1, errors='ignore')

    X_t = pd.DataFrame(self._component_obj.transform(X), columns=kept.columns)
    if not kept.empty:
        X_t = X_t.infer_objects()
        X_t.index = kept.index
    return _convert_to_woodwork_structure(X_t)
def fit(self, X, y=None):
    """Create and fit one SimpleImputer per column of X.

    Each column uses the 'impute_strategy' and 'fill_value' configured for it in
    ``self.impute_strategies``, falling back to ``self.default_impute_strategy`` and ``None``.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit.
        y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]. Ignored.

    Returns:
        self
    """
    X_ww = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    # First build every imputer, then fit them
    self.imputers = dict()
    for column in X.columns:
        column_config = self.impute_strategies.get(column, dict())
        self.imputers[column] = SimpleImputer(
            impute_strategy=column_config.get('impute_strategy', self.default_impute_strategy),
            fill_value=column_config.get('fill_value', None),
        )

    for column, imputer in self.imputers.items():
        # Double brackets keep the single column as a DataFrame
        imputer.fit(X[[column]])
    return self
def fit(self, X, y):
    """Fit the wrapped component object after validating the input and ``n_components``.

    Arguments:
        X: Input features; must be all numeric.
        y: Target data.

    Returns:
        self

    Raises:
        ValueError: If X is not all numeric, or if ``n_components`` exceeds
            min(number of unique target values, number of features).
    """
    X_ww = _convert_to_woodwork_structure(X)
    if not is_all_numeric(X_ww):
        raise ValueError("LDA input must be all numeric")
    y_ww = _convert_to_woodwork_structure(y)

    X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    y_pd = _convert_woodwork_types_wrapper(y_ww.to_series())

    n_components = self.parameters['n_components']
    if n_components is not None:
        upper_bound = min(y_pd.nunique(), X_pd.shape[1])
        if n_components > upper_bound:
            raise ValueError(f"n_components value {n_components} is too large")

    self._component_obj.fit(X_pd, y_pd)
    return self
def fit(self, X, y=None):
    """Fit the numeric and categorical imputers on the matching columns of X.

    Columns that are entirely null are recorded in ``self._all_null_cols`` and left out of both fits.

    Arguments:
        X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
        y (pd.Series, optional): The target training data of length [n_samples]

    Returns:
        self
    """
    X_ww = _convert_to_woodwork_structure(X)
    # Column groups are taken from the woodwork structure before converting to pandas
    cat_cols = list(X_ww.select('category').columns)
    numeric_cols = list(X_ww.select('numeric').columns)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    non_null_columns = X.dropna(axis=1, how='all').columns
    self._all_null_cols = set(X.columns) - set(non_null_columns)
    X_null_dropped = X.drop(self._all_null_cols, axis=1, errors='ignore')

    usable_numeric = [col for col in numeric_cols if col not in self._all_null_cols]
    X_numerics = X_null_dropped[usable_numeric]
    if len(X_numerics.columns) > 0:
        self._numeric_imputer.fit(X_numerics, y)
        self._numeric_cols = X_numerics.columns

    usable_categorical = [col for col in cat_cols if col not in self._all_null_cols]
    X_categorical = X_null_dropped[usable_categorical]
    if len(X_categorical.columns) > 0:
        self._categorical_imputer.fit(X_categorical, y)
        self._categorical_cols = X_categorical.columns

    return self
def fit(self, X, y=None):
    """Fit the wrapped component object on all-numeric X.

    Arguments:
        X: Input features; must be all numeric.
        y: Not used by this method.

    Returns:
        self

    Raises:
        ValueError: If X is not all numeric.
    """
    X_ww = _convert_to_woodwork_structure(X)
    if not is_all_numeric(X_ww):
        raise ValueError("PCA input must be all numeric")
    X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    self._component_obj.fit(X_pd)
    return self
def test_convert_woodwork_types_wrapper_dataframe():
    """Nullable Int64/string/boolean columns are converted to the expected numpy-backed dtypes."""
    # Input: pandas nullable extension dtypes, as both Series and arrays, with and without missing values
    nullable_columns = {
        "Int series": pd.Series([1, 2, 3], dtype="Int64"),
        "Int array": pd.array([1, 2, 3], dtype="Int64"),
        "Int series with nan": pd.Series([1, 2, None], dtype="Int64"),
        "Int array with nan": pd.array([1, 2, None], dtype="Int64"),
        "string series": pd.Series(["a", "b", "a"], dtype="string"),
        "string array": pd.array(["a", "b", "a"], dtype="string"),
        "string series with nan": pd.Series(["a", "b", None], dtype="string"),
        "string array with nan": pd.array(["a", "b", None], dtype="string"),
        "boolean series": pd.Series([True, False, True], dtype="boolean"),
        "boolean array": pd.array([True, False, True], dtype="boolean"),
        "boolean series with nan": pd.Series([True, False, None], dtype="boolean"),
        "boolean array with nan": pd.array([True, False, None], dtype="boolean"),
    }
    # Expected: the same data with the dtypes the wrapper is expected to produce
    expected_columns = {
        "Int series": pd.Series([1, 2, 3], dtype="int64"),
        "Int array": pd.array([1, 2, 3], dtype="int64"),
        "Int series with nan": pd.Series([1, 2, np.nan], dtype="float64"),
        "Int array with nan": pd.array([1, 2, np.nan], dtype="float64"),
        "string series": pd.Series(["a", "b", "a"], dtype="object"),
        "string array": pd.array(["a", "b", "a"], dtype="object"),
        "string series with nan": pd.Series(["a", "b", np.nan], dtype="object"),
        "string array with nan": pd.array(["a", "b", np.nan], dtype="object"),
        "boolean series": pd.Series([True, False, True], dtype="bool"),
        "boolean array": pd.array([True, False, True], dtype="bool"),
        "boolean series with nan": pd.Series([True, False, np.nan], dtype="object"),
        "boolean array with nan": pd.array([True, False, np.nan], dtype="object"),
    }
    X = pd.DataFrame(nullable_columns)
    X_expected = pd.DataFrame(expected_columns)

    converted = _convert_woodwork_types_wrapper(X)
    pd.testing.assert_frame_equal(X_expected, converted)
def transform(self, X, y=None):
    """Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are
        treated as the same.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        pd.DataFrame: Transformed X. Columns recorded as all-null are absent, and the rows keep
            the index of the input data.
    """
    X = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())

    # Convert None to np.nan, since None cannot be properly handled
    X = X.fillna(value=np.nan)

    # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
    if (X.dtypes == bool).all():
        return X

    X_null_dropped = X.drop(self._all_null_cols, axis=1, errors='ignore')
    category_cols = X_null_dropped.select_dtypes(include=['category']).columns

    X_t = pd.DataFrame(self._component_obj.transform(X), columns=X_null_dropped.columns)
    if X_null_dropped.empty:
        return X_t

    # Rebuilding the frame loses the category dtype, so restore it
    if len(category_cols) > 0:
        X_t[category_cols] = X_t[category_cols].astype('category')
    # Rebuilding the frame also resets the row labels; restore the input's index
    # so the output stays aligned with the data that was passed in.
    X_t.index = X_null_dropped.index
    return X_t
def validate(self, X, y=None):
    """Checks if there are any columns in the input that are too unique in the case of classification
    problems or not unique enough in the case of regression problems.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored.  Defaults to None.

    Returns:
        dict: dict with a DataCheckWarning if there are any too unique or not unique enough columns.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'regression_unique_enough': [float(x) for x in range(100)],
        ...    'regression_not_unique_enough': [float(1) for x in range(100)]
        ... })
        >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
        >>> assert uniqueness_check.validate(df) == {"errors": [],\
                                                     "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",\
                                                                   "data_check_name": "UniquenessDataCheck",\
                                                                   "level": "warning",\
                                                                   "code": "NOT_UNIQUE_ENOUGH",\
                                                                   "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}]}
    """
    messages = {"warnings": [], "errors": []}

    X_ww = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    # One uniqueness score per column
    res = X.apply(UniquenessDataCheck.uniqueness_score)

    if is_regression(self.problem_type):
        # Regression: flag columns scoring below the threshold
        for col_name in list(res.index[res < self.threshold]):
            warning = DataCheckWarning(
                message=warning_not_unique_enough.format(col_name, self.problem_type),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
                details={"column": col_name, "uniqueness_score": res.loc[col_name]},
            )
            messages["warnings"].append(warning.to_dict())
    elif is_multiclass(self.problem_type):
        # Multiclass: flag columns scoring above the threshold
        for col_name in list(res.index[res > self.threshold]):
            warning = DataCheckWarning(
                message=warning_too_unique.format(col_name, self.problem_type),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TOO_UNIQUE,
                details={"column": col_name, "uniqueness_score": res.loc[col_name]},
            )
            messages["warnings"].append(warning.to_dict())
    return messages
def fit(self, X, y=None):
    """Record the number of feature columns in X.

    Arguments:
        X: Input features; ``None`` is treated as an empty DataFrame.
        y: Not used by this method.

    Returns:
        self
    """
    features = pd.DataFrame() if X is None else X
    features = _convert_to_woodwork_structure(features)
    features = _convert_woodwork_types_wrapper(features.to_dataframe())
    _, self._num_features = features.shape
    return self
def _encode_labels(self, y):
    """Return y as a pandas Series, label-encoded to int64 unless it already has an integer dtype.

    Arguments:
        y: Target data.

    Returns:
        pd.Series: The converted target. When encoding is applied, the fitted encoder is stored
            on ``self._label_encoder``.
    """
    y_ww = _convert_to_woodwork_structure(y)
    y_encoded = _convert_woodwork_types_wrapper(y_ww.to_series())

    # Integer targets are left exactly as they are
    if is_integer_dtype(y_encoded):
        return y_encoded

    self._label_encoder = LabelEncoder()
    encoded_values = self._label_encoder.fit_transform(y_encoded)
    return pd.Series(encoded_values, dtype='int64')
def fit(self, X, y=None):
    """Record the class values, their relative frequencies and the feature count from the training data.

    Arguments:
        X: Input features.
        y: Target data; required.

    Returns:
        self

    Raises:
        ValueError: If y is None.
    """
    if y is None:
        raise ValueError("Cannot fit Baseline classifier if y is None")

    X_ww = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    y_ww = _convert_to_woodwork_structure(y)
    y = _convert_woodwork_types_wrapper(y_ww.to_series())

    class_values, class_counts = np.unique(y, return_counts=True)
    self._classes = list(class_values)
    self._percentage_freq = class_counts.astype(float) / len(y)
    self._num_unique = len(self._classes)
    self._num_features = X.shape[1]

    # The most frequent value is only stored for the "mode" strategy
    if self.parameters["strategy"] == "mode":
        self._mode = y.mode()[0]
    return self