def fit_resample(self, X, y):
    """Select the indices of the training rows to keep after undersampling.

    Only the target is needed to decide which rows to drop, so ``X`` is accepted
    for interface compatibility but is not converted or inspected.

    Arguments:
        X (pd.DataFrame): Training data to fit and resample. Not read by this method.
        y (pd.Series): Training data targets to fit and resample

    Returns:
        list: Indices (labels of ``y``'s index) to keep for training data
    """
    y_ww = infer_feature_types(y)
    y = _convert_woodwork_types_wrapper(y_ww.to_series())

    # Iterated below as {class label: number of rows of that class to remove}.
    result = self._find_ideal_samples(y)
    indices_to_drop = []
    if len(result):
        # iterate through the classes we need to undersample and remove the number of samples we need to remove
        for key, value in result.items():
            indices = y.index[y == key].values
            indices_to_remove = self.random_state.choice(indices, value, replace=False)
            indices_to_drop.extend(indices_to_remove)
    return list(set(y.index.values).difference(set(indices_to_drop)))
def validate(self, X, y):
    """Flag features that are highly correlated with the target (possible target leakage).

    Correlation is measured with mutual information or Pearson correlation, depending on
    ``self.method``. If `method='mutual'`, supports all target and feature types. Otherwise,
    if `method='pearson'` only supports binary with numeric and boolean dtypes. Pearson
    correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1].

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
        y (ww.DataColumn, pd.Series, np.ndarray): The target data

    Returns:
        dict: dict with "warnings", "errors" and "actions" keys. Each flagged column adds one
            DataCheckWarning dict and one DROP_COL DataCheckAction dict; "errors" stays empty.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame({
        ...    'leak': [10, 42, 31, 51, 61],
        ...    'x': [42, 54, 12, 64, 12],
        ...    'y': [13, 5, 13, 74, 24],
        ... })
        >>> y = pd.Series([10, 42, 31, 51, 40])
        >>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.95)
        >>> assert target_leakage_check.validate(X, y) == {
        ...     "warnings": [{"message": "Column 'leak' is 95.0% or more correlated with the target",
        ...                   "data_check_name": "TargetLeakageDataCheck",
        ...                   "level": "warning",
        ...                   "code": "TARGET_LEAKAGE",
        ...                   "details": {"column": "leak"}}],
        ...     "errors": [],
        ...     "actions": [{"code": "DROP_COL",
        ...                  "metadata": {"column": "leak"}}]}
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    if self.method == 'pearson':
        # The Pearson helper receives the woodwork structures directly.
        leaking_cols = self._calculate_pearson(X, y)
    else:
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        leaking_cols = self._calculate_mutual_information(X, y)

    message_template = "Column '{}' is {}% or more correlated with the target"

    leak_warnings = []
    for col_name in leaking_cols:
        message = message_template.format(col_name, self.pct_corr_threshold * 100)
        warning = DataCheckWarning(message=message,
                                   data_check_name=self.name,
                                   message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                   details={"column": col_name})
        leak_warnings.append(warning.to_dict())

    drop_actions = []
    for col_name in leaking_cols:
        action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                 metadata={"column": col_name})
        drop_actions.append(action.to_dict())

    return {"warnings": leak_warnings, "errors": [], "actions": drop_actions}
def _calculate_pearson(self, X, y):
    """Find numeric/boolean feature columns strongly Pearson-correlated with the target.

    Arguments:
        X (ww.DataTable): Input features; only columns matching ``numeric_and_boolean_ww`` are considered.
        y (ww.DataColumn): Target data.

    Returns:
        list: Labels of the columns whose absolute correlation with ``y`` is at least
            ``self.pct_corr_threshold``. Empty if the target's logical type is not in
            ``numeric_and_boolean_ww`` or if no feature column was selected.
    """
    X_num = X.select(include=numeric_and_boolean_ww)
    if y.logical_type not in numeric_and_boolean_ww or len(X_num.columns) == 0:
        return []
    X_num = _convert_woodwork_types_wrapper(X_num.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    # DataFrame.items() is the supported spelling; DataFrame.iteritems() was deprecated
    # and then removed in pandas 2.0. Both yield (label, column Series) pairs.
    return [label for label, col in X_num.items()
            if abs(y.corr(col)) >= self.pct_corr_threshold]
def fit_resample(self, X, y):
    """Choose which training rows survive undersampling.

    Arguments:
        X (pd.DataFrame): Training data to fit and resample. Not read by this method.
        y (pd.Series): Training data targets to fit and resample

    Returns:
        list: Indices to keep for training data
    """
    target = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())

    # A non-empty sampling dictionary takes precedence over the computed ideal counts.
    if len(self.sampling_ratio_dict):
        removal_counts = self._sampling_dict_to_remove_dict(target)
    else:
        removal_counts = self._find_ideal_samples(target)

    dropped = []
    if len(removal_counts):
        # For every class to undersample, randomly pick (without replacement)
        # the requested number of its rows to discard.
        for class_label, n_to_remove in removal_counts.items():
            class_indices = target.index[target == class_label].values
            dropped.extend(
                self.random_state.choice(class_indices, n_to_remove, replace=False))

    # Everything in the target's index that was not picked for removal is kept.
    kept = set(target.index.values).difference(set(dropped))
    return list(kept)
def validate(self, X, y=None):
    """Warn about columns whose uniqueness score is on the wrong side of the threshold.

    For regression problem types, columns scoring below ``self.threshold`` are reported as
    not unique enough. For multiclass problem types, columns scoring above ``self.threshold``
    are reported as too unique. Other problem types produce no warnings.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Defaults to None.

    Returns:
        dict: dict with a DataCheckWarning if there are any too unique or not unique enough columns.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'regression_unique_enough': [float(x) for x in range(100)],
        ...    'regression_not_unique_enough': [float(1) for x in range(100)]
        ... })
        >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
        >>> assert uniqueness_check.validate(df) == {
        ...     "errors": [],
        ...     "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",
        ...                   "data_check_name": "UniquenessDataCheck",
        ...                   "level": "warning",
        ...                   "code": "NOT_UNIQUE_ENOUGH",
        ...                   "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}],
        ...     "actions": []}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    frame = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())
    # One uniqueness score per column, indexed by column label.
    scores = frame.apply(UniquenessDataCheck.uniqueness_score)

    if is_regression(self.problem_type):
        flagged_cols = list(scores.index[scores < self.threshold])
        for col_name in flagged_cols:
            warning = DataCheckWarning(
                message=warning_not_unique_enough.format(col_name, self.problem_type),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
                details={"column": col_name,
                         "uniqueness_score": scores.loc[col_name]})
            results["warnings"].append(warning.to_dict())
    elif is_multiclass(self.problem_type):
        flagged_cols = list(scores.index[scores > self.threshold])
        for col_name in flagged_cols:
            warning = DataCheckWarning(
                message=warning_too_unique.format(col_name, self.problem_type),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TOO_UNIQUE,
                details={"column": col_name,
                         "uniqueness_score": scores.loc[col_name]})
            results["warnings"].append(warning.to_dict())

    return results
def _prepare_data(self, X, y):
    """Produce both woodwork and pandas versions of the features and target.

    Arguments:
        X (ww.DataTable): Training features
        y (ww.DataColumn): Target features

    Returns:
        ww.DataTable, ww.DataColumn, pd.DataFrame, pd.Series: Prepared X and y data, both woodwork and pandas

    Raises:
        ValueError: If ``y`` is None.
    """
    features_ww = infer_feature_types(X)
    if y is None:
        raise ValueError("y cannot be none")
    target_ww = infer_feature_types(y)

    features_pd = _convert_woodwork_types_wrapper(features_ww.to_dataframe())
    target_pd = _convert_woodwork_types_wrapper(target_ww.to_series())
    return features_ww, target_ww, features_pd, target_pd
def split(self, X, y):
    """Generate train/test index pairs, applying the data sampler to each training fold.

    Arguments:
        X (ww.DataTable): DataTable of points to split
        y (ww.DataTable): DataColumn of points to split

    Yields:
        iterator: For each fold of ``self.splitter``, an iterator over two items: the
            training indices that remain after sampling, and the test indices as
            produced by the splitter.
    """
    features = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())
    target = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())

    # Series whose values are the target's index labels and whose own index is positional.
    label_lookup = pd.Series(target.index)

    for train, test in self.splitter.split(features, target):
        fold_features = features.iloc[train]
        fold_target = target.iloc[train]
        kept_labels = self.sampler.fit_resample(fold_features, fold_target)

        # convert the indices of the y column into index indices of the original pre-split y
        kept_mask = label_lookup.isin(kept_labels)
        train_indices = label_lookup[kept_mask].dropna().index.values.tolist()
        yield iter([train_indices, test])
def validate(self, X, y=None):
    """Warn about columns whose sparsity score falls below the sparsity threshold.

    Calculates what percentage of each column's unique values exceed the count threshold
    and compares that percentage to the sparsity threshold stored in the class instance.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

    Returns:
        dict: dict with a DataCheckWarning and a DROP_COL action for each sparse column.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'sparse': [float(x) for x in range(100)],
        ...    'not_sparse': [float(1) for x in range(100)]
        ... })
        >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=0.5, unique_count_threshold=10)
        >>> assert sparsity_check.validate(df) == {
        ...     "errors": [],
        ...     "warnings": [{"message": "Input columns (sparse) for multiclass problem type are too sparse.",
        ...                   "data_check_name": "SparsityDataCheck",
        ...                   "level": "warning",
        ...                   "code": "TOO_SPARSE",
        ...                   "details": {"column": "sparse", 'sparsity_score': 0.0}}],
        ...     "actions": [{"code": "DROP_COL",
        ...                  "metadata": {"column": "sparse"}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    frame = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())
    # One sparsity score per column, indexed by column label.
    scores = frame.apply(SparsityDataCheck.sparsity_score,
                         count_threshold=self.unique_count_threshold)
    too_sparse_cols = list(scores.index[scores < self.threshold])

    for col_name in too_sparse_cols:
        warning = DataCheckWarning(
            message=warning_too_unique.format(col_name, self.problem_type),
            data_check_name=self.name,
            message_code=DataCheckMessageCode.TOO_SPARSE,
            details={"column": col_name,
                     "sparsity_score": scores.loc[col_name]})
        results["warnings"].append(warning.to_dict())

    for col_name in too_sparse_cols:
        action = DataCheckAction(action_code=DataCheckActionCode.DROP_COL,
                                 metadata={"column": col_name})
        results["actions"].append(action.to_dict())

    return results
def transform_sample(self, X, y):
    """Apply the balancing strategy and report which rows to keep.

    Arguments:
        X (ww.DataTable): DataTable of points to split
        y (ww.DataTable): DataColumn of points to split

    Returns:
        list: List of indices to keep
    """
    target = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())

    # Series whose values are the target's index labels and whose own index is positional.
    label_lookup = pd.Series(target.index)
    kept_labels = self.sampler.fit_resample(X, target)

    # convert the indices of the y column into index indices of the original pre-split y
    kept_mask = label_lookup.isin(kept_labels)
    return label_lookup[kept_mask].dropna().index.values.tolist()
def validate(self, X, y=None):
    """Report an error when any datetime column contains NaN values.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Defaults to None.

    Returns:
        dict: dict with a DataCheckError if NaN values are present in datetime columns.

    Example:
        >>> import pandas as pd
        >>> import woodwork as ww
        >>> import numpy as np
        >>> dates = np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))
        >>> dates[0] = np.datetime64('NaT')
        >>> ww_input = ww.DataTable(pd.DataFrame(dates, columns=['index']))
        >>> dt_nan_check = DateTimeNaNDataCheck()
        >>> assert dt_nan_check.validate(ww_input) == {"warnings": [],
        ...                                             "actions": [],
        ...                                             "errors": [DataCheckError(message='Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.',
        ...                                                                     data_check_name=DateTimeNaNDataCheck.name,
        ...                                                                     message_code=DataCheckMessageCode.DATETIME_HAS_NAN,
        ...                                                                     details={"columns": 'index'}).to_dict()]}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    X = infer_feature_types(X)
    datetime_cols = _convert_woodwork_types_wrapper(X.select("datetime").to_dataframe())

    # Boolean flag per datetime column: does it hold at least one missing value?
    has_nan = datetime_cols.isna().any()
    nan_columns = [str(col) for col in datetime_cols.columns[has_nan].tolist()]
    if not nan_columns:
        return results

    # A single name joins to itself, so one join covers both the one- and many-column cases.
    cols_str = ', '.join(nan_columns)
    error = DataCheckError(message=error_contains_nan.format(cols_str),
                           data_check_name=self.name,
                           message_code=DataCheckMessageCode.DATETIME_HAS_NAN,
                           details={"columns": cols_str})
    results["errors"].append(error.to_dict())
    return results
def train_and_score_pipeline(pipeline, automl, full_X_train, full_y_train):
    """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores

    Each fold produced by ``automl.data_splitter`` is trained and scored independently.
    Exceptions raised while training or scoring a fold are caught: they are forwarded to
    ``automl.error_callback`` (when set) and the affected scores are recorded as NaN.

    Arguments:
        pipeline (PipelineBase): The pipeline to score
        automl (AutoMLSearch): The AutoML search, used to access config and for the error callback
        full_X_train (ww.DataTable): Training features
        full_y_train (ww.DataColumn): Training target

    Returns:
        dict: A dict with the keys 'cv_data' (list with one entry per evaluated fold),
            'training_time' (elapsed seconds), 'cv_scores' (pd.Series of per-fold primary
            objective scores) and 'cv_score_mean' (mean of 'cv_scores').

    Raises:
        Exception: For binary or multiclass problems, if a fold's training or validation
            target is missing values that are present in the full target. This check runs
            outside the try block, so it is not routed to the error callback.
    """
    start = time.time()
    cv_data = []
    logger.info("\tStarting cross validation")
    X_pd = _convert_woodwork_types_wrapper(full_X_train.to_dataframe())
    y_pd = _convert_woodwork_types_wrapper(full_y_train.to_series())
    y_pd_encoded = y_pd
    # Encode target for classification problems so that we can support float targets. This is okay because we only use split to get the indices to split on
    if is_classification(automl.problem_type):
        # Map every distinct target value (in value_counts order) to an integer code.
        y_mapping = {
            original_target: encoded_target
            for (encoded_target, original_target) in enumerate(y_pd.value_counts().index)
        }
        y_pd_encoded = y_pd.map(y_mapping)
    for i, (train, valid) in enumerate(
            automl.data_splitter.split(X_pd, y_pd_encoded)):
        if pipeline.model_family == ModelFamily.ENSEMBLE and i > 0:
            # Stacked ensembles do CV internally, so we do not run CV here for performance reasons.
            logger.debug(
                f"Skipping fold {i} because CV for stacked ensembles is not supported."
            )
            break
        logger.debug(f"\t\tTraining and scoring on fold {i}")
        # The splitter worked on pandas data; the resulting positions index the woodwork inputs.
        X_train, X_valid = full_X_train.iloc[train], full_X_train.iloc[
            valid]
        y_train, y_valid = full_y_train.iloc[train], full_y_train.iloc[
            valid]
        if is_binary(automl.problem_type) or is_multiclass(
                automl.problem_type):
            # Target values that occur in the full target but not in this fold's train / validation part.
            diff_train = set(
                np.setdiff1d(full_y_train.to_series(), y_train.to_series()))
            diff_valid = set(
                np.setdiff1d(full_y_train.to_series(), y_valid.to_series()))
            diff_string = f"Missing target values in the training set after data split: {diff_train}. " if diff_train else ""
            diff_string += f"Missing target values in the validation set after data split: {diff_valid}." if diff_valid else ""
            if diff_string:
                raise Exception(diff_string)
        objectives_to_score = [automl.objective
                               ] + automl.additional_objectives
        # Stays None if training fails, which the threshold bookkeeping below relies on.
        cv_pipeline = None
        try:
            logger.debug(f"\t\t\tFold {i}: starting training")
            cv_pipeline = EngineBase.train_pipeline(
                pipeline, X_train, y_train, automl.optimize_thresholds,
                automl.objective)
            logger.debug(f"\t\t\tFold {i}: finished training")
            if automl.optimize_thresholds and pipeline.can_tune_threshold_with_objective(
                    automl.objective
            ) and automl.objective.can_optimize_threshold:
                logger.debug(
                    f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})"
                )
            logger.debug(f"\t\t\tFold {i}: Scoring trained pipeline")
            scores = cv_pipeline.score(X_valid,
                                       y_valid,
                                       objectives=objectives_to_score)
            logger.debug(
                f"\t\t\tFold {i}: {automl.objective.name} score: {scores[automl.objective.name]:.3f}"
            )
            score = scores[automl.objective.name]
        except Exception as e:
            # Report the failure to the user-supplied callback, then fall back to NaN scores.
            if automl.error_callback is not None:
                automl.error_callback(exception=e,
                                      traceback=traceback.format_tb(
                                          sys.exc_info()[2]),
                                      automl=automl,
                                      fold_num=i,
                                      pipeline=pipeline)
            if isinstance(e, PipelineScoreError):
                # Partial failure: objectives listed in e.exceptions become NaN, while
                # the ones in e.scored_successfully keep their values.
                nan_scores = {
                    objective: np.nan
                    for objective in e.exceptions
                }
                scores = {**nan_scores, **e.scored_successfully}
                scores = OrderedDict({
                    o.name: scores[o.name]
                    for o in [automl.objective] +
                    automl.additional_objectives
                })
                score = scores[automl.objective.name]
            else:
                # Any other failure: every objective is NaN. `scores` only holds the
                # additional objectives here; the primary one is added below via `score`.
                score = np.nan
                scores = OrderedDict(
                    zip([n.name for n in automl.additional_objectives],
                        [np.nan] * len(automl.additional_objectives)))
        # Primary objective first, then the remaining scores, then the fold sizes.
        ordered_scores = OrderedDict()
        ordered_scores.update({automl.objective.name: score})
        ordered_scores.update(scores)
        ordered_scores.update({"# Training": y_train.shape[0]})
        ordered_scores.update({"# Validation": y_valid.shape[0]})
        evaluation_entry = {
            "all_objective_scores": ordered_scores,
            "score": score,
            'binary_classification_threshold': None
        }
        if is_binary(
                automl.problem_type
        ) and cv_pipeline is not None and cv_pipeline.threshold is not None:
            evaluation_entry[
                'binary_classification_threshold'] = cv_pipeline.threshold
        cv_data.append(evaluation_entry)
    training_time = time.time() - start
    cv_scores = pd.Series([fold['score'] for fold in cv_data])
    cv_score_mean = cv_scores.mean()
    logger.info(
        f"\tFinished cross validation - mean {automl.objective.name}: {cv_score_mean:.3f}"
    )
    return {
        'cv_data': cv_data,
        'training_time': training_time,
        'cv_scores': cv_scores,
        'cv_score_mean': cv_score_mean
    }
def validate(self, X, y):
    """Check the target for missing or invalid values and for mismatches with the features.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Only used to compare its length
            and index against the target; skipped when None.
        y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

    Returns:
        dict: dict with "warnings", "errors" and "actions" keys, holding DataCheckWarning,
            DataCheckError and DataCheckAction dicts for every problem found in the target data.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
        >>> y = pd.Series([0, 1, None, None])
        >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
        >>> assert target_check.validate(X, y) == {
        ...     "errors": [{"message": "2 row(s) (50.0%) of target values are null",
        ...                 "data_check_name": "InvalidTargetDataCheck",
        ...                 "level": "error",
        ...                 "code": "TARGET_HAS_NULL",
        ...                 "details": {"num_null_rows": 2, "pct_null_rows": 50}}],
        ...     "warnings": [],
        ...     "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}
    errors = results["errors"]
    check_warnings = results["warnings"]

    def _error(message, code, details):
        # Build the dict form of an error attributed to this data check.
        return DataCheckError(message=message,
                              data_check_name=self.name,
                              message_code=code,
                              details=details).to_dict()

    def _warning(message, code, details):
        # Build the dict form of a warning attributed to this data check.
        return DataCheckWarning(message=message,
                                data_check_name=self.name,
                                message_code=code,
                                details=details).to_dict()

    # Without a target nothing else can be checked.
    if y is None:
        errors.append(_error("Target is None",
                             DataCheckMessageCode.TARGET_IS_NONE,
                             {}))
        return results

    y = infer_feature_types(y)

    # Logical type: numeric, boolean and categorical targets are accepted.
    supported_types = numeric_and_boolean_ww + [ww.logical_types.Categorical]
    if y.logical_type not in supported_types:
        valid_type_names = ", ".join(
            [ltype.type_string for ltype in numeric_and_boolean_ww])
        errors.append(_error(
            "Target is unsupported {} type. Valid Woodwork logical types include: {}"
            .format(y.logical_type, valid_type_names),
            DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
            {"unsupported_type": y.logical_type.type_string}))

    # Null values: a fully null (or empty) target stops the check here.
    y_df = _convert_woodwork_types_wrapper(y.to_series())
    null_rows = y_df.isnull()
    if null_rows.all():
        errors.append(_error("Target is either empty or fully null.",
                             DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                             {}))
        return results
    if null_rows.any():
        num_null_rows = null_rows.sum()
        pct_null_rows = null_rows.mean() * 100
        errors.append(_error(
            "{} row(s) ({}%) of target values are null".format(
                num_null_rows, pct_null_rows),
            DataCheckMessageCode.TARGET_HAS_NULL,
            {"num_null_rows": num_null_rows,
             "pct_null_rows": pct_null_rows}))
        if is_regression(self.problem_type):
            impute_strategy = "mean"
        else:
            impute_strategy = "most_frequent"
        results["actions"].append(
            DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                            metadata={
                                "column": None,
                                "is_target": True,
                                "impute_strategy": impute_strategy
                            }).to_dict())

    value_counts = y_df.value_counts()
    unique_values = value_counts.index.tolist()

    # Binary problems need exactly two distinct target values.
    if is_binary(self.problem_type) and len(value_counts) != 2:
        if self.n_unique is None:
            reported_values = unique_values
        else:
            reported_values = unique_values[:min(self.n_unique, len(unique_values))]
        errors.append(_error(
            "Binary class targets require exactly two unique values.",
            DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
            {"target_values": reported_values}))

    # Regression problems need a numeric target.
    if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.semantic_tags:
        errors.append(_error(
            "Target data type should be numeric for regression type problems.",
            DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
            {}))

    # Multiclass problems: class population, class count and class-to-row ratio.
    if is_multiclass(self.problem_type):
        if value_counts.min() <= 1:
            least_populated = value_counts[value_counts <= 1]
            errors.append(_error(
                "Target does not have at least two instances per class which is required for multiclass classification",
                DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                {"least_populated_class_labels": least_populated.index.tolist()}))
        if len(unique_values) <= 2:
            errors.append(_error(
                "Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                {"num_classes": len(unique_values)}))
        num_class_to_num_value_ratio = len(unique_values) / len(y)
        if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold:
            check_warnings.append(_warning(
                "Target has a large number of unique values, could be regression type problem.",
                DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                {"class_to_value_ratio": num_class_to_num_value_ratio}))

    # Non-positive values are only looked for in Integer/Double targets.
    if y.logical_type in [ww.logical_types.Integer, ww.logical_types.Double]:
        any_neg = not (y_df > 0).all()
    else:
        any_neg = None
    if any_neg and self.objective.positive_only:
        offending_count = sum(val <= 0 for val in y_df.values.flatten())
        errors.append(_error(
            f"Target has non-positive values which is not supported for {self.objective.name}",
            DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE,
            {"Count of offending values": offending_count}))

    # The remaining checks compare the target against the features.
    if X is None:
        return results

    X = infer_feature_types(X)
    X_index = list(X.to_dataframe().index)
    y_index = list(y_df.index)
    X_length = len(X_index)
    y_length = len(y_index)

    if X_length != y_length:
        check_warnings.append(_warning(
            "Input target and features have different lengths",
            DataCheckMessageCode.MISMATCHED_LENGTHS,
            {"features_length": X_length,
             "target_length": y_length}))

    if X_index != y_index:
        if set(X_index) == set(y_index):
            # Same labels, different order.
            check_warnings.append(_warning(
                "Input target and features have mismatched indices order",
                DataCheckMessageCode.MISMATCHED_INDICES_ORDER,
                {}))
        else:
            # Report at most ten labels missing from each side.
            index_diff_not_in_X = list(set(y_index) - set(X_index))[:10]
            index_diff_not_in_y = list(set(X_index) - set(y_index))[:10]
            check_warnings.append(_warning(
                "Input target and features have mismatched indices",
                DataCheckMessageCode.MISMATCHED_INDICES,
                {"indices_not_in_features": index_diff_not_in_X,
                 "indices_not_in_target": index_diff_not_in_y}))

    return results