def test_make_component_list_from_actions():
    # NOTE(review): a later test in this file reuses this exact name, so under
    # pytest that definition shadows this one and this test never runs.
    # Consider renaming one of the two.
    assert _make_component_list_from_actions([]) == []

    single_drop = [DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ['some col']})]
    assert _make_component_list_from_actions(single_drop) == [DropColumns(columns=['some col'])]

    # A drop plus a target-impute action should map to one component each,
    # in the same order the actions were given.
    mixed_actions = [
        DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"columns": ['some col']}),
        DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                        metadata={"column": None,
                                  "is_target": True,
                                  "impute_strategy": "most_frequent"}),
    ]
    assert _make_component_list_from_actions(mixed_actions) == [
        DropColumns(columns=['some col']),
        TargetImputer(impute_strategy="most_frequent"),
    ]
def test_highly_null_data_check_input_formats():
    highly_null_check = HighlyNullDataCheck(pct_null_threshold=0.8)

    # An empty frame yields no messages at all.
    assert highly_null_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}

    # Columns 0-2 are >= 80% null in the fixture below; columns 3-4 are not.
    expected = {
        "warnings": [
            DataCheckWarning(message="Column '{}' is 80.0% or more null".format(col),
                             data_check_name=highly_null_data_check_name,
                             message_code=DataCheckMessageCode.HIGHLY_NULL,
                             details={"column": col}).to_dict()
            for col in range(3)
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": col}).to_dict()
            for col in range(3)
        ],
    }
    rows = [[None, None, None, None, 0], [None, None, None, "hi", 5]]

    # Woodwork table, plain 2D list, and numpy array must all validate the same.
    assert highly_null_check.validate(ww.DataTable(pd.DataFrame(rows))) == expected
    assert highly_null_check.validate(rows) == expected
    assert highly_null_check.validate(np.array(rows)) == expected
def test_uniqueness_data_check_warnings():
    # Regression: a constant column is "not unique enough".
    data = pd.DataFrame({
        'regression_unique_enough': [float(i) for i in range(100)],
        'regression_not_unique_enough': [float(1) for i in range(100)],
    })
    regression_check = UniquenessDataCheck(problem_type="regression")
    assert regression_check.validate(data) == {
        "warnings": [DataCheckWarning(
            message="Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",
            data_check_name=uniqueness_data_check_name,
            message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
            details={"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL,
                                    details={"column": 'regression_not_unique_enough'}).to_dict()],
    }

    # Multiclass: a column with too many distinct values is "too unique".
    data = pd.DataFrame({
        'multiclass_too_unique': ["Cats", "Are", "Absolutely", "The", "Best"] * 20,
        'multiclass_not_too_unique': ["Cats", "Cats", "Best", "Best", "Best"] * 20,
    })
    multiclass_check = UniquenessDataCheck(problem_type="multiclass")
    assert multiclass_check.validate(data) == {
        "warnings": [DataCheckWarning(
            message="Input columns (multiclass_too_unique) for multiclass problem type are too unique.",
            data_check_name=uniqueness_data_check_name,
            message_code=DataCheckMessageCode.TOO_UNIQUE,
            # Exact float expected from the uniqueness-score computation.
            details={"column": "multiclass_too_unique", 'uniqueness_score': 0.7999999999999999}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL,
                                    details={"column": 'multiclass_too_unique'}).to_dict()],
    }
def test_sparsity_data_check_warnings():
    data = pd.DataFrame({
        'most_sparse': [float(i) for i in range(10)],  # [0,1,2,3,4,5,6,7,8,9] - all unique
        'more_sparse': [i % 5 for i in range(10)],     # [0,1,2,3,4,0,1,2,3,4] - each value twice
        'sparse': [i % 3 for i in range(10)],          # [0,1,2,0,1,2,0,1,2,0]
        'less_sparse': [i % 2 for i in range(10)],     # [0,1,0,1,0,1,0,1,0,1]
        'not_sparse': [float(1) for i in range(10)],   # [1,1,1,1,1,1,1,1,1,1] - constant
    })
    sparsity_check = SparsityDataCheck(problem_type="multiclass",
                                       threshold=.4,
                                       unique_count_threshold=3)

    # (column, sparsity score) pairs expected to fall below the 0.4 threshold.
    flagged = [('most_sparse', 0),
               ('more_sparse', 0),
               ('sparse', 0.3333333333333333)]
    assert sparsity_check.validate(data) == {
        "warnings": [
            DataCheckWarning(
                message="Input columns ({}) for multiclass problem type are too sparse.".format(col),
                data_check_name=sparsity_data_check_name,
                message_code=DataCheckMessageCode.TOO_SPARSE,
                details={"column": col, 'sparsity_score': score}).to_dict()
            for col, score in flagged
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": col}).to_dict()
            for col, score in flagged
        ],
    }
def test_data_check_action_to_dict():
    # Omitted details and an explicit empty dict serialize identically.
    no_details = DataCheckAction(DataCheckActionCode.DROP_COL)
    empty_details = DataCheckAction(DataCheckActionCode.DROP_COL, details={})
    with_details = DataCheckAction(DataCheckActionCode.DROP_COL,
                                   details={"some detail": ["this is different"]})

    assert no_details.to_dict() == {"code": DataCheckActionCode.DROP_COL.name, "details": {}}
    assert empty_details.to_dict() == {"code": DataCheckActionCode.DROP_COL.name, "details": {}}
    assert with_details.to_dict() == {"code": DataCheckActionCode.DROP_COL.name,
                                      "details": {"some detail": ["this is different"]}}
def test_data_check_action_inequality():
    plain = DataCheckAction(DataCheckActionCode.DROP_COL)
    with_metadata = DataCheckAction(DataCheckActionCode.DROP_COL,
                                    metadata={"metadata": ["this is different"]})

    # Differing metadata makes the actions unequal, symmetrically.
    assert plain != with_metadata
    assert with_metadata != plain
def test_default_data_checks_regression(input_type):
    """End-to-end run of DefaultDataChecks for regression over pandas and Woodwork inputs.

    Relies on module-level `messages` and `expected_actions` fixtures defined
    elsewhere in this file.
    """
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100],
                      'natural_language_nan': [None,
                                               "string_that_is_long_enough_for_natural_language_1",
                                               "string_that_is_long_enough_for_natural_language_2",
                                               "string_that_is_long_enough_for_natural_language_3",
                                               "string_that_is_long_enough_for_natural_language_4"],
                      'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))})
    # FIX: was `X['nan_dt_col'][0] = None` — chained indexing assignment, which
    # raises SettingWithCopyWarning and is silently a no-op under pandas
    # copy-on-write. Single-step .loc assignment is the supported form.
    X.loc[0, 'nan_dt_col'] = None
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)

    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target",
                                           data_check_name="TargetLeakageDataCheck",
                                           message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                           details={"column": "id"}).to_dict()]
    nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
                                               data_check_name="TargetLeakageDataCheck",
                                               message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                               details={"column": "nan_dt_col"}).to_dict()]
    impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                    metadata={"column": None, "is_target": True,
                                              'impute_strategy': 'mean'}).to_dict()
    nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                    metadata={"column": 'nan_dt_col'}).to_dict()
    expected_actions_with_drop_and_impute = (expected_actions[:3]
                                             + [nan_dt_action, impute_action]
                                             + expected_actions[4:])

    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}

    # With a zero-variance target the invalid-target messages are replaced by a
    # no-variance error and the impute action is skipped.
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.",
                                                  data_check_name="NoVarianceDataCheck",
                                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                                  details={"column": "Y"}).to_dict()] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:]
    }

    # Building DataChecks manually from the default classes must behave the
    # same as DefaultDataChecks.
    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}
def test_make_component_list_from_actions():
    # NOTE(review): this reuses the name of an earlier test in the file and
    # therefore shadows it; consider renaming one of the two.
    assert _make_component_list_from_actions([]) == []

    one_action = [DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ['some col']})]
    assert _make_component_list_from_actions(one_action) == [DropColumns(columns=['some col'])]

    # Two actions sharing the same action code each yield their own component.
    repeated_code = [
        DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ['some col']}),
        DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ['some other col']}),
    ]
    assert _make_component_list_from_actions(repeated_code) == [
        DropColumns(columns=['some col']),
        DropColumns(columns=['some other col']),
    ]
def test_data_check_action_attributes():
    # Omitted details argument -> empty dict.
    action = DataCheckAction(DataCheckActionCode.DROP_COL)
    assert action.action_code == DataCheckActionCode.DROP_COL
    assert action.details == {}

    # Explicit empty details (positional) behaves identically.
    action = DataCheckAction(DataCheckActionCode.DROP_COL, {})
    assert action.action_code == DataCheckActionCode.DROP_COL
    assert action.details == {}

    # Non-empty details are stored verbatim.
    action = DataCheckAction(DataCheckActionCode.DROP_COL, details={"columns": [1, 2]})
    assert action.action_code == DataCheckActionCode.DROP_COL
    assert action.details == {"columns": [1, 2]}
def test_data_check_action_equality():
    # Equality with no details: reflexive and symmetric.
    first = DataCheckAction(DataCheckActionCode.DROP_COL)
    second = DataCheckAction(DataCheckActionCode.DROP_COL)
    assert first == first
    assert first == second
    assert second == first

    # Equality with matching details: same properties hold.
    first = DataCheckAction(DataCheckActionCode.DROP_COL, details={'same detail': 'same same same'})
    second = DataCheckAction(DataCheckActionCode.DROP_COL, details={'same detail': 'same same same'})
    assert first == first
    assert first == second
    assert second == first
def test_data_checks_do_not_duplicate_actions(X_y_binary):
    """DataChecks must collapse identical actions reported by multiple checks."""
    X, y = X_y_binary

    class MockDataCheck(DataCheck):
        def validate(self, X, y):
            return {
                "warnings": [],
                "errors": [],
                "actions": [DataCheckAction(DataCheckActionCode.DROP_COL,
                                            metadata={"column": 'col_to_drop'}).to_dict()],
            }

    class MockDataCheckWithSameAction(DataCheck):
        # BUG FIX: this mock previously returned an empty actions list, so the
        # test never actually produced a duplicate and the de-duplication
        # assertion below was vacuous. It must emit the *same* action as
        # MockDataCheck.
        def validate(self, X, y):
            return {
                "warnings": [],
                "errors": [],
                "actions": [DataCheckAction(DataCheckActionCode.DROP_COL,
                                            metadata={"column": 'col_to_drop'}).to_dict()],
            }

    data_checks = DataChecks(data_checks=[MockDataCheck, MockDataCheckWithSameAction])

    # The duplicate action must appear only once in the combined result.
    assert data_checks.validate(X, y) == {
        "warnings": [],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL,
                                    metadata={"column": 'col_to_drop'}).to_dict()],
    }
def validate(self, X, y):
    """Check whether any feature is highly correlated with the target.

    Uses mutual information when `method='mutual'` (supports all target and
    feature types); with `method='pearson'` only binary targets with numeric
    and boolean dtypes are supported. Pearson correlation lies in [-1, 1],
    mutual information in [0, 1].

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
        y (ww.DataColumn, pd.Series, np.ndarray): The target data

    Returns:
        dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame({
        ...    'leak': [10, 42, 31, 51, 61],
        ...    'x': [42, 54, 12, 64, 12],
        ...    'y': [13, 5, 13, 74, 24],
        ... })
        >>> y = pd.Series([10, 42, 31, 51, 40])
        >>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.95)
        >>> assert target_leakage_check.validate(X, y) == {"warnings": [{"message": "Column 'leak' is 95.0% or more correlated with the target",\
                                                                        "data_check_name": "TargetLeakageDataCheck",\
                                                                        "level": "warning",\
                                                                        "code": "TARGET_LEAKAGE",\
                                                                        "details": {"column": "leak"}}],\
                                                           "errors": [],\
                                                           "actions": [{"code": "DROP_COL",\
                                                                        "metadata": {"column": "leak"}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    X = infer_feature_types(X)
    y = infer_feature_types(y)

    if self.method == 'pearson':
        leaky_columns = self._calculate_pearson(X, y)
    else:
        # Mutual information operates on plain pandas objects.
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        leaky_columns = self._calculate_mutual_information(X, y)

    warning_msg = "Column '{}' is {}% or more correlated with the target"
    for col_name in leaky_columns:
        results["warnings"].append(
            DataCheckWarning(message=warning_msg.format(col_name, self.pct_corr_threshold * 100),
                             data_check_name=self.name,
                             message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                             details={"column": col_name}).to_dict())
        results["actions"].append(
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": col_name}).to_dict())
    return results
def test_invalid_target_data_action_for_data_with_null(problem_type):
    """Null target values should produce a TARGET_HAS_NULL error plus an impute action."""
    y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck(problem_type,
                                                   get_default_primary_search_objective(problem_type))
    # Regression imputes with the mean; classification with the mode.
    impute_strategy = "mean" if is_regression(problem_type) else "most_frequent"

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="3 row(s) (30.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 3, "pct_null_rows": 30.0}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                    metadata={"column": None, "is_target": True,
                                              "impute_strategy": impute_strategy}).to_dict()],
    }
    if is_binary(problem_type):
        expected["errors"].append(
            DataCheckError(message="Binary class targets require exactly two unique values.",
                           data_check_name=invalid_targets_data_check_name,
                           message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                           details={"target_values": [0]}).to_dict())
    elif is_multiclass(problem_type):
        # FIX: these message strings carried an f-prefix but contain no
        # placeholders (ruff F541); the prefix has been dropped — the runtime
        # strings are byte-identical.
        expected["errors"].append(
            DataCheckError(message="Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                           data_check_name=invalid_targets_data_check_name,
                           message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                           details={"num_classes": 1}).to_dict())
        expected["warnings"].append(
            DataCheckWarning(message="Target has a large number of unique values, could be regression type problem.",
                             data_check_name=invalid_targets_data_check_name,
                             message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                             details={"class_to_value_ratio": 0.1}).to_dict())

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected
def validate(self, X, y=None):
    """Checks if there are any highly-null columns in the input.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

    Returns:
        dict: dict with a DataCheckWarning if there are any highly-null columns.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'lots_of_null': [None, None, None, None, 5],
        ...    'no_null': [1, 2, 3, 4, 5]
        ... })
        >>> null_check = HighlyNullDataCheck(pct_null_threshold=0.8)
        >>> assert null_check.validate(df) == {"errors": [],\
                                               "warnings": [{"message": "Column 'lots_of_null' is 80.0% or more null",\
                                                             "data_check_name": "HighlyNullDataCheck",\
                                                             "level": "warning",\
                                                             "code": "HIGHLY_NULL",\
                                                             "details": {"column": "lots_of_null"}}],\
                                               "actions": [{"code": "DROP_COL",\
                                                            "metadata": {"column": "lots_of_null"}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    # Fraction of nulls per column, keyed by column name.
    percent_null = (X.isnull().mean()).to_dict()
    # FIX: removed the dead `highly_null_cols = []` initializer — both
    # branches below rebind the name before it is ever read.
    if self.pct_null_threshold == 0.0:
        # Special case: a threshold of exactly 0 flags any null at all.
        highly_null_cols = {key: value for key, value in percent_null.items() if value > 0.0}
        warning_msg = "Column '{}' is more than 0% null"
        results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name),
                                                     data_check_name=self.name,
                                                     message_code=DataCheckMessageCode.HIGHLY_NULL,
                                                     details={"column": col_name}).to_dict()
                                    for col_name in highly_null_cols])
    else:
        highly_null_cols = {key: value for key, value in percent_null.items()
                            if value >= self.pct_null_threshold}
        warning_msg = "Column '{}' is {}% or more null"
        results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_null_threshold * 100),
                                                     data_check_name=self.name,
                                                     message_code=DataCheckMessageCode.HIGHLY_NULL,
                                                     details={"column": col_name}).to_dict()
                                    for col_name in highly_null_cols])
    # One drop action per flagged column, in the same order as the warnings.
    results["actions"].extend([DataCheckAction(DataCheckActionCode.DROP_COL,
                                               metadata={"column": col_name}).to_dict()
                               for col_name in highly_null_cols])
    return results
def test_default_data_checks_null_rows():
    class SeriesWrap():
        # Equality shim: lets a pd.Series nested in a details dict take part
        # in a plain == comparison of the whole results dict.
        def __init__(self, series):
            self.series = series

        def __eq__(self, series_2):
            return all(self.series.eq(series_2.series))

    X = pd.DataFrame({'all_null': [None] * 5,
                      'also_all_null': [None] * 5})
    y = pd.Series([0, 1, np.nan, 1, 0])
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))

    highly_null_rows = SeriesWrap(pd.Series([1.0] * 5))
    expected = {
        "warnings": [DataCheckWarning(message="5 out of 5 rows are more than 95.0% null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
                                      details={"pct_null_cols": highly_null_rows}).to_dict(),
                     DataCheckWarning(message="Column 'all_null' is 95.0% or more null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                      details={"column": 'all_null', "pct_null_rows": 1.0}).to_dict(),
                     DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                      details={"column": 'also_all_null', "pct_null_rows": 1.0}).to_dict()],
        "errors": [DataCheckError(message="1 row(s) (20.0%) of target values are null",
                                  data_check_name="InvalidTargetDataCheck",
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 1, "pct_null_rows": 20.0}).to_dict(),
                   DataCheckError(message="all_null has 0 unique value.",
                                  data_check_name="NoVarianceDataCheck",
                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                  details={"column": "all_null"}).to_dict(),
                   DataCheckError(message="also_all_null has 0 unique value.",
                                  data_check_name="NoVarianceDataCheck",
                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                  details={"column": "also_all_null"}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS,
                                    metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL,
                                    metadata={"column": 'all_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL,
                                    metadata={"column": 'also_all_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                    metadata={"column": None, "is_target": True,
                                              "impute_strategy": "mean"}).to_dict()],
    }

    validation_results = data_checks.validate(X, y)
    # Wrap the returned Series so the dict equality above can succeed.
    validation_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(
        validation_results['warnings'][0]['details']['pct_null_cols'])
    assert validation_results == expected
def test_highly_null_data_check_warnings():
    data = pd.DataFrame({'lots_of_null': [None, None, None, None, 5],
                         'all_null': [None, None, None, None, None],
                         'no_null': [1, 2, 3, 4, 5]})

    def expected_result(message_fmt, cols):
        # Builds the expected validate() payload for the given columns.
        return {
            "warnings": [DataCheckWarning(message=message_fmt.format(col),
                                          data_check_name=highly_null_data_check_name,
                                          message_code=DataCheckMessageCode.HIGHLY_NULL,
                                          details={"column": col}).to_dict()
                         for col in cols],
            "errors": [],
            "actions": [DataCheckAction(DataCheckActionCode.DROP_COL,
                                        metadata={"column": col}).to_dict()
                        for col in cols],
        }

    # Threshold 0: any null at all is flagged.
    no_null_check = HighlyNullDataCheck(pct_null_threshold=0.0)
    assert no_null_check.validate(data) == expected_result("Column '{}' is more than 0% null",
                                                           ['lots_of_null', 'all_null'])

    # Threshold 0.5: both partially- and fully-null columns are flagged.
    some_null_check = HighlyNullDataCheck(pct_null_threshold=0.5)
    assert some_null_check.validate(data) == expected_result("Column '{}' is 50.0% or more null",
                                                             ['lots_of_null', 'all_null'])

    # Threshold 1.0: only the fully-null column is flagged.
    all_null_check = HighlyNullDataCheck(pct_null_threshold=1.0)
    assert all_null_check.validate(data) == expected_result("Column '{}' is 100.0% or more null",
                                                            ['all_null'])
def validate(self, X, y):
    # Stub check: never warns or errors, always recommends dropping
    # 'col_to_drop'. Input data is ignored.
    drop_action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                  metadata={"column": 'col_to_drop'}).to_dict()
    return {"warnings": [], "errors": [], "actions": [drop_action]}
def validate(self, X, y=None):
    """Calculates what percentage of each column's unique values exceed the count threshold and compare
    that percentage to the sparsity threshold stored in the class instance.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

    Returns:
        dict: dict with a DataCheckWarning if there are any sparse columns.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'sparse': [float(x) for x in range(100)],
        ...    'not_sparse': [float(1) for x in range(100)]
        ... })
        >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=0.5, unique_count_threshold=10)
        >>> assert sparsity_check.validate(df) == {"errors": [],\
                                                   "warnings": [{"message": "Input columns (sparse) for multiclass problem type are too sparse.",\
                                                                 "data_check_name": "SparsityDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "TOO_SPARSE",\
                                                                 "details": {"column": "sparse", 'sparsity_score': 0.0}}],\
                                                   "actions": [{"code": "DROP_COL",\
                                                                "metadata": {"column": "sparse"}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())

    # Per-column sparsity score; columns scoring below the threshold are flagged.
    res = X.apply(SparsityDataCheck.sparsity_score,
                  count_threshold=self.unique_count_threshold)
    # FIX: replaced the identity comprehension `[col for col in ...]` (ruff
    # C416) with a direct list() conversion — same result, clearer intent.
    too_sparse_cols = list(res.index[res < self.threshold])
    # `warning_too_unique` is a module-level message template defined elsewhere
    # in this file.
    results["warnings"].extend([
        DataCheckWarning(message=warning_too_unique.format(col_name, self.problem_type),
                         data_check_name=self.name,
                         message_code=DataCheckMessageCode.TOO_SPARSE,
                         details={"column": col_name,
                                  "sparsity_score": res.loc[col_name]}).to_dict()
        for col_name in too_sparse_cols
    ])
    results["actions"].extend([
        DataCheckAction(action_code=DataCheckActionCode.DROP_COL,
                        metadata={"column": col_name}).to_dict()
        for col_name in too_sparse_cols
    ])
    return results
def test_invalid_target_data_input_formats():
    invalid_targets_check = InvalidTargetDataCheck("binary",
                                                   get_default_primary_search_objective("binary"))

    # Empty target -> single "empty or fully null" error, no actions.
    X = pd.DataFrame()
    messages = invalid_targets_check.validate(X, pd.Series())
    assert messages == {
        "warnings": [],
        "errors": [DataCheckError(message="Target is either empty or fully null.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                                  details={}).to_dict()],
        "actions": [],
    }

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="3 row(s) (75.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 3, "pct_null_rows": 75}).to_dict(),
                   DataCheckError(message="Binary class targets require exactly two unique values.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                  details={"target_values": [0]}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                    metadata={"column": None, "is_target": True,
                                              "impute_strategy": "most_frequent"}).to_dict()],
    }

    # pandas Series input (the original comment called this "Woodwork", but a
    # plain pd.Series is what is actually passed here).
    y = pd.Series([None, None, None, 0])
    X = pd.DataFrame({"col": range(len(y))})
    assert invalid_targets_check.validate(X, y) == expected

    # Plain list input.
    y = [None, None, None, 0]
    X = pd.DataFrame({"col": range(len(y))})
    assert invalid_targets_check.validate(X, y) == expected

    # numpy array input.
    y = np.array([None, None, None, 0])
    X = pd.DataFrame({"col": range(len(y))})
    assert invalid_targets_check.validate(X, y) == expected
def test_id_cols_data_check_input_formats():
    """IDColumnsDataCheck must give identical results for Woodwork, list, and numpy inputs."""
    id_cols_check = IDColumnsDataCheck(id_threshold=0.8)

    # Empty frame -> no messages.
    assert id_cols_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}

    # FIX: one of the three (identical) expected dicts passed the warning
    # message positionally while every other DataCheckWarning in this file
    # uses the `message=` keyword; normalized to the keyword form and the
    # three copies collapsed into one shared dict.
    expected = {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 1}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 0}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 1}).to_dict()],
    }
    rows = [[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]

    # Woodwork table input.
    assert id_cols_check.validate(ww.DataTable(np.array(rows))) == expected
    # Plain 2D list input.
    assert id_cols_check.validate(rows) == expected
    # numpy array input.
    assert id_cols_check.validate(np.array(rows)) == expected
def test_id_columns_warning():
    X = pd.DataFrame.from_dict({'col_1_id': [0, 1, 2, 3],
                                'col_2': [2, 3, 4, 5],
                                'col_3_id': [1, 1, 2, 3],
                                'Id': [3, 1, 2, 0],
                                'col_5': [0, 0, 1, 2],
                                'col_6': [0.1, 0.2, 0.3, 0.4]})

    def expected_for(cols, pct):
        # Expected validate() payload for the given flagged columns and
        # threshold percentage string.
        return {
            "warnings": [DataCheckWarning(message="Column '{}' is {}% or more likely to be an ID column".format(col, pct),
                                          data_check_name=id_data_check_name,
                                          message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                          details={"column": col}).to_dict()
                         for col in cols],
            "errors": [],
            "actions": [DataCheckAction(DataCheckActionCode.DROP_COL,
                                        details={"column": col}).to_dict()
                        for col in cols],
        }

    # At 0.95, the named id columns plus the strictly-increasing col_2 are flagged.
    id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
    assert id_cols_check.validate(X) == expected_for(["Id", "col_1_id", "col_2", "col_3_id"], "95.0")

    # At 1.0, only the exact-match id columns remain.
    id_cols_check = IDColumnsDataCheck(id_threshold=1.0)
    assert id_cols_check.validate(X) == expected_for(["Id", "col_1_id"], "100.0")
def test_id_columns_strings():
    """IDColumnsDataCheck flags string-valued columns that look like IDs.

    Bug fix: ``DataCheckAction`` takes its payload via the ``metadata`` keyword
    (as every other data-check test in this file does, and as the ``to_dict()``
    output's ``'metadata'`` key shows); the previous ``details=`` keyword would
    raise a TypeError when the action was constructed.
    """
    X = pd.DataFrame.from_dict({
        'col_1_id': ["a", "b", "c", "d"],
        'col_2': ["w", "x", "y", "z"],
        'col_3_id': ["123456789012345", "234567890123456", "3456789012345678", "45678901234567"],
        'Id': ["z", "y", "x", "a"],
        'col_5': ["0", "0", "1", "2"],
        'col_6': [0.1, 0.2, 0.3, 0.4],
    })

    def expected_results(threshold_pct, columns):
        # Build the expected warning and DROP_COL action payloads, one pair per
        # flagged column, in the order the check reports them.
        return {
            "warnings": [
                DataCheckWarning(
                    message="Column '{}' is {}% or more likely to be an ID column".format(col, threshold_pct),
                    data_check_name=id_data_check_name,
                    message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                    details={"column": col}).to_dict()
                for col in columns
            ],
            "errors": [],
            # Fixed keyword: metadata=, not details=
            "actions": [
                DataCheckAction(DataCheckActionCode.DROP_COL,
                                metadata={"column": col}).to_dict()
                for col in columns
            ],
        }

    # At the 0.95 threshold, all four ID-like columns are flagged.
    id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
    assert id_cols_check.validate(X) == expected_results(
        "95.0", ["Id", "col_1_id", "col_2", "col_3_id"])

    # At the strict 1.0 threshold, only the strongest candidates remain.
    id_cols_check = IDColumnsDataCheck(id_threshold=1.0)
    assert id_cols_check.validate(X) == expected_results(
        "100.0", ["Id", "col_1_id"])
def validate(self, X, y):
    """Check if the target or any of the features have no variance (1 unique value).

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
        y (ww.DataColumn, pd.Series, np.ndarray): The target data.

    Returns:
        dict: dict of warnings/errors corresponding to features or target with no variance.
    """
    results = {"warnings": [], "errors": [], "actions": []}
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = infer_feature_types(y)
    y = _convert_woodwork_types_wrapper(y.to_series())
    # dropna is configurable so NaN can optionally count as its own unique value.
    unique_counts = X.nunique(dropna=self._dropnan).to_dict()
    any_nulls = (X.isnull().any()).to_dict()
    for col_name in unique_counts:
        message = self._check_for_errors(col_name, unique_counts[col_name],
                                         any_nulls[col_name])
        if not message:
            continue
        DataCheck._add_message(message, results)
        # BUG FIX: DataCheckAction's keyword is `metadata`, not `details`
        # (every other check in this module constructs actions with metadata=,
        # and to_dict() exposes a 'metadata' key).
        results["actions"].append(
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": col_name}).to_dict())
    # Fall back to the label "Y" when the target series is unnamed; use a
    # default on getattr so an object without a `name` attribute doesn't raise.
    y_name = getattr(y, "name", None)
    if not y_name:
        y_name = "Y"
    target_message = self._check_for_errors(y_name,
                                            y.nunique(dropna=self._dropnan),
                                            y.isnull().any())
    if target_message:
        DataCheck._add_message(target_message, results)
    return results
"column": "feature" }).to_dict() labels_0_unique = DataCheckError(message="Y has 0 unique value.", data_check_name=no_variance_data_check_name, message_code=DataCheckMessageCode.NO_VARIANCE, details={ "column": "Y" }).to_dict() labels_1_unique = DataCheckError(message="Y has 1 unique value.", data_check_name=no_variance_data_check_name, message_code=DataCheckMessageCode.NO_VARIANCE, details={ "column": "Y" }).to_dict() drop_feature_action = DataCheckAction(DataCheckActionCode.DROP_COL, details={ "column": "feature" }).to_dict() cases = [ (all_distinct_X, all_distinct_y, True, { "warnings": [], "errors": [], "actions": [] }), ([[1], [2], [3], [4]], [1, 2, 3, 2], False, { "warnings": [], "errors": [], "actions": [] }), (np.arange(12).reshape(4, 3), [1, 2, 3], True, { "warnings": [],
def validate(self, X, y):
    """Checks if the target data contains missing or invalid values.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
        y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

    Returns:
        dict (DataCheckError): List with DataCheckErrors if any invalid values are found in the target data.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
        >>> y = pd.Series([0, 1, None, None])
        >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
        >>> assert target_check.validate(X, y) == {"errors": [{"message": "2 row(s) (50.0%) of target values are null",\
                                                               "data_check_name": "InvalidTargetDataCheck",\
                                                               "level": "error",\
                                                               "code": "TARGET_HAS_NULL",\
                                                               "details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
                                                   "warnings": [],\
                                                   "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}
    # A missing target is unrecoverable: report it and stop all further checks.
    if y is None:
        results["errors"].append(
            DataCheckError(
                message="Target is None",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_IS_NONE,
                details={}).to_dict())
        return results
    y = infer_feature_types(y)
    # Accept numeric/boolean logical types plus Categorical; anything else is
    # reported, but the remaining checks still run on the converted series.
    is_supported_type = y.logical_type in numeric_and_boolean_ww + [
        ww.logical_types.Categorical
    ]
    if not is_supported_type:
        results["errors"].append(
            DataCheckError(
                message=
                "Target is unsupported {} type. Valid Woodwork logical types include: {}"
                .format(
                    y.logical_type, ", ".join([
                        ltype.type_string for ltype in numeric_and_boolean_ww
                    ])),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                details={
                    "unsupported_type": y.logical_type.type_string
                }).to_dict())
    y_df = _convert_woodwork_types_wrapper(y.to_series())
    null_rows = y_df.isnull()
    # Fully-null target: nothing left to validate, so return early.
    if null_rows.all():
        results["errors"].append(
            DataCheckError(message="Target is either empty or fully null.",
                           data_check_name=self.name,
                           message_code=DataCheckMessageCode.
                           TARGET_IS_EMPTY_OR_FULLY_NULL,
                           details={}).to_dict())
        return results
    elif null_rows.any():
        # Partially-null target: report counts and suggest an imputation action
        # (mean for regression, most_frequent otherwise).
        num_null_rows = null_rows.sum()
        pct_null_rows = null_rows.mean() * 100
        results["errors"].append(
            DataCheckError(
                message="{} row(s) ({}%) of target values are null".format(
                    num_null_rows, pct_null_rows),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                details={
                    "num_null_rows": num_null_rows,
                    "pct_null_rows": pct_null_rows
                }).to_dict())
        impute_strategy = "mean" if is_regression(
            self.problem_type) else "most_frequent"
        results["actions"].append(
            DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                            metadata={
                                "column": None,
                                "is_target": True,
                                "impute_strategy": impute_strategy
                            }).to_dict())
    value_counts = y_df.value_counts()
    unique_values = value_counts.index.tolist()
    # Binary problems must have exactly two classes; optionally truncate the
    # offending value list to n_unique entries in the error details.
    if is_binary(self.problem_type) and len(value_counts) != 2:
        if self.n_unique is None:
            details = {"target_values": unique_values}
        else:
            details = {
                "target_values":
                unique_values[:min(self.n_unique, len(unique_values))]
            }
        results["errors"].append(
            DataCheckError(
                message=
                "Binary class targets require exactly two unique values.",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.
                TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                details=details).to_dict())
    # Regression targets must carry the "numeric" semantic tag.
    if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.semantic_tags:
        results["errors"].append(
            DataCheckError(
                message=
                "Target data type should be numeric for regression type problems.",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                details={}).to_dict())
    if is_multiclass(self.problem_type):
        # Every class needs at least two examples (e.g. for stratified splits).
        if value_counts.min() <= 1:
            least_populated = value_counts[value_counts <= 1]
            details = {
                "least_populated_class_labels":
                least_populated.index.tolist()
            }
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target does not have at least two instances per class which is required for multiclass classification",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                    details=details).to_dict())
        # Two or fewer classes means the problem should likely be binary.
        if len(unique_values) <= 2:
            details = {"num_classes": len(unique_values)}
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                    details=details).to_dict())
        # A very high unique-class-to-row ratio suggests a regression target.
        num_class_to_num_value_ratio = len(unique_values) / len(y)
        if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold:
            details = {
                "class_to_value_ratio": num_class_to_num_value_ratio
            }
            results["warnings"].append(
                DataCheckWarning(
                    message=
                    "Target has a large number of unique values, could be regression type problem.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                    details=details).to_dict())
    # Non-positive values are only checked for Integer/Double targets; for
    # other logical types any_neg stays None (falsy) and the check is skipped.
    any_neg = not (y_df > 0).all() if y.logical_type in [
        ww.logical_types.Integer, ww.logical_types.Double
    ] else None
    if any_neg and self.objective.positive_only:
        details = {
            "Count of offending values":
            sum(val <= 0 for val in y_df.values.flatten())
        }
        results["errors"].append(
            DataCheckError(
                message=
                f"Target has non-positive values which is not supported for {self.objective.name}",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.
                TARGET_INCOMPATIBLE_OBJECTIVE,
                details=details).to_dict())
    # Cross-check X and y alignment: lengths, then index order, then index sets
    # (mismatched index samples are capped at 10 entries each in the details).
    if X is not None:
        X = infer_feature_types(X)
        X_index = list(X.to_dataframe().index)
        y_index = list(y_df.index)
        X_length = len(X_index)
        y_length = len(y_index)
        if X_length != y_length:
            results["warnings"].append(
                DataCheckWarning(
                    message=
                    "Input target and features have different lengths",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.MISMATCHED_LENGTHS,
                    details={
                        "features_length": X_length,
                        "target_length": y_length
                    }).to_dict())
        if X_index != y_index:
            if set(X_index) == set(y_index):
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Input target and features have mismatched indices order",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        MISMATCHED_INDICES_ORDER,
                        details={}).to_dict())
            else:
                index_diff_not_in_X = list(set(y_index) - set(X_index))[:10]
                index_diff_not_in_y = list(set(X_index) - set(y_index))[:10]
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Input target and features have mismatched indices",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                        details={
                            "indices_not_in_features": index_diff_not_in_X,
                            "indices_not_in_target": index_diff_not_in_y
                        }).to_dict())
    return results
def validate(self, X, y=None):
    """Warn about columns whose uniqueness is wrong for the problem type.

    Regression problems flag columns that are *not unique enough*; multiclass
    problems flag columns that are *too unique*. Every flagged column also
    gets a DROP_COL action.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored.  Defaults to None.

    Returns:
        dict: dict with a DataCheckWarning if there are any too unique or not unique enough columns.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'regression_unique_enough': [float(x) for x in range(100)],
        ...    'regression_not_unique_enough': [float(1) for x in range(100)]
        ... })
        >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
        >>> assert uniqueness_check.validate(df) == {"errors": [],\
               "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",\
                             "data_check_name": "UniquenessDataCheck",\
                             "level": "warning",\
                             "code": "NOT_UNIQUE_ENOUGH",\
                             "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}],\
               "actions": [{"code": "DROP_COL",\
                            "metadata": {"column": "regression_not_unique_enough"}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}
    frame = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())
    scores = frame.apply(UniquenessDataCheck.uniqueness_score)

    # Select the comparison direction, message template, and message code
    # according to the problem type; other problem types get empty results.
    if is_regression(self.problem_type):
        flagged = list(scores.index[scores < self.threshold])
        template = warning_not_unique_enough
        code = DataCheckMessageCode.NOT_UNIQUE_ENOUGH
    elif is_multiclass(self.problem_type):
        flagged = list(scores.index[scores > self.threshold])
        template = warning_too_unique
        code = DataCheckMessageCode.TOO_UNIQUE
    else:
        return results

    for column in flagged:
        results["warnings"].append(
            DataCheckWarning(message=template.format(column, self.problem_type),
                             data_check_name=self.name,
                             message_code=code,
                             details={
                                 "column": column,
                                 "uniqueness_score": scores.loc[column]
                             }).to_dict())
        results["actions"].append(
            DataCheckAction(action_code=DataCheckActionCode.DROP_COL,
                            metadata={"column": column}).to_dict())
    return results
def test_target_leakage_data_check_input_formats_pearson():
    """Pearson leakage check accepts DataFrame, np.array, and Woodwork inputs."""
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8,
                                           method='pearson')

    # Empty inputs yield no messages at all.
    assert leakage_check.validate(pd.DataFrame(), pd.Series()) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }

    y = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    X["a"] = y * 3
    X["b"] = y - 1
    X["c"] = y / 10
    X["d"] = ~y
    X["e"] = [0, 0, 0, 0]
    y = y.astype(bool)

    def expected_for(columns):
        # One warning + one DROP_COL action per correlated column, in order.
        return {
            "warnings": [
                DataCheckWarning(
                    message="Column '{}' is 80.0% or more correlated with the target".format(col),
                    data_check_name=target_leakage_data_check_name,
                    message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                    details={"column": col}).to_dict() for col in columns
            ],
            "errors": [],
            "actions": [
                DataCheckAction(DataCheckActionCode.DROP_COL,
                                metadata={"column": col}).to_dict()
                for col in columns
            ]
        }

    expected = expected_for(["a", "b", "c", "d"])

    # test X as np.array: column labels become positional integers
    assert leakage_check.validate(X.values, y) == expected_for([0, 1, 2, 3])

    # test X as ww.DataTable, y as ww.DataColumn
    assert leakage_check.validate(ww.DataTable(X), ww.DataColumn(y)) == expected

    # test y as np.array
    assert leakage_check.validate(X, y.values) == expected
def test_target_leakage_data_check_warnings_pearson():
    """Pearson leakage check flags correlated columns at the 0.5 threshold."""
    y = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    X["a"] = y * 3
    X["b"] = y - 1
    X["c"] = y / 10
    X["d"] = ~y
    X["e"] = [0, 0, 0, 0]
    y = y.astype(bool)

    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5,
                                           method='pearson')
    flagged = ["a", "b", "c", "d"]
    assert leakage_check.validate(X, y) == {
        "warnings": [
            DataCheckWarning(
                message="Column '{}' is 50.0% or more correlated with the target".format(col),
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": col}).to_dict() for col in flagged
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": col}).to_dict()
            for col in flagged
        ]
    }

    # A string-valued target produces no pearson warnings at all.
    y = ["a", "b", "a", "a"]
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5,
                                           method='pearson')
    assert leakage_check.validate(X, y) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
def test_target_leakage_regression():
    """Default-method leakage check works on a regression target and mixed dtypes."""
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    # Empty inputs yield no messages at all.
    assert leakage_check.validate(pd.DataFrame(), pd.Series()) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }

    y = pd.Series([
        0.4, 0.1, 2.3, 4.3, 2.2, 1.8, 3.7, 3.6, 2.4, 0.9, 3.1, 2.8, 4.1, 1.6,
        1.2
    ])
    X = pd.DataFrame()
    X["a"] = y * 3
    X["b"] = y - 1
    X["c"] = y / 10
    X["d"] = [0] * 15
    X["e"] = list("abcdefghijklmno")

    # Columns derived from y (a, b, c) plus the all-distinct string column e
    # are flagged; the constant column d is not.
    flagged = ["a", "b", "c", "e"]
    expected = {
        "warnings": [
            DataCheckWarning(
                message="Column '{}' is 80.0% or more correlated with the target".format(col),
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": col}).to_dict() for col in flagged
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": col}).to_dict()
            for col in flagged
        ]
    }

    # test X as ww.DataTable, y as ww.DataColumn
    assert leakage_check.validate(ww.DataTable(X), ww.DataColumn(y)) == expected

    # test y as np.array
    assert leakage_check.validate(X, y.values) == expected
def test_target_leakage_types():
    """Leakage check handles categorical, numeric, datetime, and boolean feature columns."""
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    y = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    X["a"] = ["a", "b", "a", "a"]
    X["b"] = y - 1
    X["c"] = [
        datetime.strptime(year, "%Y")
        for year in ("2015", "2016", "2015", "2015")
    ]
    X["d"] = ~y
    X["e"] = [0, 0, 0, 0]
    y = y.astype(bool)

    # All columns that vary with y are flagged; the constant column e is not.
    flagged = ["a", "b", "c", "d"]
    expected = {
        "warnings": [
            DataCheckWarning(
                message="Column '{}' is 80.0% or more correlated with the target".format(col),
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": col}).to_dict() for col in flagged
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": col}).to_dict()
            for col in flagged
        ]
    }

    assert leakage_check.validate(X, y) == expected