def test_default_data_checks_regression(input_type):
    """Validate DefaultDataChecks output for a regression problem.

    The frame mixes sparse, fully-null, id-like and leaky columns. The checks
    are run against a target containing one NaN and against a constant
    target, then the first case is repeated through a ``DataChecks`` instance
    assembled from the default check classes. Expected messages are sliced
    from ``messages``, which is defined outside this function.
    """
    features = pd.DataFrame({
        'lots_of_null': [None] * 4 + ["some data"],
        'all_null': [None] * 5,
        'also_all_null': [None] * 5,
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
    })
    target = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    constant_target = pd.Series([5] * 5)
    if input_type == "ww":
        features = ww.DataTable(features)
        target = ww.DataColumn(target)
        constant_target = ww.DataColumn(constant_target)

    lots_of_null_leakage = DataCheckWarning(
        message="Column 'lots_of_null' is 95.0% or more correlated with the target",
        data_check_name="TargetLeakageDataCheck",
        message_code=DataCheckMessageCode.TARGET_LEAKAGE,
        details={"column": "lots_of_null"},
    ).to_dict()

    default_checks = DefaultDataChecks(
        "regression", get_default_primary_search_objective("regression")
    )
    outcome = default_checks.validate(features, target)
    assert outcome == {"warnings": messages[:3], "errors": messages[3:]}

    # Skip Invalid Target
    outcome = default_checks.validate(features, constant_target)
    target_no_variance = DataCheckError(
        message="Y has 1 unique value.",
        data_check_name="NoVarianceDataCheck",
        message_code=DataCheckMessageCode.NO_VARIANCE,
        details={"column": "Y"},
    ).to_dict()
    assert outcome == {
        "warnings": messages[:3] + [lots_of_null_leakage],
        "errors": messages[4:] + [target_no_variance],
    }

    # Same checks, constructed explicitly from the default check classes.
    invalid_target_params = {
        "problem_type": "regression",
        "objective": get_default_primary_search_objective("regression"),
    }
    explicit_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
        {"InvalidTargetDataCheck": invalid_target_params},
    )
    outcome = explicit_checks.validate(features, target)
    assert outcome == {"warnings": messages[:3], "errors": messages[3:]}
def test_default_data_checks_regression(input_type):
    """Validate DefaultDataChecks output (messages and actions) for regression.

    The frame contains sparse, fully-null, id-like, leaky, natural-language
    and datetime columns; the first entry of the datetime column is nulled
    out. The checks are run against a target with one NaN and against a
    constant target, then the first case is repeated through a ``DataChecks``
    instance assembled from the default check classes. ``messages`` and
    ``expected_actions`` are defined outside this function.
    """
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
        'natural_language_nan': [
            None,
            "string_that_is_long_enough_for_natural_language_1",
            "string_that_is_long_enough_for_natural_language_2",
            "string_that_is_long_enough_for_natural_language_3",
            "string_that_is_long_enough_for_natural_language_4",
        ],
        'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5)),
    })
    # Assign through .loc: chained indexing (X[col][0] = ...) is not
    # guaranteed to write through to the frame and never does under pandas
    # Copy-on-Write.
    X.loc[0, 'nan_dt_col'] = None
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)

    null_leakage = [
        DataCheckWarning(
            message="Column 'lots_of_null' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={"column": "lots_of_null"},
        ).to_dict()
    ]
    data_checks = DefaultDataChecks(
        "regression", get_default_primary_search_objective("regression")
    )
    id_leakage_warning = [
        DataCheckWarning(
            message="Column 'id' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={"column": "id"},
        ).to_dict()
    ]
    nan_dt_leakage_warning = [
        DataCheckWarning(
            message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={"column": "nan_dt_col"},
        ).to_dict()
    ]
    impute_action = DataCheckAction(
        DataCheckActionCode.IMPUTE_COL,
        metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'},
    ).to_dict()
    nan_dt_action = DataCheckAction(
        DataCheckActionCode.DROP_COL, metadata={"column": 'nan_dt_col'}
    ).to_dict()
    # The entry at index 3 of the shared actions is replaced by the
    # drop/impute pair expected for this data set.
    expected_actions_with_drop_and_impute = (
        expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:]
    )

    assert data_checks.validate(X, y) == {
        "warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
        "errors": messages[3:],
        "actions": expected_actions_with_drop_and_impute,
    }

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [
            DataCheckError(
                message="Y has 1 unique value.",
                data_check_name="NoVarianceDataCheck",
                message_code=DataCheckMessageCode.NO_VARIANCE,
                details={"column": "Y"},
            ).to_dict()
        ] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:],
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
        {
            "InvalidTargetDataCheck": {
                "problem_type": "regression",
                "objective": get_default_primary_search_objective("regression"),
            }
        },
    )
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
        "errors": messages[3:],
        "actions": expected_actions_with_drop_and_impute,
    }
def test_default_data_checks_time_series_regression():
    """Default checks for time series regression match those for regression.

    Compares the ordered list of check classes produced by DefaultDataChecks
    for the two problem types.
    """

    def _default_check_classes(problem_type):
        # Classes of the checks DefaultDataChecks instantiates, in order.
        checks = DefaultDataChecks(
            problem_type, get_default_primary_search_objective(problem_type)
        )
        return [check.__class__ for check in checks.data_checks]

    regression_classes = _default_check_classes("regression")
    ts_regression_classes = _default_check_classes("time series regression")
    assert regression_classes == ts_regression_classes
def test_invalid_target_data_check_numeric_binary_classification_error():
    """Exercise the binary check on numeric targets that are not {0, 1}.

    Three targets are validated: two classes with values [1, 5]; values
    [0, 5] alongside two nulls; and three distinct classes.
    """
    check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary")
    )

    # Two classes, but not the values 0 and 1.
    target = pd.Series([1, 5, 1, 5, 1, 1])
    features = pd.DataFrame({"col": range(len(target))})
    values_warning = DataCheckWarning(
        message="Numerical binary classification target classes must be [0, 1], got [1, 5] instead",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_BINARY_INVALID_VALUES,
        details={"target_values": [1, 5]},
    ).to_dict()
    assert check.validate(features, target) == {
        "warnings": [values_warning],
        "errors": [],
    }

    # Invalid values plus null entries.
    target = pd.Series([0, 5, np.nan, np.nan])
    features = pd.DataFrame({"col": range(len(target))})
    values_warning = DataCheckWarning(
        message="Numerical binary classification target classes must be [0, 1], got [5.0, 0.0] instead",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_BINARY_INVALID_VALUES,
        details={"target_values": [5.0, 0.0]},
    ).to_dict()
    null_error = DataCheckError(
        message="2 row(s) (50.0%) of target values are null",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_HAS_NULL,
        details={"num_null_rows": 2, "pct_null_rows": 50},
    ).to_dict()
    assert check.validate(features, target) == {
        "warnings": [values_warning],
        "errors": [null_error],
    }

    # Three distinct classes.
    target = pd.Series([0, 1, 1, 0, 1, 2])
    features = pd.DataFrame({"col": range(len(target))})
    not_two_unique_error = DataCheckError(
        message="Binary class targets require exactly two unique values.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
        details={"target_values": [1, 0, 2]},
    ).to_dict()
    assert check.validate(features, target) == {
        "warnings": [],
        "errors": [not_two_unique_error],
    }
def test_invalid_target_data_check_multiclass_problem_almostcontinuous_data():
    """Multiclass targets with a high class-to-sample ratio produce a warning.

    Ratios of 1/3 and .05 are expected to warn; a ratio of .01 is expected
    to produce no messages.
    """
    invalid_targets_check = InvalidTargetDataCheck(
        "multiclass", get_default_primary_search_objective("multiclass")
    )

    # 100 classes, 300 samples, .33 class/sample ratio
    y_multiclass_high_classes = pd.Series(list(range(0, 100)) * 3)
    X = pd.DataFrame({"col": range(len(y_multiclass_high_classes))})
    high_unique_warning = DataCheckWarning(
        message="Target has a large number of unique values, could be regression type problem.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
        details={"class_to_value_ratio": 1 / 3},
    ).to_dict()
    assert invalid_targets_check.validate(X, y=y_multiclass_high_classes) == {
        "warnings": [high_unique_warning],
        "errors": [],
    }

    # 5 classes, 100 samples, .05 class/sample ratio
    y_multiclass_med_classes = pd.Series(list(range(0, 5)) * 20)
    X = pd.DataFrame({"col": range(len(y_multiclass_med_classes))})
    high_unique_warning = DataCheckWarning(
        message="Target has a large number of unique values, could be regression type problem.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
        details={"class_to_value_ratio": .05},
    ).to_dict()
    assert invalid_targets_check.validate(X, y=y_multiclass_med_classes) == {
        "warnings": [high_unique_warning],
        "errors": [],
    }

    # 3 classes, 300 samples, .01 class/sample ratio
    y_multiclass_low_classes = pd.Series(list(range(0, 3)) * 100)
    X = pd.DataFrame({"col": range(len(y_multiclass_low_classes))})
    assert invalid_targets_check.validate(X, y=y_multiclass_low_classes) == {
        "warnings": [],
        "errors": [],
    }
def test_invalid_target_data_check_invalid_pandas_data_types_error(pd_type):
    """Binary check on a target cast to ``pd_type`` and on a datetime target.

    The 0/1 target cast to ``pd_type`` is expected to yield no messages. A
    weekly datetime target is expected to yield an unsupported-type error
    followed by a not-two-unique-values error.
    """
    target = pd.Series([0, 1, 0, 0, 1, 0, 1, 0]).astype(pd_type)
    features = pd.DataFrame({"col": range(len(target))})
    check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary")
    )
    assert check.validate(features, target) == {
        "warnings": [],
        "errors": [],
        "actions": [],
    }

    target = pd.Series(pd.date_range('2000-02-03', periods=5, freq='W'))
    features = pd.DataFrame({"col": range(len(target))})
    unique_values = target.value_counts().index.tolist()

    valid_type_names = ", ".join(
        [ltype.type_string for ltype in numeric_and_boolean_ww]
    )
    unsupported_type_error = DataCheckError(
        message="Target is unsupported {} type. Valid Woodwork logical types include: {}".format(
            "Datetime", valid_type_names
        ),
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
        details={"unsupported_type": "datetime"},
    ).to_dict()
    not_two_unique_error = DataCheckError(
        message="Binary class targets require exactly two unique values.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
        details={"target_values": unique_values},
    ).to_dict()
    assert check.validate(features, target) == {
        "warnings": [],
        "errors": [unsupported_type_error, not_two_unique_error],
        "actions": [],
    }
def test_invalid_target_data_check_different_lengths():
    """A target shorter than the features yields length and index warnings."""
    features = pd.DataFrame({"col": [1, 2, 3]})
    short_target = pd.Series([0, 1])
    check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary")
    )
    outcome = check.validate(features, short_target)

    length_warning = DataCheckWarning(
        message="Input target and features have different lengths",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.MISMATCHED_LENGTHS,
        details={
            "features_length": len(features.index),
            "target_length": len(short_target.index),
        },
    ).to_dict()
    # Feature row 2 has no counterpart in the two-element target.
    index_warning = DataCheckWarning(
        message="Input target and features have mismatched indices",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.MISMATCHED_INDICES,
        details={"indices_not_in_features": [], "indices_not_in_target": [2]},
    ).to_dict()

    assert outcome == {
        "warnings": [length_warning, index_warning],
        "errors": [],
        "actions": [],
    }
def test_invalid_target_data_check_multiclass_two_examples_per_class():
    """Multiclass targets need at least two instances of every class.

    Validates one target where a single class has one instance and one where
    two classes have one instance each; both are expected to produce a
    single error listing the under-populated class labels.
    """
    y = pd.Series([0] + [1] * 19 + [2] * 80)
    X = pd.DataFrame({"col": range(len(y))})
    # The objective is the default for the problem type under test.
    invalid_targets_check = InvalidTargetDataCheck(
        "multiclass", get_default_primary_search_objective("multiclass")
    )
    expected_message = "Target does not have at least two instances per class which is required for multiclass classification"

    # with 1 class not having min 2 instances
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [
            DataCheckError(
                message=expected_message,
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                details={"least_populated_class_labels": [0]},
            ).to_dict()
        ],
        "actions": [],
    }

    y = pd.Series([0] + [1] + [2] * 98)
    X = pd.DataFrame({"col": range(len(y))})
    # with 2 classes not having min 2 instances
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [
            DataCheckError(
                message=expected_message,
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                details={"least_populated_class_labels": [0, 1]},
            ).to_dict()
        ],
        "actions": [],
    }
def test_invalid_target_data_check_multiclass_problem_binary_data():
    """A two-class target is rejected by the multiclass check.

    A three-class target is expected to produce no messages; a two-class
    target is expected to produce a not-enough-classes error.
    """
    y_multiclass = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3] * 25)
    y_binary = pd.Series([0, 1, 1, 1, 0, 0] * 25)

    data_check_error = DataCheckError(
        message="Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
        details={"num_classes": len(set(y_binary))},
    ).to_dict()

    invalid_targets_check = InvalidTargetDataCheck(
        "multiclass", get_default_primary_search_objective("multiclass")
    )

    X_multiclass = pd.DataFrame({"col": range(len(y_multiclass))})
    assert invalid_targets_check.validate(X=X_multiclass, y=y_multiclass) == {
        "warnings": [],
        "errors": [],
        "actions": [],
    }

    X_binary = pd.DataFrame({"col": range(len(y_binary))})
    assert invalid_targets_check.validate(X=X_binary, y=y_binary) == {
        "warnings": [],
        "errors": [data_check_error],
        "actions": [],
    }
def test_invalid_target_data_action_for_data_with_null(problem_type):
    """A partially-null target yields a null error and an impute action.

    The expected impute strategy is "mean" for regression problem types and
    "most_frequent" otherwise. Binary and multiclass problem types expect
    additional messages because the non-null values form a single class.
    """
    y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck(
        problem_type, get_default_primary_search_objective(problem_type)
    )
    impute_strategy = "mean" if is_regression(problem_type) else "most_frequent"

    expected = {
        "warnings": [],
        "errors": [
            DataCheckError(
                message="3 row(s) (30.0%) of target values are null",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                details={"num_null_rows": 3, "pct_null_rows": 30.0},
            ).to_dict()
        ],
        "actions": [
            DataCheckAction(
                DataCheckActionCode.IMPUTE_COL,
                metadata={
                    "column": None,
                    "is_target": True,
                    "impute_strategy": impute_strategy,
                },
            ).to_dict()
        ],
    }
    if is_binary(problem_type):
        expected["errors"].append(
            DataCheckError(
                message="Binary class targets require exactly two unique values.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                details={"target_values": [0]},
            ).to_dict()
        )
    elif is_multiclass(problem_type):
        expected["errors"].append(
            DataCheckError(
                message="Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                details={"num_classes": 1},
            ).to_dict()
        )
        expected["warnings"].append(
            DataCheckWarning(
                message="Target has a large number of unique values, could be regression type problem.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                details={"class_to_value_ratio": 0.1},
            ).to_dict()
        )

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected
def test_invalid_target_data_check_nan_error():
    """Regression check: clean target passes, all-NaN target reports nulls."""
    features = pd.DataFrame({"col": [1, 2, 3]})
    check = InvalidTargetDataCheck(
        "regression", get_default_primary_search_objective("regression")
    )

    clean_outcome = check.validate(features, y=pd.Series([1, 2, 3]))
    assert clean_outcome == {"warnings": [], "errors": [], "actions": []}

    nan_outcome = check.validate(features, y=pd.Series([np.nan, np.nan, np.nan]))
    all_null_error = DataCheckError(
        message="3 row(s) (100.0%) of target values are null",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_HAS_NULL,
        details={"num_null_rows": 3, "pct_null_rows": 100},
    ).to_dict()
    assert nan_outcome == {
        "warnings": [],
        "errors": [all_null_error],
        "actions": [],
    }
def test_invalid_target_data_check_invalid_n_unique():
    """Constructing the check with ``n_unique=-1`` raises ValueError."""
    objective = get_default_primary_search_objective("regression")
    expected_pattern = "`n_unique` must be a non-negative integer value."
    with pytest.raises(ValueError, match=expected_pattern):
        InvalidTargetDataCheck("regression", objective, n_unique=-1)
def test_invalid_target_data_check_n_unique(problem_type):
    """Check how many unique target values the binary error reports.

    Covers the default ``n_unique`` (the expected values are truncated to the
    first 100), a target with fewer unique values than that, and
    ``n_unique=None`` (all 150 values expected).
    """

    def _expected_for(target_values):
        # Result containing only the not-two-unique-values error.
        not_two_unique_error = DataCheckError(
            message="Binary class targets require exactly two unique values.",
            data_check_name=invalid_targets_data_check_name,
            message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
            details={"target_values": target_values},
        ).to_dict()
        return {"warnings": [], "errors": [not_two_unique_error], "actions": []}

    # Test default value of n_unique
    target = pd.Series(list(range(100, 200)) + list(range(200)))
    reported_values = target.value_counts().index.tolist()[:100]  # n_unique defaults to 100
    features = pd.DataFrame({"col": range(len(target))})
    check = InvalidTargetDataCheck(
        problem_type, get_default_primary_search_objective(problem_type)
    )
    assert check.validate(features, target) == _expected_for(reported_values)

    # Test number of unique values < n_unique
    target = pd.Series(range(20))
    features = pd.DataFrame({"col": range(len(target))})
    reported_values = target.value_counts().index.tolist()
    assert check.validate(features, target) == _expected_for(reported_values)

    # Test n_unique is None
    check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary"), n_unique=None
    )
    target = pd.Series(range(150))
    features = pd.DataFrame({"col": range(len(target))})
    reported_values = target.value_counts().index.tolist()
    assert check.validate(features, target) == _expected_for(reported_values)
def test_invalid_target_data_check_numeric_binary_does_not_return_warnings():
    """A two-class numeric target with values [1, 5] yields no messages."""
    target = pd.Series([1, 5, 1, 5, 1, 1])
    features = pd.DataFrame({"col": range(len(target))})
    check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary")
    )
    outcome = check.validate(features, target)
    assert outcome == {"warnings": [], "errors": [], "actions": []}
def test_invalid_target_data_check_numeric_binary_classification_valid_float():
    """A float target holding only 0.0 and 1.0 yields no messages."""
    empty_features = pd.DataFrame()
    check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary")
    )
    float_target = pd.Series([0.0, 1.0, 0.0, 1.0])
    outcome = check.validate(empty_features, y=float_target)
    assert outcome == {"warnings": [], "errors": []}
def test_invalid_target_y_none():
    """Passing ``y=None`` yields a single "Target is None" error."""
    check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary")
    )
    outcome = check.validate(pd.DataFrame(), y=None)

    target_is_none_error = DataCheckError(
        message="Target is None",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_IS_NONE,
        details={},
    ).to_dict()
    assert outcome == {
        "warnings": [],
        "errors": [target_is_none_error],
        "actions": [],
    }
def test_invalid_target_data_check_regression_problem_nonnumeric_data(
        problem_type):
    """Regression-type checks reject non-numeric targets.

    Categorical and mixed categorical/numeric targets are expected to produce
    an unsupported-type error; integer, float and mixed int/float targets are
    expected to produce no messages.
    """
    y_categorical = pd.Series(["Peace", "Is", "A", "Lie"] * 100)
    y_mixed_cat_numeric = pd.Series(["Peace", 2, "A", 4] * 100)
    y_integer = pd.Series([1, 2, 3, 4])
    y_float = pd.Series([1.1, 2.2, 3.3, 4.4])
    y_numeric = pd.Series([1, 2.2, 3, 4.4])

    data_check_error = DataCheckError(
        message="Target data type should be numeric for regression type problems.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
        details={},
    ).to_dict()

    invalid_targets_check = InvalidTargetDataCheck(
        problem_type, get_default_primary_search_objective(problem_type)
    )

    # Non-numeric targets: a single unsupported-type error.
    for y in (y_categorical, y_mixed_cat_numeric):
        X = pd.DataFrame({"col": range(len(y))})
        assert invalid_targets_check.validate(X=X, y=y) == {
            "warnings": [],
            "errors": [data_check_error],
            "actions": [],
        }

    # Numeric targets: no messages at all.
    for y in (y_integer, y_float, y_numeric):
        X = pd.DataFrame({"col": range(len(y))})
        assert invalid_targets_check.validate(X=X, y=y) == {
            "warnings": [],
            "errors": [],
            "actions": [],
        }
def test_invalid_target_data_check_nan_error():
    """Regression check: clean target passes, all-NaN target is an error."""
    features = pd.DataFrame({"col": [1, 2, 3]})
    check = InvalidTargetDataCheck(
        "regression", get_default_primary_search_objective("regression")
    )

    clean_outcome = check.validate(features, y=pd.Series([1, 2, 3]))
    assert clean_outcome == {"warnings": [], "errors": [], "actions": []}

    nan_outcome = check.validate(features, y=pd.Series([np.nan, np.nan, np.nan]))
    empty_or_null_error = DataCheckError(
        message="Target is either empty or fully null.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
        details={},
    ).to_dict()
    assert nan_outcome == {
        "warnings": [],
        "errors": [empty_or_null_error],
        "actions": [],
    }
def test_invalid_target_data_action_for_all_null(problem_type):
    """A fully-null target yields one error and no actions."""
    check = InvalidTargetDataCheck(
        problem_type, get_default_primary_search_objective(problem_type)
    )
    all_null_target = pd.Series([None, None, None])
    features = pd.DataFrame({"col": range(len(all_null_target))})

    empty_or_null_error = DataCheckError(
        message="Target is either empty or fully null.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
        details={},
    ).to_dict()
    expected = {
        "warnings": [],
        "errors": [empty_or_null_error],
        "actions": [],
    }

    outcome = check.validate(features, all_null_target)
    assert outcome == expected
def test_invalid_target_data_input_formats():
    """The binary check accepts pd.Series, list and np.array targets.

    An empty Series is expected to yield the empty-or-fully-null error. The
    same mostly-null target supplied as a Series, a list and an array is
    expected to yield identical messages and an impute action.
    """
    check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary")
    )

    # test empty pd.Series
    features = pd.DataFrame()
    outcome = check.validate(features, pd.Series())
    empty_or_null_error = DataCheckError(
        message="Target is either empty or fully null.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
        details={},
    ).to_dict()
    assert outcome == {
        "warnings": [],
        "errors": [empty_or_null_error],
        "actions": [],
    }

    null_error = DataCheckError(
        message="3 row(s) (75.0%) of target values are null",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_HAS_NULL,
        details={"num_null_rows": 3, "pct_null_rows": 75},
    ).to_dict()
    not_two_unique_error = DataCheckError(
        message="Binary class targets require exactly two unique values.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
        details={"target_values": [0]},
    ).to_dict()
    impute_action = DataCheckAction(
        DataCheckActionCode.IMPUTE_COL,
        metadata={
            "column": None,
            "is_target": True,
            "impute_strategy": "most_frequent",
        },
    ).to_dict()
    expected = {
        "warnings": [],
        "errors": [null_error, not_two_unique_error],
        "actions": [impute_action],
    }

    # test pd.Series containing nulls
    target = pd.Series([None, None, None, 0])
    features = pd.DataFrame({"col": range(len(target))})
    outcome = check.validate(features, target)
    assert outcome == expected

    # test list
    target = [None, None, None, 0]
    features = pd.DataFrame({"col": range(len(target))})
    outcome = check.validate(features, target)
    assert outcome == expected

    # test np.array
    target = np.array([None, None, None, 0])
    features = pd.DataFrame({"col": range(len(target))})
    outcome = check.validate(features, target)
    assert outcome == expected
def test_default_data_checks_null_rows():
    """Default regression checks on a frame whose columns are entirely null.

    The highly-null-rows warning carries a pandas Series in its details, so
    that value is wrapped on both sides of the comparison before the result
    dicts are compared.
    """

    class SeriesWrap:
        """Wrap a Series so ``==`` collapses to one element-wise-equal bool."""

        def __init__(self, series):
            self.series = series

        def __eq__(self, other):
            return all(self.series.eq(other.series))

    features = pd.DataFrame({
        'all_null': [None] * 5,
        'also_all_null': [None] * 5,
    })
    target = pd.Series([0, 1, np.nan, 1, 0])
    checks = DefaultDataChecks(
        "regression", get_default_primary_search_objective("regression")
    )
    row_null_fractions = SeriesWrap(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0]))

    expected_warnings = [
        DataCheckWarning(
            message="5 out of 5 rows are more than 95.0% null",
            data_check_name="HighlyNullDataCheck",
            message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
            details={"pct_null_cols": row_null_fractions},
        ).to_dict(),
        DataCheckWarning(
            message="Column 'all_null' is 95.0% or more null",
            data_check_name="HighlyNullDataCheck",
            message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
            details={"column": 'all_null', "pct_null_rows": 1.0},
        ).to_dict(),
        DataCheckWarning(
            message="Column 'also_all_null' is 95.0% or more null",
            data_check_name="HighlyNullDataCheck",
            message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
            details={"column": 'also_all_null', "pct_null_rows": 1.0},
        ).to_dict(),
    ]
    expected_errors = [
        DataCheckError(
            message="1 row(s) (20.0%) of target values are null",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_HAS_NULL,
            details={"num_null_rows": 1, "pct_null_rows": 20.0},
        ).to_dict(),
        DataCheckError(
            message="all_null has 0 unique value.",
            data_check_name="NoVarianceDataCheck",
            message_code=DataCheckMessageCode.NO_VARIANCE,
            details={"column": "all_null"},
        ).to_dict(),
        DataCheckError(
            message="also_all_null has 0 unique value.",
            data_check_name="NoVarianceDataCheck",
            message_code=DataCheckMessageCode.NO_VARIANCE,
            details={"column": "also_all_null"},
        ).to_dict(),
    ]
    expected_actions_for_null_rows = [
        DataCheckAction(
            DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]}
        ).to_dict(),
        DataCheckAction(
            DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}
        ).to_dict(),
        DataCheckAction(
            DataCheckActionCode.DROP_COL, metadata={"column": 'also_all_null'}
        ).to_dict(),
        DataCheckAction(
            DataCheckActionCode.IMPUTE_COL,
            metadata={"column": None, "is_target": True, "impute_strategy": "mean"},
        ).to_dict(),
    ]
    expected = {
        "warnings": expected_warnings,
        "errors": expected_errors,
        "actions": expected_actions_for_null_rows,
    }

    validation_results = checks.validate(features, target)
    # Wrap the returned Series so the dict comparison below yields one bool.
    first_warning_details = validation_results['warnings'][0]['details']
    first_warning_details['pct_null_cols'] = SeriesWrap(
        first_warning_details['pct_null_cols']
    )
    assert validation_results == expected
def test_invalid_target_data_check_mismatched_indices():
    """Index mismatches between features and target produce warnings.

    Covers: no features, matching indices, disjoint indices, same labels in a
    different order, and more than ten mismatched labels (where the expected
    details are truncated to ten entries per side).
    """
    X = pd.DataFrame({"col": [1, 2, 3]})
    y_same_index = pd.Series([1, 0, 1])
    y_diff_index = pd.Series([0, 1, 0], index=[1, 5, 10])
    y_diff_index_order = pd.Series([0, 1, 0], index=[0, 2, 1])

    invalid_targets_check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary")
    )
    assert invalid_targets_check.validate(X=None, y=y_same_index) == {
        "warnings": [],
        "errors": [],
        "actions": [],
    }
    assert invalid_targets_check.validate(X, y_same_index) == {
        "warnings": [],
        "errors": [],
        "actions": [],
    }

    X_index_missing = list(set(y_diff_index.index) - set(X.index))
    y_index_missing = list(set(X.index) - set(y_diff_index.index))
    assert invalid_targets_check.validate(X, y_diff_index) == {
        "warnings": [
            DataCheckWarning(
                message="Input target and features have mismatched indices",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                details={
                    "indices_not_in_features": X_index_missing,
                    "indices_not_in_target": y_index_missing,
                },
            ).to_dict()
        ],
        "errors": [],
        "actions": [],
    }
    assert invalid_targets_check.validate(X, y_diff_index_order) == {
        "warnings": [
            DataCheckWarning(
                message="Input target and features have mismatched indices order",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.MISMATCHED_INDICES_ORDER,
                details={},
            ).to_dict()
        ],
        "errors": [],
        "actions": [],
    }

    # Test that we only store ten mismatches when there are more than 10 differences in indices found
    X_large = pd.DataFrame({"col": range(20)})
    y_more_than_ten_diff_indices = pd.Series([0, 1] * 10, index=range(20, 40))
    # Both differences are taken against X_large, the frame that is validated.
    X_index_missing = list(set(y_more_than_ten_diff_indices.index) - set(X_large.index))
    y_index_missing = list(set(X_large.index) - set(y_more_than_ten_diff_indices.index))
    # NOTE(review): the [:10] slices depend on set iteration order matching
    # whatever order the check reports - confirm against the implementation.
    assert invalid_targets_check.validate(X_large, y_more_than_ten_diff_indices) == {
        "warnings": [
            DataCheckWarning(
                message="Input target and features have mismatched indices",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                details={
                    "indices_not_in_features": X_index_missing[:10],
                    "indices_not_in_target": y_index_missing[:10],
                },
            ).to_dict()
        ],
        "errors": [],
        "actions": [],
    }
def test_invalid_target_data_input_formats():
    """The binary check accepts pd.Series, list and np.array targets.

    An empty Series is expected to yield a not-two-unique-values error with
    no target values. The same mostly-null target supplied as a Series, a
    list and an array is expected to yield identical errors.
    """
    check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary")
    )
    features = pd.DataFrame()

    def _not_two_unique_error(target_values):
        # Error dict for a target without exactly two unique values.
        return DataCheckError(
            message="Binary class targets require exactly two unique values.",
            data_check_name=invalid_targets_data_check_name,
            message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
            details={"target_values": target_values},
        ).to_dict()

    # test empty pd.Series
    outcome = check.validate(features, pd.Series())
    assert outcome == {"warnings": [], "errors": [_not_two_unique_error([])]}

    null_error = DataCheckError(
        message="3 row(s) (75.0%) of target values are null",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_HAS_NULL,
        details={"num_null_rows": 3, "pct_null_rows": 75},
    ).to_dict()
    expected_with_nulls = {
        "warnings": [],
        "errors": [null_error, _not_two_unique_error([0])],
    }

    # test pd.Series containing nulls
    outcome = check.validate(features, pd.Series([None, None, None, 0]))
    assert outcome == expected_with_nulls

    # test list
    outcome = check.validate(features, [None, None, None, 0])
    assert outcome == expected_with_nulls

    # test np.array
    outcome = check.validate(features, np.array([None, None, None, 0]))
    assert outcome == expected_with_nulls
def test_invalid_target_y_none():
    """Validating with ``y=None`` raises ValueError("y cannot be None")."""
    objective = get_default_primary_search_objective("binary")
    check = InvalidTargetDataCheck("binary", objective)
    empty_features = pd.DataFrame()
    with pytest.raises(ValueError, match="y cannot be None"):
        check.validate(empty_features, y=None)
def test_default_data_checks_classification(input_type):
    """Validate DefaultDataChecks output for binary and multiclass problems.

    Each problem type is run twice: through ``DefaultDataChecks`` (which is
    expected to add a class-imbalance error) and through a ``DataChecks``
    instance built from the default check classes with only the
    InvalidTargetDataCheck parameters supplied (no imbalance error expected).
    ``messages`` and ``expected_actions`` are defined outside this function.
    """
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 4, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
        'natural_language_nan': [
            None,
            "string_that_is_long_enough_for_natural_language_1",
            "string_that_is_long_enough_for_natural_language_2",
            "string_that_is_long_enough_for_natural_language_3",
            "string_that_is_long_enough_for_natural_language_4",
        ],
        'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5)),
    })
    # Assign through .loc: chained indexing (X[col][0] = ...) is not
    # guaranteed to write through to the frame and never does under pandas
    # Copy-on-Write.
    X.loc[0, 'nan_dt_col'] = None
    y = pd.Series([0, 1, np.nan, 1, 0])
    y_multiclass = pd.Series([0, 1, np.nan, 2, 0])
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_multiclass = ww.DataColumn(y_multiclass)

    data_checks = DefaultDataChecks(
        "binary", get_default_primary_search_objective("binary")
    )
    imbalance = [
        DataCheckError(
            message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={"target_values": [0.0, 1.0]},
        ).to_dict()
    ]
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:] + imbalance,
        "actions": expected_actions,
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
        {
            "InvalidTargetDataCheck": {
                "problem_type": "binary",
                "objective": get_default_primary_search_objective("binary"),
            }
        },
    )
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:],
        "actions": expected_actions,
    }

    # multiclass
    imbalance = [
        DataCheckError(
            message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 2.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={"target_values": [0.0, 2.0, 1.0]},
        ).to_dict()
    ]
    min_2_class_count = [
        DataCheckError(
            message="Target does not have at least two instances per class which is required for multiclass classification",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
            details={"least_populated_class_labels": [2.0, 1.0]},
        ).to_dict()
    ]
    high_class_to_sample_ratio = [
        DataCheckWarning(
            message="Target has a large number of unique values, could be regression type problem.",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
            details={'class_to_value_ratio': 0.6},
        ).to_dict()
    ]

    # multiclass
    data_checks = DefaultDataChecks(
        "multiclass", get_default_primary_search_objective("multiclass")
    )
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:] + imbalance,
        "actions": expected_actions,
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
        {
            "InvalidTargetDataCheck": {
                "problem_type": "multiclass",
                "objective": get_default_primary_search_objective("multiclass"),
            }
        },
    )
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:],
        "actions": expected_actions,
    }