def test_default_data_checks_regression(input_type):
    """Check the regression output of DefaultDataChecks on a small frame.

    Three validations are asserted:

    1. default checks against a target that contains a NaN,
    2. default checks against a constant target,
    3. a ``DataChecks`` instance built from
       ``DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES`` against the first
       target again.

    Expected warnings and errors are slices of ``messages``, which is defined
    outside this function.

    Args:
        input_type: when equal to ``"ww"`` the inputs are wrapped in
            ``ww.DataTable`` / ``ww.DataColumn`` before validation.
    """
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
    })
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X, y, y_no_variance = (
            ww.DataTable(X),
            ww.DataColumn(y),
            ww.DataColumn(y_no_variance),
        )

    # Extra leakage warning expected only for the constant-target run.
    lots_of_null_leakage = DataCheckWarning(
        message="Column 'lots_of_null' is 95.0% or more correlated with the target",
        data_check_name="TargetLeakageDataCheck",
        message_code=DataCheckMessageCode.TARGET_LEAKAGE,
        details={"column": "lots_of_null"},
    ).to_dict()

    default_checks = DefaultDataChecks(
        "regression",
        get_default_primary_search_objective("regression"),
    )
    nan_target_results = default_checks.validate(X, y)
    assert nan_target_results == {
        "warnings": messages[:3],
        "errors": messages[3:],
    }

    # Skip Invalid Target: with the constant target, messages[3] is left out
    # of the expected errors and a no-variance error on Y is appended.
    constant_target_results = default_checks.validate(X, y_no_variance)
    y_no_variance_error = DataCheckError(
        message="Y has 1 unique value.",
        data_check_name="NoVarianceDataCheck",
        message_code=DataCheckMessageCode.NO_VARIANCE,
        details={"column": "Y"},
    ).to_dict()
    assert constant_target_results == {
        "warnings": messages[:3] + [lots_of_null_leakage],
        "errors": messages[4:] + [y_no_variance_error],
    }

    # Same expectations when the check classes are passed to DataChecks
    # directly, with parameters supplied only for InvalidTargetDataCheck.
    invalid_target_params = {
        "problem_type": "regression",
        "objective": get_default_primary_search_objective("regression"),
    }
    explicit_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
        {"InvalidTargetDataCheck": invalid_target_params},
    )
    explicit_results = explicit_checks.validate(X, y)
    assert explicit_results == {
        "warnings": messages[:3],
        "errors": messages[3:],
    }
def test_default_data_checks_regression(input_type):
    """Check warnings, errors and actions from DefaultDataChecks for regression.

    Three validations are asserted:

    1. default checks against a target that contains a NaN,
    2. default checks against a constant target,
    3. a ``DataChecks`` instance built from
       ``DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES`` against the first
       target again.

    Expected values are assembled from ``messages`` and ``expected_actions``,
    both defined outside this function, plus the regression-specific entries
    built below.

    Args:
        input_type: when equal to ``"ww"`` the inputs are wrapped in
            ``ww.DataTable`` / ``ww.DataColumn`` before validation.
    """
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
        'natural_language_nan': [
            None,
            "string_that_is_long_enough_for_natural_language_1",
            "string_that_is_long_enough_for_natural_language_2",
            "string_that_is_long_enough_for_natural_language_3",
            "string_that_is_long_enough_for_natural_language_4",
        ],
        'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5)),
    })
    # Use a single .loc assignment rather than chained indexing
    # (X['nan_dt_col'][0] = None): the chained form can write to a temporary
    # object and leaves X untouched under pandas Copy-on-Write.
    X.loc[0, 'nan_dt_col'] = None

    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)

    # Leakage warning expected only for the constant-target run.
    null_leakage = [
        DataCheckWarning(
            message="Column 'lots_of_null' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={"column": "lots_of_null"},
        ).to_dict()
    ]
    data_checks = DefaultDataChecks(
        "regression",
        get_default_primary_search_objective("regression"),
    )

    # Leakage warnings expected for the NaN-containing target.
    id_leakage_warning = [
        DataCheckWarning(
            message="Column 'id' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={"column": "id"},
        ).to_dict()
    ]
    nan_dt_leakage_warning = [
        DataCheckWarning(
            message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={"column": "nan_dt_col"},
        ).to_dict()
    ]

    impute_action = DataCheckAction(
        DataCheckActionCode.IMPUTE_COL,
        metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'},
    ).to_dict()
    nan_dt_action = DataCheckAction(
        DataCheckActionCode.DROP_COL,
        metadata={"column": 'nan_dt_col'},
    ).to_dict()
    # expected_actions[3] is replaced by the drop + mean-impute pair.
    expected_actions_with_drop_and_impute = (
        expected_actions[:3]
        + [nan_dt_action, impute_action]
        + expected_actions[4:]
    )

    assert data_checks.validate(X, y) == {
        "warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
        "errors": messages[3:],
        "actions": expected_actions_with_drop_and_impute,
    }

    # Skip Invalid Target: messages[3] and expected_actions[3] are left out
    # and a no-variance error on Y is expected between messages[4:7] and
    # messages[7:].
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [
            DataCheckError(
                message="Y has 1 unique value.",
                data_check_name="NoVarianceDataCheck",
                message_code=DataCheckMessageCode.NO_VARIANCE,
                details={"column": "Y"},
            ).to_dict()
        ] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:],
    }

    # Same expectations when the check classes are passed to DataChecks
    # directly, with parameters supplied only for InvalidTargetDataCheck.
    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
        {
            "InvalidTargetDataCheck": {
                "problem_type": "regression",
                "objective": get_default_primary_search_objective("regression"),
            }
        },
    )
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
        "errors": messages[3:],
        "actions": expected_actions_with_drop_and_impute,
    }
def test_default_data_checks_null_rows():
    """Check DefaultDataChecks output for a frame whose columns are all null.

    The highly-null-rows warning carries a pandas Series in its details, so
    both the expected and the actual Series are wrapped in ``SeriesWrap`` to
    make the final dictionary comparison reduce to a single boolean.
    """

    class SeriesWrap():
        """Holds a Series and compares equal when every element matches."""

        def __init__(self, series):
            self.series = series

        def __eq__(self, other):
            elementwise = self.series.eq(other.series)
            return all(elementwise)

    X = pd.DataFrame({
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
    })
    y = pd.Series([0, 1, np.nan, 1, 0])
    data_checks = DefaultDataChecks(
        "regression",
        get_default_primary_search_objective("regression"),
    )

    highly_null_rows = SeriesWrap(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0]))

    expected_warnings = [
        DataCheckWarning(
            message="5 out of 5 rows are more than 95.0% null",
            data_check_name="HighlyNullDataCheck",
            message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
            details={"pct_null_cols": highly_null_rows},
        ).to_dict(),
        DataCheckWarning(
            message="Column 'all_null' is 95.0% or more null",
            data_check_name="HighlyNullDataCheck",
            message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
            details={"column": 'all_null', "pct_null_rows": 1.0},
        ).to_dict(),
        DataCheckWarning(
            message="Column 'also_all_null' is 95.0% or more null",
            data_check_name="HighlyNullDataCheck",
            message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
            details={"column": 'also_all_null', "pct_null_rows": 1.0},
        ).to_dict(),
    ]
    expected_errors = [
        DataCheckError(
            message="1 row(s) (20.0%) of target values are null",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_HAS_NULL,
            details={"num_null_rows": 1, "pct_null_rows": 20.0},
        ).to_dict(),
        DataCheckError(
            message="all_null has 0 unique value.",
            data_check_name="NoVarianceDataCheck",
            message_code=DataCheckMessageCode.NO_VARIANCE,
            details={"column": "all_null"},
        ).to_dict(),
        DataCheckError(
            message="also_all_null has 0 unique value.",
            data_check_name="NoVarianceDataCheck",
            message_code=DataCheckMessageCode.NO_VARIANCE,
            details={"column": "also_all_null"},
        ).to_dict(),
    ]
    expected_actions_for_null_rows = [
        DataCheckAction(
            DataCheckActionCode.DROP_ROWS,
            metadata={"rows": [0, 1, 2, 3, 4]},
        ).to_dict(),
        DataCheckAction(
            DataCheckActionCode.DROP_COL,
            metadata={"column": 'all_null'},
        ).to_dict(),
        DataCheckAction(
            DataCheckActionCode.DROP_COL,
            metadata={"column": 'also_all_null'},
        ).to_dict(),
        DataCheckAction(
            DataCheckActionCode.IMPUTE_COL,
            metadata={"column": None, "is_target": True, "impute_strategy": "mean"},
        ).to_dict(),
    ]
    expected = {
        "warnings": expected_warnings,
        "errors": expected_errors,
        "actions": expected_actions_for_null_rows,
    }

    validation_results = data_checks.validate(X, y)
    # Wrap the returned Series in place so the dict comparison below goes
    # through SeriesWrap.__eq__ instead of comparing raw Series objects.
    first_warning_details = validation_results['warnings'][0]['details']
    first_warning_details['pct_null_cols'] = SeriesWrap(first_warning_details['pct_null_cols'])

    assert validation_results == expected
def test_default_data_checks_classification(input_type):
    """Check DefaultDataChecks output for binary and multiclass problems.

    For each problem type the same frame is validated twice: once through
    ``DefaultDataChecks`` and once through a ``DataChecks`` instance built
    from ``DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES`` with parameters for
    ``InvalidTargetDataCheck`` only. The class-imbalance error is expected
    only in the ``DefaultDataChecks`` results.

    Expected values are assembled from ``messages`` and ``expected_actions``,
    both defined outside this function.

    Args:
        input_type: when equal to ``"ww"`` the inputs are wrapped in
            ``ww.DataTable`` / ``ww.DataColumn`` before validation.
    """
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 4, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
        'natural_language_nan': [
            None,
            "string_that_is_long_enough_for_natural_language_1",
            "string_that_is_long_enough_for_natural_language_2",
            "string_that_is_long_enough_for_natural_language_3",
            "string_that_is_long_enough_for_natural_language_4",
        ],
        'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5)),
    })
    # Use a single .loc assignment rather than chained indexing
    # (X['nan_dt_col'][0] = None): the chained form can write to a temporary
    # object and leaves X untouched under pandas Copy-on-Write.
    X.loc[0, 'nan_dt_col'] = None

    y = pd.Series([0, 1, np.nan, 1, 0])
    y_multiclass = pd.Series([0, 1, np.nan, 2, 0])

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_multiclass = ww.DataColumn(y_multiclass)

    # ---- binary ----
    data_checks = DefaultDataChecks(
        "binary",
        get_default_primary_search_objective("binary"),
    )
    binary_imbalance = [
        DataCheckError(
            message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={"target_values": [0.0, 1.0]},
        ).to_dict()
    ]
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:] + binary_imbalance,
        "actions": expected_actions,
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
        {
            "InvalidTargetDataCheck": {
                "problem_type": "binary",
                "objective": get_default_primary_search_objective("binary"),
            }
        },
    )
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:],
        "actions": expected_actions,
    }

    # ---- multiclass ----
    multiclass_imbalance = [
        DataCheckError(
            message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 2.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={"target_values": [0.0, 2.0, 1.0]},
        ).to_dict()
    ]
    min_2_class_count = [
        DataCheckError(
            message="Target does not have at least two instances per class which is required for multiclass classification",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
            details={"least_populated_class_labels": [2.0, 1.0]},
        ).to_dict()
    ]
    high_class_to_sample_ratio = [
        DataCheckWarning(
            message="Target has a large number of unique values, could be regression type problem.",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
            details={'class_to_value_ratio': 0.6},
        ).to_dict()
    ]

    data_checks = DefaultDataChecks(
        "multiclass",
        get_default_primary_search_objective("multiclass"),
    )
    # The two-instances-per-class error is expected right after messages[3].
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:] + multiclass_imbalance,
        "actions": expected_actions,
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
        {
            "InvalidTargetDataCheck": {
                "problem_type": "multiclass",
                "objective": get_default_primary_search_objective("multiclass"),
            }
        },
    )
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:],
        "actions": expected_actions,
    }