Example #1
    def _check_for_errors(self, column_name, count_unique, any_nulls):
        """Checks if a column has no variance.

        Arguments:
            column_name (str): Name of the column we are checking.
            count_unique (float): Number of unique values in this column.
            any_nulls (bool): Whether this column has any missing data.

        Returns:
            DataCheckError if the column has no variance or DataCheckWarning if the column has two unique values including NaN.
        """
        message = f"{column_name} has {int(count_unique)} unique value."

        if count_unique <= 1:
            return DataCheckError(
                message=message,
                data_check_name=self.name,
                message_code=DataCheckMessageCode.NO_VARIANCE,
                details={"column": column_name})

        elif count_unique == 2 and not self._dropnan and any_nulls:
            return DataCheckWarning(
                message=f"{column_name} has two unique values including nulls. "
                "Consider encoding the nulls for "
                "this column to be useful for machine learning.",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL,
                details={"column": column_name})
Example #2
def test_multicollinearity_returns_warning():
    col = pd.Series([1, 0, 2, 3, 4])
    X = pd.DataFrame({
        'col_1': col,
        'col_2': col * 3,
        'col_3': ~col,
        'col_4': col / 2,
        'col_5': col + 1,
        'not_collinear': [0, 1, 0, 0, 0]
    })

    multi_check = MulticollinearityDataCheck(threshold=0.95)
    assert multi_check.validate(X) == {
        "warnings": [
            DataCheckWarning(
                message=
                "Columns are likely to be correlated: [('col_1', 'col_2'), ('col_1', 'col_3'), ('col_1', 'col_4'), ('col_1', 'col_5'), ('col_2', 'col_3'), ('col_2', 'col_4'), ('col_2', 'col_5'), ('col_3', 'col_4'), ('col_3', 'col_5'), ('col_4', 'col_5')]",
                data_check_name=multi_data_check_name,
                message_code=DataCheckMessageCode.IS_MULTICOLLINEAR,
                details={
                    'columns': [('col_1', 'col_2'), ('col_1', 'col_3'),
                                ('col_1', 'col_4'), ('col_1', 'col_5'),
                                ('col_2', 'col_3'), ('col_2', 'col_4'),
                                ('col_2', 'col_5'), ('col_3', 'col_4'),
                                ('col_3', 'col_5'), ('col_4', 'col_5')]
                }).to_dict()
        ],
        "errors": []
    }
Example #3
def test_outliers_data_check_warnings():
    a = np.arange(10) * 0.01
    data = np.tile(a, (100, 10))

    X = pd.DataFrame(data=data)
    X.iloc[0, 3] = 1000
    X.iloc[3, 25] = 1000
    X.iloc[5, 55] = 10000
    X.iloc[10, 72] = -1000
    X.iloc[:, 90] = 'string_values'

    outliers_check = OutliersDataCheck()
    assert outliers_check.validate(X) == {
        "warnings": [
            DataCheckWarning(
                message=
                "Column(s) '3', '25', '55', '72' are likely to have outlier data.",
                data_check_name=outliers_data_check_name,
                message_code=DataCheckMessageCode.HAS_OUTLIERS,
                details={
                    "columns": [3, 25, 55, 72]
                }).to_dict()
        ],
        "errors": []
    }
Example #4
    def validate(self, X, y):
        """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation.

        If `method='mutual'`, supports all target and feature types. If `method='pearson'`, only supports binary targets with numeric and boolean dtypes.
        Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1].

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
            y (ww.DataColumn, pd.Series, np.ndarray): The target data

        Returns:
            dict: Dictionary with a DataCheckWarning (and a DROP_COL DataCheckAction) if target leakage is detected.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame({
            ...    'leak': [10, 42, 31, 51, 61],
            ...    'x': [42, 54, 12, 64, 12],
            ...    'y': [13, 5, 13, 74, 24],
            ... })
            >>> y = pd.Series([10, 42, 31, 51, 40])
            >>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.95)
            >>> assert target_leakage_check.validate(X, y) == {"warnings": [{"message": "Column 'leak' is 95.0% or more correlated with the target",\
                                                                             "data_check_name": "TargetLeakageDataCheck",\
                                                                             "level": "warning",\
                                                                             "code": "TARGET_LEAKAGE",\
                                                                             "details": {"column": "leak"}}],\
                                                               "errors": [],\
                                                               "actions": [{"code": "DROP_COL",\
                                                                            "metadata": {"column": "leak"}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        y = infer_feature_types(y)

        if self.method == 'pearson':
            highly_corr_cols = self._calculate_pearson(X, y)
        else:
            X = _convert_woodwork_types_wrapper(X.to_dataframe())
            y = _convert_woodwork_types_wrapper(y.to_series())
            highly_corr_cols = self._calculate_mutual_information(X, y)

        warning_msg = "Column '{}' is {}% or more correlated with the target"
        results["warnings"].extend([
            DataCheckWarning(message=warning_msg.format(
                col_name, self.pct_corr_threshold * 100),
                             data_check_name=self.name,
                             message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                             details={
                                 "column": col_name
                             }).to_dict() for col_name in highly_corr_cols
        ])
        results["actions"].extend([
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={
                                "column": col_name
                            }).to_dict() for col_name in highly_corr_cols
        ])
        return results
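
A hypothetical usage sketch for the method selection above; the `method` constructor argument is assumed from the `self.method` attribute read in validate() and is not shown in this snippet:

import pandas as pd

X = pd.DataFrame({'leak': [0, 1, 0, 1, 1], 'x': [42, 54, 12, 64, 12]})
y = pd.Series([0, 1, 0, 1, 1])  # binary target, as Pearson requires
pearson_check = TargetLeakageDataCheck(pct_corr_threshold=0.8, method='pearson')
results = pearson_check.validate(X, y)  # dict with "warnings", "errors", "actions"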
Example #5
def test_multicollinearity_nonnumeric_cols(data_type, make_data_type):
    X = pd.DataFrame({
        'col_1': ["a", "b", "c", "d", "a"],
        'col_2': ["w", "x", "y", "z", "b"],
        'col_3': ["a", "a", "c", "d", "a"],
        'col_4': ["a", "b", "c", "d", "a"],
        'col_5': ["0", "0", "1", "2", "0"],
        'col_6': [1, 1, 2, 3, 1]
    })
    X = make_data_type(data_type, X)
    multi_check = MulticollinearityDataCheck(threshold=0.9)
    assert multi_check.validate(X) == {
        "warnings": [
            DataCheckWarning(
                message=
                "Columns are likely to be correlated: [('col_1', 'col_4'), ('col_3', 'col_5'), ('col_3', 'col_6'), ('col_5', 'col_6'), ('col_1', 'col_2'), ('col_2', 'col_4')]",
                data_check_name=multi_data_check_name,
                message_code=DataCheckMessageCode.IS_MULTICOLLINEAR,
                details={
                    'columns': [('col_1', 'col_4'), ('col_3', 'col_5'),
                                ('col_3', 'col_6'), ('col_5', 'col_6'),
                                ('col_1', 'col_2'), ('col_2', 'col_4')]
                }).to_dict()
        ],
        "errors": []
    }
Example #6
    def validate(self, X, y=None):
        """Check if any set of features are likely to be multicollinear.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

        Returns:
            dict: dict with a DataCheckWarning if there are any potentially multicollinear columns.
        """
        messages = {"warnings": [], "errors": []}

        X = infer_feature_types(X)
        mutual_info_df = X.mutual_information()
        if mutual_info_df.empty:
            return messages
        above_threshold = mutual_info_df.loc[
            mutual_info_df['mutual_info'] >= self.threshold]
        correlated_cols = [(col_1, col_2) for col_1, col_2 in zip(
            above_threshold['column_1'], above_threshold['column_2'])]
        if correlated_cols:
            warning_msg = "Columns are likely to be correlated: {}"
            messages["warnings"].append(
                DataCheckWarning(
                    message=warning_msg.format(correlated_cols),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.IS_MULTICOLLINEAR,
                    details={
                        "columns": correlated_cols
                    }).to_dict())
        return messages
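
A minimal sketch of the thresholding step above, substituting a hand-built table for the output of X.mutual_information():

import pandas as pd

mutual_info_df = pd.DataFrame({
    'column_1': ['col_1', 'col_1'],
    'column_2': ['col_2', 'col_3'],
    'mutual_info': [1.0, 0.4],
})
threshold = 0.9
above_threshold = mutual_info_df.loc[mutual_info_df['mutual_info'] >= threshold]
correlated_cols = list(zip(above_threshold['column_1'], above_threshold['column_2']))
# [('col_1', 'col_2')]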
Example #7
def test_default_data_checks_regression(input_type):
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100]
    })
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [
        DataCheckWarning(
            message=
            "Column 'lots_of_null' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={
                "column": "lots_of_null"
            }).to_dict()
    ]
    data_checks = DefaultDataChecks(
        "regression", get_default_primary_search_objective("regression"))
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:]
    }

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings":
        messages[:3] + null_leakage,
        "errors":
        messages[4:] + [
            DataCheckError(message="Y has 1 unique value.",
                           data_check_name="NoVarianceDataCheck",
                           message_code=DataCheckMessageCode.NO_VARIANCE,
                           details={
                               "column": "Y"
                           }).to_dict()
        ]
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "regression",
                "objective": get_default_primary_search_objective("regression")
            }
        })
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:]
    }
Example #8
def test_sparsity_data_check_warnings():
    data = pd.DataFrame({
        'most_sparse': [float(x) for x in range(10)],  # [0,1,2,3,4,5,6,7,8,9]
        'more_sparse': [x % 5 for x in range(10)],  # [0,1,2,3,4,0,1,2,3,4]
        'sparse': [x % 3 for x in range(10)],  # [0,1,2,0,1,2,0,1,2,0]
        'less_sparse': [x % 2 for x in range(10)],  # [0,1,0,1,0,1,0,1,0,1]
        'not_sparse': [float(1) for x in range(10)]
    })  # [1,1,1,1,1,1,1,1,1,1]

    sparsity_check = SparsityDataCheck(problem_type="multiclass",
                                       threshold=.4,
                                       unique_count_threshold=3)

    assert sparsity_check.validate(data) == {
        "warnings": [
            DataCheckWarning(
                message=
                "Input columns (most_sparse) for multiclass problem type are too sparse.",
                data_check_name=sparsity_data_check_name,
                message_code=DataCheckMessageCode.TOO_SPARSE,
                details={
                    "column": "most_sparse",
                    'sparsity_score': 0
                }).to_dict(),
            DataCheckWarning(
                message=
                "Input columns (more_sparse) for multiclass problem type are too sparse.",
                data_check_name=sparsity_data_check_name,
                message_code=DataCheckMessageCode.TOO_SPARSE,
                details={
                    "column": "more_sparse",
                    'sparsity_score': 0
                }).to_dict(),
            DataCheckWarning(
                message=
                "Input columns (sparse) for multiclass problem type are too sparse.",
                data_check_name=sparsity_data_check_name,
                message_code=DataCheckMessageCode.TOO_SPARSE,
                details={
                    "column": "sparse",
                    'sparsity_score': 0.3333333333333333
                }).to_dict()
        ],
        "errors": [],
        "actions": []
    }
Example #9
    def validate(self, X, y):
        """Checks if any target labels are imbalanced beyond a threshold for binary and multiclass problems
            Ignores NaN values in target labels if they appear.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
            y (ww.DataColumn, pd.Series, np.ndarray): Target labels to check for imbalanced data.

        Returns:
            dict: Dictionary with DataCheckWarnings if any class's share of the target falls below the threshold,
                  and DataCheckErrors if the number of values for each target is below 2 * num_cv_folds.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame()
            >>> y = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
            >>> target_check = ClassImbalanceDataCheck(threshold=0.10)
            >>> assert target_check.validate(X, y) == {"errors": [{"message": "The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0]",\
                                                                   "data_check_name": "ClassImbalanceDataCheck",\
                                                                   "level": "error",\
                                                                   "code": "CLASS_IMBALANCE_BELOW_FOLDS",\
                                                                   "details": {"target_values": [0]}}],\
                                                     "warnings": [{"message": "The following labels fall below 10% of the target: [0]",\
                                                                   "data_check_name": "ClassImbalanceDataCheck",\
                                                                   "level": "warning",\
                                                                   "code": "CLASS_IMBALANCE_BELOW_THRESHOLD",\
                                                                   "details": {"target_values": [0]}}]}
        """
        messages = {
            "warnings": [],
            "errors": []
        }

        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        fold_counts = y.value_counts(normalize=False)
        # search for targets that occur less than twice the number of cv folds first
        below_threshold_folds = fold_counts.where(fold_counts < self.cv_folds).dropna()
        if len(below_threshold_folds):
            below_threshold_values = below_threshold_folds.index.tolist()
            error_msg = "The number of instances of these targets is less than 2 * the number of cross folds = {} instances: {}"
            DataCheck._add_message(DataCheckError(message=error_msg.format(self.cv_folds, below_threshold_values),
                                                  data_check_name=self.name,
                                                  message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
                                                  details={"target_values": below_threshold_values}), messages)

        counts = fold_counts / fold_counts.sum()
        below_threshold = counts.where(counts < self.threshold).dropna()
        # if there are items that occur less than the threshold, add them to the list of messages
        if len(below_threshold):
            below_threshold_values = below_threshold.index.tolist()
            warning_msg = "The following labels fall below {:.0f}% of the target: {}"
            DataCheck._add_message(DataCheckWarning(message=warning_msg.format(self.threshold * 100, below_threshold_values),
                                                    data_check_name=self.name,
                                                    message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD,
                                                    details={"target_values": below_threshold_values}), messages)
        return messages
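
A worked sketch of the two comparisons above, assuming self.cv_folds is 6 (2 * the default of 3 folds, per the error message format):

import pandas as pd

y = pd.Series([0] + [1] * 10)
fold_counts = y.value_counts()                             # {1: 10, 0: 1}
below_folds = fold_counts.where(fold_counts < 6).dropna()  # class 0 -> error
ratios = fold_counts / fold_counts.sum()                   # {1: ~0.91, 0: ~0.09}
below_pct = ratios.where(ratios < 0.10).dropna()           # class 0 -> warning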
Example #10
def test_default_data_checks_null_rows():
    class SeriesWrap():
        def __init__(self, series):
            self.series = series

        def __eq__(self, series_2):
            return all(self.series.eq(series_2.series))

    X = pd.DataFrame({'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None]})
    y = pd.Series([0, 1, np.nan, 1, 0])
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    highly_null_rows = SeriesWrap(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0]))
    expected = {
        "warnings": [DataCheckWarning(message="5 out of 5 rows are more than 95.0% null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
                                      details={"pct_null_cols": highly_null_rows}).to_dict(),
                     DataCheckWarning(message="Column 'all_null' is 95.0% or more null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                      details={"column": 'all_null', "pct_null_rows": 1.0}).to_dict(),
                     DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                      details={"column": 'also_all_null', "pct_null_rows": 1.0}).to_dict()],
        "errors": [DataCheckError(message="1 row(s) (20.0%) of target values are null",
                                  data_check_name="InvalidTargetDataCheck",
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 1, "pct_null_rows": 20.0}).to_dict(),
                   DataCheckError(message="all_null has 0 unique value.",
                                  data_check_name="NoVarianceDataCheck",
                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                  details={"column": "all_null"}).to_dict(),
                   DataCheckError(message="also_all_null has 0 unique value.",
                                  data_check_name="NoVarianceDataCheck",
                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                  details={"column": "also_all_null"}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'also_all_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": "mean"}).to_dict()]}
    validation_results = data_checks.validate(X, y)
    validation_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validation_results['warnings'][0]['details']['pct_null_cols'])
    assert validation_results == expected
Example #11
def test_uniqueness_data_check_warnings():
    data = pd.DataFrame({
        'regression_unique_enough': [float(x) for x in range(100)],
        'regression_not_unique_enough': [float(1) for x in range(100)]
    })

    uniqueness_check = UniquenessDataCheck(problem_type="regression")
    assert uniqueness_check.validate(data) == {
        "warnings": [
            DataCheckWarning(
                message=
                "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",
                data_check_name=uniqueness_data_check_name,
                message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
                details={
                    "column": "regression_not_unique_enough",
                    'uniqueness_score': 0.0
                }).to_dict()
        ],
        "errors": [],
        "actions": []
    }

    data = pd.DataFrame({
        'multiclass_too_unique':
        ["Cats", "Are", "Absolutely", "The", "Best"] * 20,
        'multiclass_not_too_unique':
        ["Cats", "Cats", "Best", "Best", "Best"] * 20
    })
    uniqueness_check = UniquenessDataCheck(problem_type="multiclass")
    assert uniqueness_check.validate(data) == {
        "warnings": [
            DataCheckWarning(
                message=
                "Input columns (multiclass_too_unique) for multiclass problem type are too unique.",
                data_check_name=uniqueness_data_check_name,
                message_code=DataCheckMessageCode.TOO_UNIQUE,
                details={
                    "column": "multiclass_too_unique",
                    'uniqueness_score': 0.7999999999999999
                }).to_dict()
        ],
        "errors": [],
        "actions": []
    }
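
The uniqueness scores asserted above can be reproduced with a Gini/Simpson-style index, 1 - sum(p_i^2) over the value frequencies p_i; this formula is inferred from the expected values, not taken from UniquenessDataCheck itself:

import pandas as pd

col = pd.Series(["Cats", "Are", "Absolutely", "The", "Best"] * 20)
p = col.value_counts(normalize=True)
uniqueness_score = 1 - (p ** 2).sum()  # 1 - 5 * 0.2**2 = 0.8 (floating point gives 0.7999...)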
Example #12
def test_invalid_target_data_check_multiclass_problem_almostcontinuous_data():
    invalid_targets_check = InvalidTargetDataCheck(
        "multiclass", get_default_primary_search_objective("multiclass"))
    y_multiclass_high_classes = pd.Series(
        list(range(0, 100)) *
        3)  # 100 classes, 300 samples, .33 class/sample ratio
    X = pd.DataFrame({"col": range(len(y_multiclass_high_classes))})
    data_check_warning = DataCheckWarning(
        message=
        "Target has a large number of unique values, could be regression type problem.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
        details={
            "class_to_value_ratio": 1 / 3
        }).to_dict()
    assert invalid_targets_check.validate(X, y=y_multiclass_high_classes) == {
        "warnings": [data_check_warning],
        "errors": []
    }

    y_multiclass_med_classes = pd.Series(
        list(range(0, 5)) *
        20)  # 5 classes, 100 samples, .05 class/sample ratio
    X = pd.DataFrame({"col": range(len(y_multiclass_med_classes))})
    data_check_warning = DataCheckWarning(
        message=
        "Target has a large number of unique values, could be regression type problem.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
        details={
            "class_to_value_ratio": .05
        }).to_dict()
    assert invalid_targets_check.validate(X, y=y_multiclass_med_classes) == {
        "warnings": [data_check_warning],
        "errors": []
    }

    y_multiclass_low_classes = pd.Series(
        list(range(0, 3)) *
        100)  # 3 classes, 300 samples, .01 class/sample ratio
    X = pd.DataFrame({"col": range(len(y_multiclass_low_classes))})
    assert invalid_targets_check.validate(X, y=y_multiclass_low_classes) == {
        "warnings": [],
        "errors": []
    }
Example #13
    def validate(self, X, y=None):
        """Check if any of the features are likely to be ID columns. Currently performs these simple checks:

            - column name is "id"
            - column name ends in "_id"
            - column contains all unique values (and is categorical / integer type)

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check

        Returns:
            dict: A dictionary with a DataCheckWarning for each column that is likely to be an ID column.

        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...     'df_id': [0, 1, 2, 3, 4],
            ...     'x': [10, 42, 31, 51, 61],
            ...     'y': [42, 54, 12, 64, 12]
            ... })
            >>> id_col_check = IDColumnsDataCheck()
            >>> assert id_col_check.validate(df) == {"errors": [],\
                                                     "warnings": [{"message": "Column 'df_id' is 100.0% or more likely to be an ID column",\
                                                                   "data_check_name": "IDColumnsDataCheck",\
                                                                   "level": "warning",\
                                                                   "code": "HAS_ID_COLUMN",\
                                                                   "details": {"column": "df_id"}}]}
        """
        messages = {
            "warnings": [],
            "errors": []
        }

        X = _convert_to_woodwork_structure(X)

        col_names = [col for col in X.columns]
        cols_named_id = [col for col in col_names if (str(col).lower() == "id")]  # columns whose name is "id"
        id_cols = {col: 0.95 for col in cols_named_id}

        X = X.select(include=['Integer', 'Categorical'])
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        check_all_unique = (X.nunique() == len(X))
        cols_with_all_unique = check_all_unique[check_all_unique].index.tolist()  # columns whose values are all unique
        id_cols.update([(col, 1.0) if col in id_cols else (col, 0.95) for col in cols_with_all_unique])

        col_ends_with_id = [col for col in col_names if str(col).lower().endswith("_id")]  # columns whose name ends with "_id"
        id_cols.update([(col, 1.0) if col in id_cols else (col, 0.95) for col in col_ends_with_id])

        id_cols_above_threshold = {key: value for key, value in id_cols.items() if value >= self.id_threshold}
        warning_msg = "Column '{}' is {}% or more likely to be an ID column"
        messages["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.id_threshold * 100),
                                                      data_check_name=self.name,
                                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                                      details={"column": col_name}).to_dict()
                                     for col_name in id_cols_above_threshold])
        return messages
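
A compact sketch of the scoring heuristic above on a plain DataFrame: a column scores 0.95 on any single signal and is bumped to 1.0 when two signals agree (e.g. an all-unique column whose name ends in "_id"):

import pandas as pd

X = pd.DataFrame({'user_id': [1, 2, 3], 'x': [7, 7, 9]})
id_cols = {col: 0.95 for col in X.columns if str(col).lower() == "id"}
all_unique = X.nunique() == len(X)
id_cols.update({col: 1.0 if col in id_cols else 0.95
                for col in all_unique.index[all_unique]})
id_cols.update({col: 1.0 if col in id_cols else 0.95
                for col in X.columns if str(col).lower().endswith("_id")})
# id_cols == {'user_id': 1.0}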
Example #14
    def validate(self, X, y):
        return {
            "warnings": [
                DataCheckWarning(message="warning one",
                                 data_check_name=self.name,
                                 message_code=None).to_dict()
            ],
            "errors": []
        }
Example #15
def test_invalid_target_data_check_mismatched_indices():
    X = pd.DataFrame({"col": [1, 2, 3]})
    y_same_index = pd.Series([1, 0, 1])
    y_diff_index = pd.Series([0, 1, 0], index=[1, 5, 10])
    y_diff_index_order = pd.Series([0, 1, 0], index=[0, 2, 1])

    invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"))
    assert invalid_targets_check.validate(X=None, y=y_same_index) == {"warnings": [], "errors": [], "actions": []}
    assert invalid_targets_check.validate(X, y_same_index) == {"warnings": [], "errors": [], "actions": []}

    X_index_missing = list(set(y_diff_index.index) - set(X.index))
    y_index_missing = list(set(X.index) - set(y_diff_index.index))
    assert invalid_targets_check.validate(X, y_diff_index) == {
        "warnings": [DataCheckWarning(message="Input target and features have mismatched indices",
                                      data_check_name=invalid_targets_data_check_name,
                                      message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                                      details={"indices_not_in_features": X_index_missing,
                                               "indices_not_in_target": y_index_missing}).to_dict()],
        "errors": [],
        "actions": []
    }
    assert invalid_targets_check.validate(X, y_diff_index_order) == {
        "warnings": [DataCheckWarning(message="Input target and features have mismatched indices order",
                                      data_check_name=invalid_targets_data_check_name,
                                      message_code=DataCheckMessageCode.MISMATCHED_INDICES_ORDER,
                                      details={}).to_dict()],
        "errors": [],
        "actions": []
    }

    # Test that we only store ten mismatches when there are more than 10 differences in indices found
    X_large = pd.DataFrame({"col": range(20)})
    y_more_than_ten_diff_indices = pd.Series([0, 1] * 10, index=range(20, 40))
    X_index_missing = list(set(y_more_than_ten_diff_indices.index) - set(X_large.index))
    y_index_missing = list(set(X_large.index) - set(y_more_than_ten_diff_indices.index))
    assert invalid_targets_check.validate(X_large, y_more_than_ten_diff_indices) == {
        "warnings": [DataCheckWarning(message="Input target and features have mismatched indices",
                                      data_check_name=invalid_targets_data_check_name,
                                      message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                                      details={"indices_not_in_features": X_index_missing[:10],
                                               "indices_not_in_target": y_index_missing[:10]}).to_dict()],
        "errors": [],
        "actions": []
    }
Example #16
def test_highly_null_data_check_warnings():
    data = pd.DataFrame({'lots_of_null': [None, None, None, None, 5],
                         'all_null': [None, None, None, None, None],
                         'no_null': [1, 2, 3, 4, 5]})
    no_null_check = HighlyNullDataCheck(pct_null_threshold=0.0)
    assert no_null_check.validate(data) == {
        "warnings": [DataCheckWarning(message="Column 'lots_of_null' is more than 0% null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "lots_of_null"}).to_dict(),
                     DataCheckWarning(message="Column 'all_null' is more than 0% null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "all_null"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()]
    }

    some_null_check = HighlyNullDataCheck(pct_null_threshold=0.5)
    assert some_null_check.validate(data) == {
        "warnings": [DataCheckWarning(message="Column 'lots_of_null' is 50.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "lots_of_null"}).to_dict(),
                     DataCheckWarning(message="Column 'all_null' is 50.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "all_null"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()]

    }

    all_null_check = HighlyNullDataCheck(pct_null_threshold=1.0)
    assert all_null_check.validate(data) == {
        "warnings": [DataCheckWarning(message="Column 'all_null' is 100.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "all_null"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()]
    }
Example #17
def test_class_imbalance_severe(min_samples, input_type):
    X = pd.DataFrame()
    # 0 will be < 10% of the data, but there will be 50 samples of it
    y_values_binary = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] * 50)
    y_values_multiclass = pd.Series([0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2] *
                                    50)
    if input_type == "ww":
        X = ww.DataTable(X)
        y_values_binary = ww.DataColumn(y_values_binary)
        y_values_multiclass = ww.DataColumn(y_values_multiclass)

    class_imbalance_check = ClassImbalanceDataCheck(min_samples=min_samples,
                                                    num_cv_folds=1)
    warnings = [
        DataCheckWarning(
            message="The following labels fall below 10% of the target: [0]",
            data_check_name=class_imbalance_data_check_name,
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD,
            details={
                "target_values": [0]
            }).to_dict()
    ]
    if min_samples > 50:
        warnings.append(
            DataCheckWarning(
                message=
                f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than {min_samples} samples: [0]",
                data_check_name=class_imbalance_data_check_name,
                message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE,
                details={
                    "target_values": [0]
                }).to_dict())
    assert class_imbalance_check.validate(X, y_values_binary) == {
        "warnings": warnings,
        "errors": [],
        "actions": []
    }

    assert class_imbalance_check.validate(X, y_values_multiclass) == {
        "warnings": warnings,
        "errors": [],
        "actions": []
    }
Example #18
    def validate(self, X, y=None):
        """Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Column with anomalies are considered to contain outliers.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

        Returns:
            dict: A dictionary with warnings if any columns have outliers.

        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...     'x': [1, 2, 3, 4, 5],
            ...     'y': [6, 7, 8, 9, 10],
            ...     'z': [-1, -2, -3, -1201, -4]
            ... })
            >>> outliers_check = OutliersDataCheck()
            >>> assert outliers_check.validate(df) == {"warnings": [{"message": "Column(s) 'z' are likely to have outlier data.",\
                                                                     "data_check_name": "OutliersDataCheck",\
                                                                     "level": "warning",\
                                                                     "code": "HAS_OUTLIERS",\
                                                                     "details": {"columns": ["z"]}}],\
                                                       "errors": []}
        """
        messages = {"warnings": [], "errors": []}

        X = infer_feature_types(X)
        X = X.select('numeric')
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        if len(X.columns) == 0:
            return messages

        def get_IQR(df, k=2.0):
            q1 = df.quantile(0.25)
            q3 = df.quantile(0.75)
            iqr = q3 - q1
            lower_bound = pd.Series(q1 - (k * iqr), name='lower_bound')
            upper_bound = pd.Series(q3 + (k * iqr), name='upper_bound')
            return pd.concat([lower_bound, upper_bound], axis=1)

        iqr = get_IQR(X, k=2.0)
        has_outliers = ((X < iqr['lower_bound']) |
                        (X > iqr['upper_bound'])).any()
        cols = list(has_outliers.index[has_outliers])
        if not cols:
            return messages
        warning_msg = "Column(s) {} are likely to have outlier data.".format(
            ", ".join([f"'{col}'" for col in cols]))
        messages["warnings"].append(
            DataCheckWarning(message=warning_msg,
                             data_check_name=self.name,
                             message_code=DataCheckMessageCode.HAS_OUTLIERS,
                             details={
                                 "columns": cols
                             }).to_dict())
        return messages
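
A worked example of the IQR bounds above (k=2.0) on a single column:

import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
q1, q3 = s.quantile(0.25), s.quantile(0.75)    # 2.0, 4.0
iqr = q3 - q1                                  # 2.0
lower, upper = q1 - 2.0 * iqr, q3 + 2.0 * iqr  # -2.0, 8.0
outlier_mask = (s < lower) | (s > upper)       # flags only the 100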
Example #19
    def validate(self, X, y=None):
        """Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Column with anomalies are considered to contain outliers.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

        Returns:
            dict: A dictionary with warnings if any columns have outliers.

        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...     'x': [1, 2, 3, 4, 5],
            ...     'y': [6, 7, 8, 9, 10],
            ...     'z': [-1, -2, -3, -1201, -4]
            ... })
            >>> outliers_check = OutliersDataCheck()
            >>> assert outliers_check.validate(df) == {"warnings": [{"message": "Column(s) 'z' are likely to have outlier data.",\
                                                                     "data_check_name": "OutliersDataCheck",\
                                                                     "level": "warning",\
                                                                     "code": "HAS_OUTLIERS",\
                                                                     "details": {"columns": ["z"]}}],\
                                                       "errors": [],\
                                                       "actions": []}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        X = X.select('numeric')
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        if len(X.columns) == 0:
            return results

        has_outliers = []
        for col in X.columns:
            outlier_results = OutliersDataCheck._outlier_score(X[col], False)
            # 0.9 is the threshold below which the data is considered to need improvement
            if outlier_results is not None and outlier_results["score"] <= 0.9:
                has_outliers.append(col)
        if not has_outliers:
            return results
        warning_msg = "Column(s) {} are likely to have outlier data.".format(
            ", ".join([f"'{col}'" for col in has_outliers]))
        results["warnings"].append(
            DataCheckWarning(message=warning_msg,
                             data_check_name=self.name,
                             message_code=DataCheckMessageCode.HAS_OUTLIERS,
                             details={
                                 "columns": has_outliers
                             }).to_dict())
        return results
Example #20
def test_data_check_message_to_dict():
    error = DataCheckError(message="test message",
                           data_check_name="same test name",
                           message_code=DataCheckMessageCode.HIGHLY_NULL,
                           details={"detail 1": "error info"})
    assert error.to_dict() == {
        "message": "test message",
        "level": "error",
        "data_check_name": "same test name",
        "code": DataCheckMessageCode.HIGHLY_NULL.name,
        "details": {"detail 1": "error info"}
    }
    warning = DataCheckWarning(message="test message",
                               data_check_name="same test name",
                               message_code=DataCheckMessageCode.HIGHLY_NULL,
                               details={"detail 1": "warning info"})
    assert warning.to_dict() == {
        "message": "test message",
        "level": "warning",
        "data_check_name": "same test name",
        "code": DataCheckMessageCode.HIGHLY_NULL.name,
        "details": {"detail 1": "warning info"}
    }
Example #21
    def validate(self, X, y=None):
        """Calculates what percentage of each column's unique values exceed the count threshold and compare
        that percentage to the sparsity threshold stored in the class instance.
        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.
        Returns:
            dict: dict with a DataCheckWarning if there are any sparse columns.
        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...    'sparse': [float(x) for x in range(100)],
            ...    'not_sparse': [float(1) for x in range(100)]
            ... })
            >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=0.5, unique_count_threshold=10)
            >>> assert sparsity_check.validate(df) == {"errors": [],\
                                                       "warnings": [{"message": "Input columns (sparse) for multiclass problem type are too sparse.",\
                                                            "data_check_name": "SparsityDataCheck",\
                                                            "level": "warning",\
                                                            "code": "TOO_SPARSE",\
                                                            "details": {"column": "sparse", 'sparsity_score': 0.0}}],\
                                                       "actions": [{"code": "DROP_COL",\
                                                                 "metadata": {"column": "sparse"}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        res = X.apply(SparsityDataCheck.sparsity_score,
                      count_threshold=self.unique_count_threshold)
        too_sparse_cols = [col for col in res.index[res < self.threshold]]
        results["warnings"].extend([
            DataCheckWarning(message=warning_too_unique.format(
                col_name, self.problem_type),
                             data_check_name=self.name,
                             message_code=DataCheckMessageCode.TOO_SPARSE,
                             details={
                                 "column": col_name,
                                 "sparsity_score": res.loc[col_name]
                             }).to_dict() for col_name in too_sparse_cols
        ])
        results["actions"].extend([
            DataCheckAction(action_code=DataCheckActionCode.DROP_COL,
                            metadata={
                                "column": col_name
                            }).to_dict() for col_name in too_sparse_cols
        ])
        return results
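
A sketch of what sparsity_score plausibly evaluates to under the docstring's definition (the share of unique values whose count exceeds unique_count_threshold); the formula is inferred from the expected scores in the sparsity test earlier in this section, not from sparsity_score itself:

import pandas as pd

col = pd.Series([x % 3 for x in range(10)])  # value counts: {0: 4, 1: 3, 2: 3}
counts = col.value_counts()
score = (counts > 3).sum() / len(counts)     # 1/3 ~= 0.3333, matching the test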
Example #22
def test_invalid_target_data_check_numeric_binary_classification_error():
    y = pd.Series([1, 5, 1, 5, 1, 1])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"))
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [DataCheckWarning(
            message="Numerical binary classification target classes must be [0, 1], got [1, 5] instead",
            data_check_name=invalid_targets_data_check_name,
            message_code=DataCheckMessageCode.TARGET_BINARY_INVALID_VALUES,
            details={"target_values": [1, 5]}).to_dict()],
        "errors": []
    }

    y = pd.Series([0, 5, np.nan, np.nan])
    X = pd.DataFrame({"col": range(len(y))})
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [DataCheckWarning(
            message="Numerical binary classification target classes must be [0, 1], got [5.0, 0.0] instead",
            data_check_name=invalid_targets_data_check_name,
            message_code=DataCheckMessageCode.TARGET_BINARY_INVALID_VALUES,
            details={"target_values": [5.0, 0.0]}).to_dict()],
        "errors": [DataCheckError(message="2 row(s) (50.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 2, "pct_null_rows": 50}).to_dict()]
    }

    y = pd.Series([0, 1, 1, 0, 1, 2])
    X = pd.DataFrame({"col": range(len(y))})
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [DataCheckError(message="Binary class targets require exactly two unique values.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                  details={"target_values": [1, 0, 2]}).to_dict()]
    }
Example #23
def test_outliers_data_check_string_cols():
    a = np.arange(10) * 0.01
    data = np.tile(a, (100, 2))
    n_cols = 20

    X = pd.DataFrame(data=data, columns=[string.ascii_lowercase[i] for i in range(n_cols)])
    X.iloc[0, 3] = 1000

    outliers_check = OutliersDataCheck()
    assert outliers_check.validate(X) == {
        "warnings": [DataCheckWarning(message="Column(s) 'd' are likely to have outlier data.",
                                      data_check_name=outliers_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_OUTLIERS,
                                      details={"columns": ["d"]}).to_dict()],
        "errors": []
    }
Example #24
def test_data_check_message_attributes_optional():
    data_check_warning = DataCheckWarning(
        message="test warning", data_check_name="test data check warning name")
    assert data_check_warning.message == "test warning"
    assert data_check_warning.data_check_name == "test data check warning name"
    assert data_check_warning.message_type == DataCheckMessageType.WARNING
    assert data_check_warning.message_code is None
    assert data_check_warning.details is None

    data_check_error = DataCheckError(
        message="test error", data_check_name="test data check error name")
    assert data_check_error.message == "test error"
    assert data_check_error.data_check_name == "test data check error name"
    assert data_check_error.message_type == DataCheckMessageType.ERROR
    assert data_check_error.message_code is None
    assert data_check_error.details is None
Example #25
    def validate(self, pipeline_name, cv_scores):
        """Checks cross-validation scores and issues an warning if variance is higher than specified threshhold.

        Arguments:
            pipeline_name (str): name of pipeline that produced cv_scores
            cv_scores (pd.Series, np.ndarray, list): list of scores of each cross-validation fold

        Returns:
            dict: Dictionary with DataCheckWarnings if the coefficient of variation of the cross-validation scores exceeds the threshold.

        Example:
            >>> cv_scores = pd.Series([0, 1, 1, 1])
            >>> check = HighVarianceCVDataCheck(threshold=0.10)
            >>> assert check.validate("LogisticRegressionPipeline", cv_scores) == {"warnings": [{"message": "High coefficient of variation (cv >= 0.1) within cross validation scores. LogisticRegressionPipeline may not perform as estimated on unseen data.",\
                                                                                                 "data_check_name": "HighVarianceCVDataCheck",\
                                                                                                 "level": "warning",\
                                                                                                 "code": "HIGH_VARIANCE",\
                                                                                                 "details": {"variance": 2.0/3.0, "pipeline_name": "LogisticRegressionPipeline"}}],\
                                                                                   "errors": []}
        """
        messages = {"warnings": [], "errors": []}
        if not isinstance(cv_scores, pd.Series):
            cv_scores = pd.Series(cv_scores)

        variance = 0
        if cv_scores.mean() == 0:
            high_variance_cv = False
        else:
            variance = abs(cv_scores.std() / cv_scores.mean())
            high_variance_cv = variance > self.threshold
        # if the coefficient of variation exceeds the threshold, add a warning to the list of messages
        if high_variance_cv:
            warning_msg = f"High coefficient of variation (cv >= {self.threshold}) within cross validation scores. {pipeline_name} may not perform as estimated on unseen data."
            DataCheck._add_message(
                DataCheckWarning(
                    message=warning_msg,
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.HIGH_VARIANCE,
                    details={
                        "variance": variance,
                        "pipeline_name": pipeline_name
                    }), messages)
        return messages
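
A worked example of the coefficient-of-variation computation above:

import pandas as pd

cv_scores = pd.Series([0, 1, 1, 1])
variance = abs(cv_scores.std() / cv_scores.mean())  # 0.5 / 0.75 = 2/3
high_variance_cv = variance > 0.1                   # True for threshold=0.1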
Example #26
def test_id_cols_data_check_input_formats():
    id_cols_check = IDColumnsDataCheck(id_threshold=0.8)

    # test empty pd.DataFrame
    assert id_cols_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}

    #  test Woodwork
    ww_input = ww.DataTable(np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]))
    assert id_cols_check.validate(ww_input) == {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 1}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 0}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 1}).to_dict()]
    }

    #  test 2D list
    assert id_cols_check.validate([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]) == {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning("Column '1' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 1}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 0}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 1}).to_dict()]
    }

    # test np.array
    assert id_cols_check.validate(np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]])) == {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 1}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 0}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 1}).to_dict()]
    }
Example #27
def test_high_variance_cv_data_check_negative():
    high_variance_cv = HighVarianceCVDataCheck()
    cv_scores = pd.Series([0, -1, -1, -1])
    variance = abs(cv_scores.std() / cv_scores.mean())
    assert high_variance_cv.validate(
        pipeline_name=hv_pipeline_name, cv_scores=cv_scores
    ) == {
        "warnings": [
            DataCheckWarning(
                message=
                "High coefficient of variation (cv >= 0.2) within cross validation scores. LogisticRegressionPipeline may not perform as estimated on unseen data.",
                data_check_name=high_variance_data_check_name,
                message_code=DataCheckMessageCode.HIGH_VARIANCE,
                details={
                    "variance": variance,
                    "pipeline_name": hv_pipeline_name
                }).to_dict()
        ],
        "errors": []
    }
Example #28
def test_id_columns_warning():
    X_dict = {'col_1_id': [0, 1, 2, 3],
              'col_2': [2, 3, 4, 5],
              'col_3_id': [1, 1, 2, 3],
              'Id': [3, 1, 2, 0],
              'col_5': [0, 0, 1, 2],
              'col_6': [0.1, 0.2, 0.3, 0.4]
              }
    X = pd.DataFrame.from_dict(X_dict)
    id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
    assert id_cols_check.validate(X) == {
        "warnings": [DataCheckWarning(message="Column 'Id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "Id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_1_id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_1_id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_2' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_2"}).to_dict(),
                     DataCheckWarning(message="Column 'col_3_id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_3_id"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "Id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_1_id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_2"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_3_id"}).to_dict()]
    }

    X = pd.DataFrame.from_dict(X_dict)
    id_cols_check = IDColumnsDataCheck(id_threshold=1.0)
    assert id_cols_check.validate(X) == {
        "warnings": [DataCheckWarning(message="Column 'Id' is 100.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "Id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_1_id' is 100.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_1_id"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "Id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_1_id"}).to_dict()]
    }
Example #29
def test_id_columns_strings():
    X_dict = {'col_1_id': ["a", "b", "c", "d"],
              'col_2': ["w", "x", "y", "z"],
              'col_3_id': ["123456789012345", "234567890123456", "3456789012345678", "45678901234567"],
              'Id': ["z", "y", "x", "a"],
              'col_5': ["0", "0", "1", "2"],
              'col_6': [0.1, 0.2, 0.3, 0.4]
              }
    X = pd.DataFrame.from_dict(X_dict)
    id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
    assert id_cols_check.validate(X) == {
        "warnings": [DataCheckWarning(message="Column 'Id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "Id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_1_id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_1_id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_2' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_2"}).to_dict(),
                     DataCheckWarning(message="Column 'col_3_id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_3_id"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "Id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_1_id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_2"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_3_id"}).to_dict()]
    }

    id_cols_check = IDColumnsDataCheck(id_threshold=1.0)
    assert id_cols_check.validate(X) == {
        "warnings": [DataCheckWarning(message="Column 'Id' is 100.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "Id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_1_id' is 100.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_1_id"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "Id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_1_id"}).to_dict()]
    }
Example #30
     "warnings": [],
     "errors": [labels_1_unique],
     "actions": []
 }),
 (all_distinct_X, all_null_y, False, {
     "warnings": [],
     "errors": [labels_0_unique],
     "actions": []
 }),
 (two_distinct_with_nulls_X, two_distinct_with_nulls_y, True, {
     "warnings": [
         DataCheckWarning(
             message=
             "feature has two unique values including nulls. Consider encoding the nulls for "
             "this column to be useful for machine learning.",
             data_check_name=no_variance_data_check_name,
             message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL,
             details={
                 "column": "feature"
             }).to_dict(),
         DataCheckWarning(
             message=
             "Y has two unique values including nulls. Consider encoding the nulls for "
             "this column to be useful for machine learning.",
             data_check_name=no_variance_data_check_name,
             message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL,
             details={
                 "column": "Y"
             }).to_dict()
     ],
     "errors": [],