def validate(self, X, y):
        """Flag features whose correlation with the target exceeds the configured threshold.

        The correlation is measured with Pearson correlation when `method='pearson'`
        and with mutual information otherwise. If `method='mutual'`, supports all target
        and feature types. Otherwise, if `method='pearson'` only supports binary with
        numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1],
        while mutual information returns a value in [0, 1].

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
            y (ww.DataColumn, pd.Series, np.ndarray): The target data

        Returns:
            dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame({
            ...    'leak': [10, 42, 31, 51, 61],
            ...    'x': [42, 54, 12, 64, 12],
            ...    'y': [13, 5, 13, 74, 24],
            ... })
            >>> y = pd.Series([10, 42, 31, 51, 40])
            >>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.95)
            >>> assert target_leakage_check.validate(X, y) == {"warnings": [{"message": "Column 'leak' is 95.0% or more correlated with the target",\
                                                                             "data_check_name": "TargetLeakageDataCheck",\
                                                                             "level": "warning",\
                                                                             "code": "TARGET_LEAKAGE",\
                                                                             "details": {"column": "leak"}}],\
                                                               "errors": [],\
                                                               "actions": [{"code": "DROP_COL",\
                                                                            "metadata": {"column": "leak"}}]}
        """
        X = infer_feature_types(X)
        y = infer_feature_types(y)

        if self.method != 'pearson':
            # The mutual information path works on converted pandas structures.
            X = _convert_woodwork_types_wrapper(X.to_dataframe())
            y = _convert_woodwork_types_wrapper(y.to_series())
            leaking_columns = self._calculate_mutual_information(X, y)
        else:
            leaking_columns = self._calculate_pearson(X, y)

        message_template = "Column '{}' is {}% or more correlated with the target"
        warnings = []
        actions = []
        # Every flagged column produces one warning and one matching drop action.
        for column in leaking_columns:
            warning = DataCheckWarning(
                message=message_template.format(column, self.pct_corr_threshold * 100),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": column})
            warnings.append(warning.to_dict())
            drop_action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                          metadata={"column": column})
            actions.append(drop_action.to_dict())

        return {"warnings": warnings, "errors": [], "actions": actions}
    def fit_resample(self, X, y):
        """Choose which training rows to keep after undersampling.

        Only the target is needed to decide which rows are removed, so `X` is
        accepted for interface compatibility but is not inspected or converted.

        Arguments:
            X (pd.DataFrame): Training data to fit and resample. Unused.
            y (pd.Series): Training data targets to fit and resample

        Returns:
            list: Indices to keep for training data
        """
        y = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())
        samples_to_remove = self._find_ideal_samples(y)
        indices_to_drop = []
        if len(samples_to_remove):
            # For each class that must shrink, randomly pick (without replacement)
            # the required number of its row labels to discard.
            for label, num_to_remove in samples_to_remove.items():
                candidates = y.index[y == label].values
                indices_to_drop.extend(
                    self.random_state.choice(candidates,
                                             num_to_remove,
                                             replace=False))
        return list(set(y.index.values).difference(indices_to_drop))
示例#3
0
    def transform(self, X, y=None):
        """Return the inputs as-is, after passing them through type inference.

        Arguments:
            X (ww.DataFrame): Training features. Ignored.
            y (ww.DataColumn): Target features. Ignored.

        Returns:
            ww.DataTable, ww.DataColumn: X and y data that was passed in.
        """
        features = infer_feature_types(X)
        # A missing target is passed back untouched.
        target = None if y is None else infer_feature_types(y)
        return features, target
示例#4
0
    def fit_transform(self, X, y):
        """Fit the sampler, then resample the training data with it.

        Used during training of the pipeline.

        Arguments:
            X (ww.DataFrame): Training features
            y (ww.DataColumn): Target features

        Returns:
            ww.DataTable, ww.DataColumn: Sampled X and y data
        """
        self.fit(X, y)
        # Only the pandas versions of the prepared data are handed to the sampler.
        _, _, features_pd, target_pd = self._prepare_data(X, y)
        resampled = self._component_obj.fit_resample(features_pd, target_pd)
        sampled_features, sampled_target = resampled
        return (infer_feature_types(sampled_features),
                infer_feature_types(sampled_target))
示例#5
0
def test_ensemble_data(mock_fit, mock_score, dummy_binary_pipeline_class,
                       stackable_classifiers):
    """Check the amount of data passed to `fit` for regular and stacked-ensemble pipelines.

    A regular pipeline is expected to be fit on a share of the training split,
    while a stacked ensemble pipeline is expected to be fit on a share of the
    ensembling split.
    """
    n_rows = 100
    X = pd.DataFrame({"a": list(range(n_rows))})
    y = pd.Series([i % 2 for i in range(n_rows)])
    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_batches=19,
                          ensembling=True,
                          _ensembling_split_size=0.25)
    mock_should_continue_callback = MagicMock(return_value=True)
    mock_pre_evaluation_callback = MagicMock()
    mock_post_evaluation_callback = MagicMock()

    # Split the row positions themselves to learn which rows land in each split.
    row_positions = ww.DataTable(np.arange(X.shape[0]))
    training_split, ensembling_split, _, _ = split_data(row_positions,
                                                        y,
                                                        problem_type='binary',
                                                        test_size=0.25,
                                                        random_seed=0)
    training_indices = training_split.to_dataframe()[0].tolist()
    ensembling_indices = ensembling_split.to_dataframe()[0].tolist()

    engine = SequentialEngine(
        X_train=infer_feature_types(X),
        y_train=infer_feature_types(y),
        ensembling_indices=ensembling_indices,
        automl=automl,
        should_continue_callback=mock_should_continue_callback,
        pre_evaluation_callback=mock_pre_evaluation_callback,
        post_evaluation_callback=mock_post_evaluation_callback)

    regular_batch = [dummy_binary_pipeline_class({'Mock Classifier': {'a': 1}})]
    engine.evaluate_batch(regular_batch)
    # check the fit length is correct, taking into account the data splits
    fitted_rows = len(mock_fit.call_args[0][0])
    assert fitted_rows == int(2 / 3 * len(training_indices))

    input_pipelines = []
    for classifier in stackable_classifiers:
        input_pipelines.append(
            make_pipeline_from_components([classifier], problem_type='binary'))
    ensemble_batch = [
        make_pipeline_from_components(
            [StackedEnsembleClassifier(input_pipelines, n_jobs=1)],
            problem_type='binary',
            custom_name="Stacked Ensemble Classification Pipeline")
    ]
    engine.evaluate_batch(ensemble_batch)
    fitted_rows = len(mock_fit.call_args[0][0])
    assert fitted_rows == int(2 / 3 * len(ensembling_indices))
def test_samplers_perform_equally(problem_type, component_sampler, imblearn_sampler, X_y_binary, X_y_multi):
    """Check that a sampler component produces the same output as the imblearn sampler it is compared with."""
    random_seed = 1
    sampling_ratio = 0.5
    sampling_dic = {'sampling_ratio': sampling_ratio}

    if problem_type == 'binary':
        X, _ = X_y_binary
        y = np.array([0] * 90 + [1] * 10)
        imb_learn_sampling_ratio = 0.5
        expected_y = np.array([0] * 90 + [1] * 45)
    else:
        X, _ = X_y_multi
        y = np.array([0] * 70 + [1] * 20 + [2] * 10)
        imb_learn_sampling_ratio = {0: 70, 1: 35, 2: 35}
        expected_y = np.array([0] * 70 + [1] * 35 + [2] * 35)

    imblearn_kwargs = {'sampling_strategy': imb_learn_sampling_ratio}
    X2 = X
    if component_sampler == SMOTENCSampler:
        # SMOTENC needs the categorical columns declared on both sides.
        X2 = infer_feature_types(X, feature_types={1: "Categorical", 2: "Categorical", 3: "Categorical", 4: "Categorical"})
        imblearn_kwargs['categorical_features'] = [1, 2, 3, 4]
    imblearn_kwargs['random_state'] = random_seed

    component = component_sampler(**sampling_dic, random_seed=random_seed)
    imb_sampler = imblearn_sampler(**imblearn_kwargs)

    X_com, y_com = component.fit_transform(X2, y)
    X_im, y_im = imb_sampler.fit_resample(X, y)

    np.testing.assert_equal(X_com.to_dataframe().values, X_im)
    np.testing.assert_equal(y_com.to_series().values, y_im)
    np.testing.assert_equal(sorted(y_im), expected_y)
def test_oversample_seed_same_outputs(sampler, X_y_binary):
    """Check that oversamplers are reproducible for equal seeds and differ for different seeds."""
    X, _ = X_y_binary
    X = pd.DataFrame(X)
    y = pd.Series([0] * 90 + [1] * 10)

    # The categorical typing does not depend on the seed, so set it up once
    # rather than on every loop iteration.
    if 'NC' in sampler.name:
        X = infer_feature_types(X, feature_types={1: "Categorical"})

    # One sampler per seed; the first two share a seed, the third differs.
    samplers = [sampler(sampling_ratio=1, random_seed=seed) for seed in (0, 0, 1)]

    # iterate through different indices in samplers
    # in group 1, first two oversamplers in samplers should be equal
    # in group 2, calling same oversamplers twice should be equal
    # in group 3, last two oversamplers in samplers should be different
    for s1, s2 in [[0, 1], [1, 1], [1, 2]]:
        X1, y1 = samplers[s1].fit_transform(X, y)
        X2, y2 = samplers[s2].fit_transform(X, y)
        if s2 == 2 and sampler != SMOTENSampler:
            # group 3, SMOTENSampler performance doesn't change with different random states
            with pytest.raises(AssertionError):
                pd.testing.assert_frame_equal(X1.to_dataframe(), X2.to_dataframe())
        else:
            pd.testing.assert_frame_equal(X1.to_dataframe(), X2.to_dataframe())
        pd.testing.assert_series_equal(y1.to_series(), y2.to_series())
def test_oversample_imbalanced_binary(data_type, sampler, make_data_type):
    """Check that oversampling a 150/850 binary target balances it, and that `transform` returns the data unchanged."""
    row_ids = range(1000)
    X = np.array([list(row_ids),
                  [i % 7 for i in row_ids],
                  [0.3 * (i % 3) for i in row_ids]]).T
    y = np.array([0] * 150 + [1] * 850)
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    oversampler = sampler
    fit_features = X
    if oversampler.name == "SMOTENC Oversampler":
        # SMOTENC needs at least one column typed as categorical.
        fit_features = infer_feature_types(X, feature_types={1: "Categorical"})
        if data_type == "ww":
            fit_features = fit_features.set_types({0: "Categorical"})
    new_X, new_y = oversampler.fit_transform(fit_features, y)

    new_length = 1700
    assert len(new_X) == new_length
    assert len(new_y) == new_length
    value_counts = new_y.to_series().value_counts()
    counts = value_counts.values
    assert counts[0] == counts[1]
    pd.testing.assert_series_equal(value_counts, pd.Series([850, 850]), check_dtype=False)

    transform_X, transform_y = oversampler.transform(X, y)

    # Reduce the inputs to raw arrays for the comparison.
    if data_type == "ww":
        X, y = X.to_dataframe().values, y.to_series().values
    elif data_type == "pd":
        X, y = X.values, y.values

    np.testing.assert_equal(X, transform_X.to_dataframe().values)
    np.testing.assert_equal(y, transform_y.to_series().values)
示例#9
0
    def fit_resample(self, X, y):
        """Choose which training rows to keep after undersampling.

        Arguments:
            X (pd.DataFrame): Training data to fit and resample
            y (pd.Series): Training data targets to fit and resample

        Returns:
            list: Indices to keep for training data
        """
        y = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())

        # An explicitly provided sampling dictionary takes precedence.
        if len(self.sampling_ratio_dict):
            removal_counts = self._sampling_dict_to_remove_dict(y)
        else:
            removal_counts = self._find_ideal_samples(y)

        indices_to_drop = []
        if len(removal_counts):
            # For every class to undersample, randomly choose (without replacement)
            # the required number of row labels to remove.
            for label, num_to_remove in removal_counts.items():
                class_labels = y.index[y == label].values
                chosen = self.random_state.choice(class_labels,
                                                  num_to_remove,
                                                  replace=False)
                indices_to_drop.extend(chosen)

        # Labels of the y data column that survive the undersampling.
        kept = set(y.index.values).difference(set(indices_to_drop))
        return list(kept)
示例#10
0
    def validate(self, X, y=None):
        """Warn about columns whose uniqueness score is on the wrong side of the threshold.

        For regression problems, columns scoring below the threshold are reported as
        not unique enough. For multiclass problems, columns scoring above the
        threshold are reported as too unique. Other problem types produce no warnings.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.  Defaults to None.

        Returns:
            dict: dict with a DataCheckWarning if there are any too unique or not
                unique enough columns.

        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...    'regression_unique_enough': [float(x) for x in range(100)],
            ...    'regression_not_unique_enough': [float(1) for x in range(100)]
            ... })
            >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
            >>> assert uniqueness_check.validate(df) == {"errors": [],\
                                                         "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",\
                                                                 "data_check_name": "UniquenessDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "NOT_UNIQUE_ENOUGH",\
                                                                 "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}],\
                                                         "actions": []}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        # One uniqueness score per column.
        scores = X.apply(UniquenessDataCheck.uniqueness_score)

        if is_regression(self.problem_type):
            flagged_cols = list(scores.index[scores < self.threshold])
            message_template = warning_not_unique_enough
            message_code = DataCheckMessageCode.NOT_UNIQUE_ENOUGH
        elif is_multiclass(self.problem_type):
            flagged_cols = list(scores.index[scores > self.threshold])
            message_template = warning_too_unique
            message_code = DataCheckMessageCode.TOO_UNIQUE
        else:
            return results

        for col_name in flagged_cols:
            warning = DataCheckWarning(
                message=message_template.format(col_name, self.problem_type),
                data_check_name=self.name,
                message_code=message_code,
                details={
                    "column": col_name,
                    "uniqueness_score": scores.loc[col_name]
                })
            results["warnings"].append(warning.to_dict())
        return results
示例#11
0
 def _calculate_mutual_information(self, X, y):
     """Return the columns of X whose mutual information with y exceeds `self.pct_corr_threshold`.

     Each column is paired with the target in a two-column frame, and the first
     reported `mutual_info` value of that pair is compared to the threshold.
     """
     flagged = []
     for col in X.columns:
         pair = pd.DataFrame({col: X[col], str(col) + "y": y})
         mutual_info = infer_feature_types(pair).mutual_information()
         if len(mutual_info) == 0:
             # Nothing was reported for this pair, so there is nothing to compare.
             continue
         if mutual_info['mutual_info'].iloc[0] > self.pct_corr_threshold:
             flagged.append(col)
     return flagged
示例#12
0
    def _prepare_data(self, X, y):
        """Build both the Woodwork and the pandas versions of the input data for the sampler.

        Arguments:
            X (ww.DataFrame): Training features
            y (ww.DataColumn): Target features

        Returns:
            ww.DataTable, ww.DataColumn, pd.DataFrame, pd.Series: Prepared X and y data, both woodwork and pandas

        Raises:
            ValueError: If y is None.
        """
        X_ww = infer_feature_types(X)
        if y is None:
            raise ValueError("y cannot be none")
        y_ww = infer_feature_types(y)
        features_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        target_pd = _convert_woodwork_types_wrapper(y_ww.to_series())
        return X_ww, y_ww, features_pd, target_pd
示例#13
0
    def transform(self, X, y=None):
        """Append the wrapped component's transformed columns to a copy of X.

        The features are cast to float32 and handed to the component as a cudf
        frame; each output column `i` is added as `component_{i}_fe`.
        `y` is not used.
        """
        features_pd = X.to_dataframe()
        X_return = features_pd.copy()
        gpu_features = cudf.from_pandas(features_pd).astype('float32')
        embedded = self._component_obj.transform(gpu_features)

        n_components = len(embedded.columns)
        for idx in range(n_components):
            # Output columns are looked up by integer label, as in the original.
            X_return[f'component_{idx}_fe'] = embedded[idx].to_array()

        return infer_feature_types(X_return)
示例#14
0
def test_tune_binary_threshold(mock_fit, mock_score, mock_predict_proba, mock_optimize_threshold,
                               dummy_binary_pipeline_class, X_y_binary):
    """Check the threshold set by `tune_binary_threshold` with data, without data, and for a multiclass problem."""
    mock_optimize_threshold.return_value = 0.42
    mock_score.return_value = {'F1': 1.0}
    X, y = X_y_binary
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    # Binary problems: tuned value when data is given, 0.5 when it is not.
    binary_cases = [
        (X, y, 0.42),
        (None, None, 0.5),
    ]
    for X_tune, y_tune, expected_threshold in binary_cases:
        pipeline = dummy_binary_pipeline_class({})
        tune_binary_threshold(pipeline, F1(), 'binary', X_tune, y_tune)
        assert pipeline.threshold == expected_threshold

    # Non-binary problems leave the threshold unset.
    pipeline = dummy_binary_pipeline_class({})
    tune_binary_threshold(pipeline, F1(), 'multiclass', X, y)
    assert pipeline.threshold is None
示例#15
0
 def split(self, X, y):
     """Yield train/test indices for each split, with the training portion passed through the sampler.

     Arguments:
             X (ww.DataTable): DataTable of points to split
             y (ww.DataTable): DataColumn of points to split
     Returns:
         tuple(train, test): A tuple containing the resulting train and test indices, post sampling.
     """
     features_ww = infer_feature_types(X)
     target_ww = infer_feature_types(y)
     features = _convert_woodwork_types_wrapper(features_ww.to_dataframe())
     target = _convert_woodwork_types_wrapper(target_ww.to_series())
     # Series holding the target's index labels, itself indexed by row position.
     label_lookup = pd.Series(target.index)
     for train, test in self.splitter.split(features, target):
         kept_labels = self.sampler.fit_resample(features.iloc[train],
                                                 target.iloc[train])
         # convert the indices of the y column into index indices of the original pre-split y
         kept_mask = label_lookup.isin(kept_labels)
         train_indices = label_lookup[kept_mask].dropna().index.values.tolist()
         yield iter([train_indices, test])
示例#16
0
def test_none_y(sampler):
    """Check that fitting without a target raises, while `transform` accepts a missing target."""
    X = pd.DataFrame({"a": list(range(5)),
                      "b": [1] * 5})
    X = infer_feature_types(X, feature_types={"a": "Categorical"})
    oversampler = sampler
    # Both fitting entry points must reject a missing target.
    for fitting_method in (oversampler.fit, oversampler.fit_transform):
        with pytest.raises(ValueError, match="y cannot be none"):
            fitting_method(X, None)
    oversampler.fit(X, pd.Series([0] * 4 + [1]))
    oversampler.transform(X, None)
示例#17
0
    def validate(self, X, y=None):
        """Warn about columns whose sparsity score falls below the configured threshold.

        Calculates what percentage of each column's unique values exceed the count threshold and compare
        that percentage to the sparsity threshold stored in the class instance.
        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.
        Returns:
            dict: dict with a DataCheckWarning if there are any sparse columns.
        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...    'sparse': [float(x) for x in range(100)],
            ...    'not_sparse': [float(1) for x in range(100)]
            ... })
            >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=0.5, unique_count_threshold=10)
            >>> assert sparsity_check.validate(df) == {"errors": [],\
                                                       "warnings": [{"message": "Input columns (sparse) for multiclass problem type are too sparse.",\
                                                            "data_check_name": "SparsityDataCheck",\
                                                            "level": "warning",\
                                                            "code": "TOO_SPARSE",\
                                                            "details": {"column": "sparse", 'sparsity_score': 0.0}}],\
                                                       "actions": [{"code": "DROP_COL",\
                                                                 "metadata": {"column": "sparse"}}]}
        """
        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        # One sparsity score per column.
        scores = X.apply(SparsityDataCheck.sparsity_score,
                         count_threshold=self.unique_count_threshold)
        too_sparse_cols = list(scores.index[scores < self.threshold])

        warnings = []
        actions = []
        # Every sparse column produces one warning and one matching drop action.
        for col_name in too_sparse_cols:
            warning = DataCheckWarning(
                message=warning_too_unique.format(col_name, self.problem_type),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TOO_SPARSE,
                details={
                    "column": col_name,
                    "sparsity_score": scores.loc[col_name]
                })
            warnings.append(warning.to_dict())
            drop_action = DataCheckAction(action_code=DataCheckActionCode.DROP_COL,
                                          metadata={"column": col_name})
            actions.append(drop_action.to_dict())

        return {"warnings": warnings, "errors": [], "actions": actions}
示例#18
0
 def transform_sample(self, X, y):
     """Apply the balancing strategy and report which rows to keep.

         Arguments:
             X (ww.DataTable): DataTable of points to split
             y (ww.DataTable): DataColumn of points to split
         Returns:
             list: List of indices to keep
     """
     target = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())
     # Series holding the target's index labels, itself indexed by row position.
     label_lookup = pd.Series(target.index)
     kept_labels = self.sampler.fit_resample(X, target)
     # convert the indices of the y column into index indices of the original pre-split y
     kept_mask = label_lookup.isin(kept_labels)
     return label_lookup[kept_mask].dropna().index.values.tolist()
示例#19
0
    def validate(self, X, y=None):
        """Report an error when any natural language column contains NaN values.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.  Defaults to None.

        Returns:
            dict: dict with a DataCheckError if NaN values are present in natural language columns.

        Example:
            >>> import pandas as pd
            >>> import woodwork as ww
            >>> import numpy as np
            >>> data = pd.DataFrame()
            >>> data['A'] = [None, "string_that_is_long_enough_for_natural_language"]
            >>> data['B'] = ['string_that_is_long_enough_for_natural_language', 'string_that_is_long_enough_for_natural_language']
            >>> data['C'] = np.random.randint(0, 3, size=len(data))
            >>> data = ww.DataTable(data, logical_types={'A': 'NaturalLanguage', 'B': 'NaturalLanguage'})
            >>> nl_nan_check = NaturalLanguageNaNDataCheck()
            >>> assert nl_nan_check.validate(data) == {
            ...        "warnings": [],
            ...        "actions": [],
            ...        "errors": [DataCheckError(message='Input natural language column(s) (A) contains NaN values. Please impute NaN values or drop these rows or columns.',
            ...                      data_check_name=NaturalLanguageNaNDataCheck.name,
            ...                      message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN,
            ...                      details={"columns": 'A'}).to_dict()]
            ...    }
        """
        results = {"warnings": [], "errors": [], "actions": []}

        natural_language = infer_feature_types(X).select('natural_language')
        column_stats = natural_language.describe_dict()
        nan_columns = [
            str(col) for col, stats in column_stats.items()
            if stats['nan_count'] > 0
        ]
        if not nan_columns:
            return results

        # All offending columns are reported together in a single error.
        cols_str = ', '.join(nan_columns)
        error = DataCheckError(message=error_contains_nan.format(cols_str),
                               data_check_name=self.name,
                               message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN,
                               details={"columns": cols_str})
        results["errors"].append(error.to_dict())
        return results
示例#20
0
def test_smotenc_output_shape(X_y_binary):
    """Check SMOTENCSampler output shapes for both the fixture target and an imbalanced target.

    Also checks that calling `transform` before `fit` raises.
    """
    X, y = X_y_binary
    y_imbalanced = pd.Series([0] * 90 + [1] * 10)
    X_ww = infer_feature_types(X, feature_types={0: 'Categorical', 1: 'Categorical'})
    snc = SMOTENCSampler()
    with pytest.raises(ComponentNotYetFittedError, match='You must fit SMOTENCSampler'):
        snc.transform(X_ww, y)
    # test sampling and no sampling
    for y_value in [y, y_imbalanced]:
        snc.fit(X_ww, y_value)
        X_out, y_out = snc.transform(X_ww, y_value)
        assert X_out.shape[1] == X_ww.shape[1]
        assert y_out.shape[0] == X_out.shape[0]

        # Use the loop's target here as well; passing the fixture target on
        # every iteration meant the imbalanced target never reached fit_transform.
        X_out, y_out = snc.fit_transform(X_ww, y_value)
        assert X_out.shape[1] == X_ww.shape[1]
        assert y_out.shape[0] == X_out.shape[0]
示例#21
0
    def validate(self, X, y=None):
        """Report an error when any datetime column contains NaN values.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.  Defaults to None.

        Returns:
            dict: dict with a DataCheckError if NaN values are present in datetime columns.

        Example:
            >>> import pandas as pd
            >>> import woodwork as ww
            >>> import numpy as np
            >>> dates = np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))
            >>> dates[0] = np.datetime64('NaT')
            >>> ww_input = ww.DataTable(pd.DataFrame(dates, columns=['index']))
            >>> dt_nan_check = DateTimeNaNDataCheck()
            >>> assert dt_nan_check.validate(ww_input) == {"warnings": [],
            ...                                             "actions": [],
            ...                                             "errors": [DataCheckError(message='Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.',
            ...                                                                     data_check_name=DateTimeNaNDataCheck.name,
            ...                                                                     message_code=DataCheckMessageCode.DATETIME_HAS_NAN,
            ...                                                                     details={"columns": 'index'}).to_dict()]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        datetime_cols = _convert_woodwork_types_wrapper(X.select("datetime").to_dataframe())
        has_nan = datetime_cols.isna().any()
        nan_columns = datetime_cols.columns[has_nan].tolist()
        if not nan_columns:
            return results

        # Joining a single name yields that name unchanged, so one join covers
        # both the single-column and the multi-column case.
        cols_str = ', '.join(str(col) for col in nan_columns)
        error = DataCheckError(message=error_contains_nan.format(cols_str),
                               data_check_name=self.name,
                               message_code=DataCheckMessageCode.DATETIME_HAS_NAN,
                               details={"columns": cols_str})
        results["errors"].append(error.to_dict())
        return results
示例#22
0
def test_no_oversample(data_type, sampler, make_data_type, X_y_binary):
    """Check that `fit_transform` returns the fixture data unchanged."""
    X, y = X_y_binary
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    oversampler = sampler
    fit_features = X
    if oversampler.name == "SMOTENC Oversampler":
        # SMOTENC needs at least one column typed as categorical.
        fit_features = infer_feature_types(X, feature_types={1: "Categorical"})
        if data_type == "ww":
            fit_features = fit_features.set_types({0: "Categorical"})
    new_X, new_y = oversampler.fit_transform(fit_features, y)

    # Reduce the inputs to raw arrays for the comparison.
    if data_type == "ww":
        X, y = X.to_dataframe().values, y.to_series().values
    elif data_type == "pd":
        X, y = X.values, y.values

    np.testing.assert_equal(X, new_X.to_dataframe().values)
    np.testing.assert_equal(y, new_y.to_series().values)
示例#23
0
def test_oversample_imbalanced_multiclass(data_type, sampler, sampling_ratio, make_data_type):
    """Check oversampling of a three-class target split 800/100/100.

    After ``fit_transform`` each of the two minority classes is expected to
    hold ``800 * sampling_ratio`` rows, while ``transform`` is expected to
    return the original features and target unchanged.
    """
    X = np.array([list(range(1000)),
                  [i % 7 for i in range(1000)],
                  [0.3 * (i % 3) for i in range(1000)]]).T
    y = np.array([0] * 800 + [1] * 100 + [2] * 100)
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    # The SMOTENC sampler is fed a copy of X whose first column is tagged as
    # categorical; the other samplers receive X directly.
    X2 = X
    if sampler.name == 'SMOTENC Oversampler':
        X2 = infer_feature_types(X, feature_types={0: "Categorical"})
        if data_type == "ww":
            X2 = X2.set_types({0: "Categorical"})

    # Build the sampler once: its arguments do not depend on the branch above.
    oversampler = sampler(sampling_ratio=sampling_ratio)

    new_X, new_y = oversampler.fit_transform(X2, y)

    num_samples = [800, 800 * sampling_ratio, 800 * sampling_ratio]
    # check the lengths and sampled values are as we expect
    assert len(new_X) == sum(num_samples)
    assert len(new_y) == sum(num_samples)
    value_counts = new_y.to_series().value_counts()
    assert value_counts.values[1] == value_counts.values[2]
    np.testing.assert_equal(value_counts.values, np.array(num_samples))

    transform_X, transform_y = oversampler.transform(X2, y)

    # Reduce the inputs to plain arrays for comparison with transform's output.
    if data_type == "ww":
        X = X.to_dataframe().values
        y = y.to_series().values
    elif data_type == "pd":
        X = X.values
        y = y.values

    np.testing.assert_equal(X, transform_X.to_dataframe().values)
    np.testing.assert_equal(y, transform_y.to_series().values)
示例#24
0
 def predict_proba(self, X):
     """Return the wrapped estimator's ``predict_proba`` output for ``X``.

     ``X`` is converted to a float32 pandas frame and then to a cudf
     DataFrame before being handed to the wrapped estimator. The result is
     converted back to pandas and passed through ``infer_feature_types``.
     """
     estimator_predict_proba = self._component_obj.predict_proba
     features_f32 = X.to_dataframe().astype('float32')
     cudf_features = cudf.DataFrame.from_pandas(features_f32)
     cudf_probabilities = estimator_predict_proba(cudf_features)
     return infer_feature_types(cudf_probabilities.to_pandas())
示例#25
0
 def feature_importance(self):
     """Return the wrapped estimator's ``feature_importances_``.

     The values are converted to pandas and passed through
     ``infer_feature_types`` before being returned.
     """
     importances = self._component_obj.feature_importances_
     return infer_feature_types(importances.to_pandas())
示例#26
0
    def validate(self, X, y):
        """Checks if the target data contains missing or invalid values.

        The target is checked, in order, for: being None, an unsupported logical type,
        null values, the wrong number of classes for binary problems, a non-numeric
        target for regression problems, under-populated / too few / too many classes
        for multiclass problems, and non-positive values when the objective only
        supports positive targets. If X is given, its length and index are also
        compared against the target's.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Only used to compare length and index against the target; these comparisons are skipped if None.
            y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

        Returns:
            dict: Dictionary with "warnings", "errors" and "actions" keys, each holding a list of dicts describing what was found in the target data.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
            >>> y = pd.Series([0, 1, None, None])
            >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
            >>> assert target_check.validate(X, y) == {"errors": [{"message": "2 row(s) (50.0%) of target values are null",\
                                                                   "data_check_name": "InvalidTargetDataCheck",\
                                                                   "level": "error",\
                                                                   "code": "TARGET_HAS_NULL",\
                                                                   "details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
                                                       "warnings": [],\
                                                       "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        # Nothing else can be checked without a target.
        if y is None:
            results["errors"].append(
                DataCheckError(
                    message="Target is None",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_IS_NONE,
                    details={}).to_dict())
            return results

        y = infer_feature_types(y)
        is_supported_type = y.logical_type in numeric_and_boolean_ww + [
            ww.logical_types.Categorical
        ]
        # An unsupported type is reported but does not stop the remaining checks.
        if not is_supported_type:
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target is unsupported {} type. Valid Woodwork logical types include: {}"
                    .format(
                        y.logical_type, ", ".join([
                            ltype.type_string
                            for ltype in numeric_and_boolean_ww
                        ])),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                    details={
                        "unsupported_type": y.logical_type.type_string
                    }).to_dict())

        y_df = _convert_woodwork_types_wrapper(y.to_series())
        null_rows = y_df.isnull()
        # `.all()` is also True for an empty target, so this exit covers both cases
        # and protects the later checks from an empty value_counts / zero length.
        if null_rows.all():
            results["errors"].append(
                DataCheckError(message="Target is either empty or fully null.",
                               data_check_name=self.name,
                               message_code=DataCheckMessageCode.
                               TARGET_IS_EMPTY_OR_FULLY_NULL,
                               details={}).to_dict())
            return results
        if null_rows.any():
            num_null_rows = null_rows.sum()
            pct_null_rows = null_rows.mean() * 100
            results["errors"].append(
                DataCheckError(
                    message="{} row(s) ({}%) of target values are null".format(
                        num_null_rows, pct_null_rows),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                    details={
                        "num_null_rows": num_null_rows,
                        "pct_null_rows": pct_null_rows
                    }).to_dict())
            impute_strategy = "mean" if is_regression(
                self.problem_type) else "most_frequent"
            results["actions"].append(
                DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                metadata={
                                    "column": None,
                                    "is_target": True,
                                    "impute_strategy": impute_strategy
                                }).to_dict())

        value_counts = y_df.value_counts()
        unique_values = value_counts.index.tolist()

        if is_binary(self.problem_type) and len(value_counts) != 2:
            if self.n_unique is None:
                details = {"target_values": unique_values}
            else:
                # Report at most `n_unique` of the offending values.
                details = {
                    "target_values":
                    unique_values[:min(self.n_unique, len(unique_values))]
                }
            results["errors"].append(
                DataCheckError(
                    message=
                    "Binary class targets require exactly two unique values.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                    details=details).to_dict())

        # Use the same `is_regression` predicate that selects the impute strategy
        # above, so every problem type it accepts also gets the numeric check.
        if is_regression(self.problem_type) and "numeric" not in y.semantic_tags:
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target data type should be numeric for regression type problems.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                    details={}).to_dict())

        if is_multiclass(self.problem_type):
            if value_counts.min() <= 1:
                least_populated = value_counts[value_counts <= 1]
                details = {
                    "least_populated_class_labels":
                    least_populated.index.tolist()
                }
                results["errors"].append(
                    DataCheckError(
                        message=
                        "Target does not have at least two instances per class which is required for multiclass classification",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                        details=details).to_dict())
            if len(unique_values) <= 2:
                details = {"num_classes": len(unique_values)}
                results["errors"].append(
                    DataCheckError(
                        message=
                        "Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                        details=details).to_dict())

            num_class_to_num_value_ratio = len(unique_values) / len(y)
            if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold:
                details = {
                    "class_to_value_ratio": num_class_to_num_value_ratio
                }
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Target has a large number of unique values, could be regression type problem.",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                        details=details).to_dict())

        if y.logical_type in [
                ww.logical_types.Integer, ww.logical_types.Double
        ]:
            # Flag only values that really are <= 0. Comparisons against NaN are
            # False, so nulls (already reported above) neither trigger this error
            # nor count as offending values.
            non_positive = y_df <= 0
            if non_positive.any() and self.objective.positive_only:
                details = {"Count of offending values": non_positive.sum()}
                results["errors"].append(
                    DataCheckError(
                        message=
                        f"Target has non-positive values which is not supported for {self.objective.name}",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_INCOMPATIBLE_OBJECTIVE,
                        details=details).to_dict())

        if X is not None:
            X = infer_feature_types(X)
            X_index = list(X.to_dataframe().index)
            y_index = list(y_df.index)
            X_length = len(X_index)
            y_length = len(y_index)
            if X_length != y_length:
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Input target and features have different lengths",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.MISMATCHED_LENGTHS,
                        details={
                            "features_length": X_length,
                            "target_length": y_length
                        }).to_dict())

            if X_index != y_index:
                if set(X_index) == set(y_index):
                    # Same labels on both sides, so only the ordering differs.
                    results["warnings"].append(
                        DataCheckWarning(
                            message=
                            "Input target and features have mismatched indices order",
                            data_check_name=self.name,
                            message_code=DataCheckMessageCode.
                            MISMATCHED_INDICES_ORDER,
                            details={}).to_dict())
                else:
                    # Report at most ten labels missing from each side.
                    index_diff_not_in_X = list(set(y_index) -
                                               set(X_index))[:10]
                    index_diff_not_in_y = list(set(X_index) -
                                               set(y_index))[:10]
                    results["warnings"].append(
                        DataCheckWarning(
                            message=
                            "Input target and features have mismatched indices",
                            data_check_name=self.name,
                            message_code=DataCheckMessageCode.
                            MISMATCHED_INDICES,
                            details={
                                "indices_not_in_features": index_diff_not_in_X,
                                "indices_not_in_target": index_diff_not_in_y
                            }).to_dict())

        return results
示例#27
0
 def _get_categorical(self, X):
     """Record the positions of the columns whose logical type is 'Categorical'.

     The positions are stored on ``self.categorical_features`` and mirrored
     into ``self._parameters['categorical_features']``.
     """
     X = infer_feature_types(X)
     categorical_positions = []
     for position, entry in enumerate(X.types['Logical Type'].items()):
         # entry[1] is the logical type; it is matched by its string form.
         if str(entry[1]) == 'Categorical':
             categorical_positions.append(position)
     self.categorical_features = categorical_positions
     self._parameters['categorical_features'] = self.categorical_features
示例#28
0
def test_smotenc_categorical_features(X_y_binary):
    """Check that SMOTENCSampler records which columns are categorical after fitting."""
    X, y = X_y_binary
    # Tag the first two columns as Categorical before handing the data to the sampler.
    X_ww = infer_feature_types(X, feature_types={0: 'Categorical', 1: 'Categorical'})
    snc = SMOTENCSampler()
    X_out, y_out = snc.fit_transform(X_ww, y)
    # The sampler is expected to expose the positions of the tagged columns.
    assert snc.categorical_features == [0, 1]