Example #1
    def predict(self, X, y=None, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, np.ndarray, None): The target data of length [n_samples]
            objective (Object or string): The objective to use to make predictions

        Returns:
            pd.Series: Predicted values.
        """
        if X is None:
            X = pd.DataFrame()
        X = _convert_to_woodwork_structure(X)
        y = _convert_to_woodwork_structure(y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        features = self.compute_estimator_features(X, y)
        features_no_nan, y = drop_rows_with_nans(features, y)
        y_arg = None
        if self.estimator.predict_uses_y:
            y_arg = y
        predictions = self.estimator.predict(features_no_nan, y_arg)
        predictions = predictions.rename(self.input_target_name)
        return pad_with_nans(predictions,
                             max(0, features.shape[0] - predictions.shape[0]))
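A minimal, self-contained sketch of why the padding above is needed (not the library's helpers): delayed features leave NaN rows that are dropped before prediction, so the shorter prediction series is padded back, assumed here to be at the front, to realign with the original input length.

import numpy as np
import pandas as pd

features = pd.DataFrame({"x_delay_1": [np.nan, 1.0, 2.0, 3.0]})   # delaying creates leading NaNs
valid = features.dropna()                                         # rows the estimator can actually score
preds = pd.Series([10.0, 20.0, 30.0], index=valid.index)          # stand-in for estimator.predict output
n_missing = features.shape[0] - preds.shape[0]
padded = pd.concat([pd.Series([np.nan] * n_missing), preds]).reset_index(drop=True)
assert len(padded) == len(features)                               # output realigned with the input rows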
Example #2
    def score(self, X, y, objectives):
        """Evaluate model performance on current and additional objectives.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (pd.Series, ww.DataColumn): True labels of length [n_samples]
            objectives (list): Non-empty list of objectives to score on

        Returns:
            dict: Ordered dictionary of objective scores
        """
        # Only converting X for the call to _score_all_objectives
        if X is None:
            X = pd.DataFrame()
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_to_woodwork_structure(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        y_predicted = self.predict(X, y)
        y_shifted = y.shift(-self.gap)
        objectives = [
            get_objective(o, return_instance=True) for o in objectives
        ]
        y_shifted, y_predicted = drop_rows_with_nans(y_shifted, y_predicted)
        return self._score_all_objectives(X,
                                          y_shifted,
                                          y_predicted,
                                          y_pred_proba=None,
                                          objectives=objectives)
Example #3
    def score(self, X, y, objectives):
        """Evaluate model performance on current and additional objectives.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series): True labels of length [n_samples]
            objectives (list): Non-empty list of objectives to score on

        Returns:
            dict: Ordered dictionary of objective scores
        """
        X, y = self._convert_to_woodwork(X, y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        objectives = [get_objective(o, return_instance=True) for o in objectives]

        y_encoded = self._encode_targets(y)
        y_shifted = y_encoded.shift(-self.gap)
        y_predicted, y_predicted_proba = self._compute_predictions(X, y, objectives, time_series=True)
        if y_predicted is not None:
            y_predicted = _convert_woodwork_types_wrapper(y_predicted.to_series())
        if y_predicted_proba is not None:
            y_predicted_proba = _convert_woodwork_types_wrapper(y_predicted_proba.to_dataframe())
        y_shifted, y_predicted, y_predicted_proba = drop_rows_with_nans(y_shifted, y_predicted, y_predicted_proba)
        return self._score_all_objectives(X, y_shifted, y_predicted,
                                          y_pred_proba=y_predicted_proba,
                                          objectives=objectives)
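A hypothetical sketch of the gap alignment both score() implementations rely on: shifting the target by -gap pairs each prediction with the future value it forecasts, and trailing rows with no future target become NaN and are dropped before scoring (gap value assumed for illustration).

import pandas as pd

gap = 2                                             # assumed gap for this sketch
y = pd.Series([10, 11, 12, 13, 14])
y_shifted = y.shift(-gap)                           # [12, 13, 14, NaN, NaN]
y_predicted = pd.Series([12.1, 13.2, 13.9, 15.0, 16.0])
mask = y_shifted.notna()                            # drop_rows_with_nans analogue for this toy case
y_shifted, y_predicted = y_shifted[mask], y_predicted[mask]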
Example #4
    def validate(self, X, y):
        """Check if the target or any of the features have no variance (1 unique value).

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
            y (ww.DataColumn, pd.Series, np.ndarray): The target data.

        Returns:
            dict: dict of warnings/errors corresponding to features or target with no variance.
        """
        messages = {"warnings": [], "errors": []}

        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_to_woodwork_structure(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        unique_counts = X.nunique(dropna=self._dropnan).to_dict()
        any_nulls = (X.isnull().any()).to_dict()
        for name in unique_counts:
            message = self._check_for_errors(name, unique_counts[name],
                                             any_nulls[name])
            if not message:
                continue
            DataCheck._add_message(message, messages)
        y_name = getattr(y, "name")
        if not y_name:
            y_name = "Y"
        target_message = self._check_for_errors(
            y_name, y.nunique(dropna=self._dropnan),
            y.isnull().any())
        if target_message:
            DataCheck._add_message(target_message, messages)
        return messages
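A minimal, pandas-only illustration of the idea behind this check (hypothetical data): a column with at most one unique non-null value has no variance.

import pandas as pd

X = pd.DataFrame({"constant": [1, 1, 1], "varied": [1, 2, 3]})
no_variance_cols = [col for col in X.columns if X[col].nunique(dropna=True) <= 1]
assert no_variance_cols == ["constant"]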
Example #5
    def fit_transform(self, X, y=None):
        """Fits on X and transforms X

        Arguments:
            X (pd.DataFrame): Data to fit and transform
            y (pd. DataFrame): Target data
        Returns:
            pd.DataFrame: Transformed X
        """
        try:
            X2 = _convert_to_woodwork_structure(X)
            y2 = _convert_to_woodwork_structure(y)
            X2 = _convert_woodwork_types_wrapper(X2.to_dataframe())
            y2 = _convert_woodwork_types_wrapper(y2.to_series())
            X_t = self._component_obj.fit_transform(X2, y2)
        except AttributeError:
            try:
                self.fit(X, y)
                X_t = self.transform(X, y)
            except MethodPropertyNotFoundError as e:
                raise e

        if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame):
            return pd.DataFrame(X_t, columns=X.columns, index=X.index)
        return pd.DataFrame(X_t)
Example #6
 def _manage_woodwork(self, X, y=None):
     """Function to convert the input and target data to Pandas data structures."""
     X = _convert_to_woodwork_structure(X)
     X = _convert_woodwork_types_wrapper(X.to_dataframe())
     if y is not None:
         y = _convert_to_woodwork_structure(y)
         y = _convert_woodwork_types_wrapper(y.to_series())
     return X, y
Example #7
 def _calculate_pearson(self, X, y):
     highly_corr_cols = []
     X_num = X.select(include=numeric_and_boolean_ww)
     if y.logical_type not in numeric_and_boolean_ww or len(X_num.columns) == 0:
         return highly_corr_cols
     X_num = _convert_woodwork_types_wrapper(X_num.to_dataframe())
     y = _convert_woodwork_types_wrapper(y.to_series())
     highly_corr_cols = [label for label, col in X_num.iteritems() if abs(y.corr(col)) >= self.pct_corr_threshold]
     return highly_corr_cols
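A hypothetical, pandas-only sketch of the Pearson test in _calculate_pearson: a numeric column is flagged when its absolute correlation with the target meets pct_corr_threshold (toy data and threshold assumed).

import pandas as pd

pct_corr_threshold = 0.95                                  # assumed threshold for this sketch
y = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
X_num = pd.DataFrame({"leak": [2.0, 4.0, 6.0, 8.0, 10.0],  # perfectly correlated with y
                      "noise": [3.0, 1.0, 4.0, 1.0, 5.0]})
flagged = [label for label, col in X_num.items() if abs(y.corr(col)) >= pct_corr_threshold]
assert flagged == ["leak"]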
Example #8
    def fit(self, X, y=None):
        X = _convert_to_woodwork_structure(X)
        cat_cols = list(X.select('category').columns)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        y = _convert_to_woodwork_structure(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        self._component_obj.fit(X, y, silent=True, cat_features=cat_cols)
        return self
Example #9
    def transform(self, X, y=None):
        """Computes the delayed features for all features in X and y.

        For each feature in X, it will add a column to the output dataframe for each
        delay in the (inclusive) range [1, max_delay]. The values of each delayed feature are simply the original
        feature shifted forward in time by the delay amount. For example, a delay of 3 units means that the feature
        value at row n will be taken from row n-3 of that feature.

        If y is not None, it will also compute the delayed values for the target variable.

        Arguments:
            X (ww.DataTable, pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
            y (ww.DataColumn, pd.Series, or None): Target.

        Returns:
            ww.DataTable: Transformed X.
        """
        if X is None:
            X = pd.DataFrame()
        # Normalize the data into pandas objects
        X = _convert_to_woodwork_structure(X)

        categorical_columns = self._get_categorical_columns(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        if self.delay_features and len(X) > 0:
            X_categorical = self._encode_X_while_preserving_index(
                X[categorical_columns])
            for col_name in X:
                col = X[col_name]
                if col_name in categorical_columns:
                    col = X_categorical[col_name]
                X = X.assign(
                    **{
                        f"{col_name}_delay_{t}": col.shift(t)
                        for t in range(1, self.max_delay + 1)
                    })

        # Handle cases where the target was passed in
        if self.delay_target and y is not None:
            y = _convert_to_woodwork_structure(y)
            if y.logical_type == logical_types.Categorical:
                y = self._encode_y_while_preserving_index(y)
            else:
                y = _convert_woodwork_types_wrapper(y.to_series())
            X = X.assign(
                **{
                    f"target_delay_{t}": y.shift(t)
                    for t in range(self.start_delay_for_target,
                                   self.max_delay + 1)
                })

        return _convert_to_woodwork_structure(X)
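A minimal sketch of the delay computation the docstring describes (toy values assumed): each delayed column is the original column shifted forward by t rows, so row n holds the value from row n - t.

import pandas as pd

max_delay = 2                                              # assumed for this sketch
X = pd.DataFrame({"feature": [1, 2, 3, 4]})
X = X.assign(**{f"feature_delay_{t}": X["feature"].shift(t) for t in range(1, max_delay + 1)})
# feature_delay_1 == [NaN, 1, 2, 3]; feature_delay_2 == [NaN, NaN, 1, 2]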
Example #10
    def fit(self, X, y=None):
        X = _convert_to_woodwork_structure(X)
        cat_cols = list(X.select('category').columns)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_to_woodwork_structure(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        # For binary classification, catboost expects numeric target values, so encode the target before fitting.
        if y.nunique() <= 2:
            self._label_encoder = LabelEncoder()
            y = pd.Series(self._label_encoder.fit_transform(y))
        self._component_obj.fit(X, y, silent=True, cat_features=cat_cols)
        return self
Example #11
    def fit_transform(self, X, y=None):
        X = _convert_to_woodwork_structure(X)
        if not is_all_numeric(X):
            raise ValueError("LDA input must be all numeric")
        y = _convert_to_woodwork_structure(y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())

        X_t = self._component_obj.fit_transform(X, y)
        return pd.DataFrame(
            X_t,
            index=X.index,
            columns=[f"component_{i}" for i in range(X_t.shape[1])])
Example #12
    def validate(self, X, y):
        """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation.

        If `method='mutual'`, all target and feature types are supported. Otherwise, if `method='pearson'`, only binary targets with numeric and boolean dtypes are supported.
        Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1].

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
            y (ww.DataColumn, pd.Series, np.ndarray): The target data

        Returns:
            dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame({
            ...    'leak': [10, 42, 31, 51, 61],
            ...    'x': [42, 54, 12, 64, 12],
            ...    'y': [13, 5, 13, 74, 24],
            ... })
            >>> y = pd.Series([10, 42, 31, 51, 40])
            >>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.95)
            >>> assert target_leakage_check.validate(X, y) == {"warnings": [{"message": "Column 'leak' is 95.0% or more correlated with the target",\
                                                                             "data_check_name": "TargetLeakageDataCheck",\
                                                                             "level": "warning",\
                                                                             "code": "TARGET_LEAKAGE",\
                                                                             "details": {"column": "leak"}}],\
                                                               "errors": []}
        """
        messages = {
            "warnings": [],
            "errors": []
        }
        X = _convert_to_woodwork_structure(X)
        y = _convert_to_woodwork_structure(y)

        if self.method == 'pearson':
            highly_corr_cols = self._calculate_pearson(X, y)
        else:
            X = _convert_woodwork_types_wrapper(X.to_dataframe())
            y = _convert_woodwork_types_wrapper(y.to_series())
            highly_corr_cols = self._calculate_mutual_information(X, y)

        warning_msg = "Column '{}' is {}% or more correlated with the target"
        messages["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_corr_threshold * 100),
                                                      data_check_name=self.name,
                                                      message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                                      details={"column": col_name}).to_dict()
                                     for col_name in highly_corr_cols])
        return messages
Example #13
 def fit(self, X, y=None):
     X_encoded = self._encode_categories(X, fit=True)
     if y is not None:
         y = _convert_to_woodwork_structure(y)
         y = _convert_woodwork_types_wrapper(y.to_series())
     self._component_obj.fit(X_encoded, y)
     return self
Example #14
    def transform(self, X, y=None):
        """One-hot encode the input data.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Features to one-hot encode.
            y (ww.DataColumn, pd.Series): Ignored.

        Returns:
            ww.DataTable: Transformed data, where each categorical feature has been encoded into numerical columns using one-hot encoding.
        """
        X_copy = _convert_to_woodwork_structure(X)
        X_copy = _convert_woodwork_types_wrapper(X_copy.to_dataframe())
        X_copy = self._handle_parameter_handle_missing(X_copy)

        X_t = pd.DataFrame()
        # Add the non-categorical columns, untouched
        for col in X_copy.columns:
            if col not in self.features_to_encode:
                X_t = pd.concat([X_t, X_copy[col]], axis=1)
        # The call to pd.concat above changes the type of the index so we will manually keep it the same.
        if not X_t.empty:
            X_t.index = X_copy.index

        # Call sklearn's transform on the categorical columns
        if len(self.features_to_encode) > 0:
            X_cat = pd.DataFrame(self._encoder.transform(X_copy[self.features_to_encode]).toarray(), index=X_copy.index)
            X_cat.columns = self.get_feature_names()
            X_t = pd.concat([X_t, X_cat], axis=1)

        return _convert_to_woodwork_structure(X_t)
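A hypothetical sketch of the output layout above; the component itself uses a fitted sklearn encoder, while pd.get_dummies is used here only to keep the illustration self-contained.

import pandas as pd

X = pd.DataFrame({"numeric": [1, 2, 3], "color": ["red", "blue", "red"]})
features_to_encode = ["color"]                      # assumed for this sketch
passthrough = X.drop(columns=features_to_encode)    # non-categorical columns, untouched
encoded = pd.get_dummies(X[features_to_encode])     # one column per category value
X_t = pd.concat([passthrough, encoded], axis=1)     # columns: numeric, color_blue, color_red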
Example #15
    def transform(self, X, y=None):
        """Transforms data X by creating new features using existing text columns

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X = _convert_to_woodwork_structure(X)
        if self._features is None or len(self._features) == 0:
            return X
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        text_columns = self._get_text_columns(X)
        es = self._make_entity_set(X, text_columns)
        X_nlp_primitives = ft.calculate_feature_matrix(features=self._features,
                                                       entityset=es)
        if X_nlp_primitives.isnull().any().any():
            X_nlp_primitives.fillna(0, inplace=True)
        X_lsa = self._lsa.transform(X[text_columns]).to_dataframe()
        X_nlp_primitives.set_index(X.index, inplace=True)
        X_t = pd.concat(
            [X.drop(text_columns, axis=1), X_nlp_primitives, X_lsa], axis=1)
        return _convert_to_woodwork_structure(X_t)
Example #16
    def transform(self, X, y=None):
        """Transforms data X by creating new features using existing DateTime columns, and then dropping those DateTime columns

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        X_t = X
        features_to_extract = self.parameters["features_to_extract"]
        if len(features_to_extract) == 0:
            return _convert_to_woodwork_structure(X_t)
        for col_name in self._date_time_col_names:
            for feature in features_to_extract:
                name = f"{col_name}_{feature}"
                features, categories = self._function_mappings[feature](
                    X_t[col_name], self.encode_as_categories)
                X_t[name] = features
                if categories:
                    self._categories[name] = categories
        X_t = X_t.drop(self._date_time_col_names, axis=1)
        return _convert_to_woodwork_structure(X_t)
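A minimal illustration of the extraction step (feature names assumed for this sketch): new columns are derived from each DateTime column via the pandas .dt accessor, then the original DateTime column is dropped.

import pandas as pd

X_t = pd.DataFrame({"signup": pd.to_datetime(["2020-01-05", "2020-02-10"])})
X_t["signup_year"] = X_t["signup"].dt.year
X_t["signup_month"] = X_t["signup"].dt.month_name()
X_t["signup_day_of_week"] = X_t["signup"].dt.day_name()
X_t = X_t.drop(["signup"], axis=1)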
Example #17
    def transform(self, X, y=None):
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        self._check_input_for_columns(X)

        cols = self.parameters.get("columns") or []
        return self._modify_columns(cols, X, y)
Example #18
    def transform(self, X, y=None):
        """Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are
            treated as the same.

        Arguments:
            X (pd.DataFrame): Data to transform
            y (pd.Series, optional): Ignored.

        Returns:
            pd.DataFrame: Transformed X
        """
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        X_null_dropped = X.copy()
        X_null_dropped.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore')
        if X_null_dropped.empty:
            return X_null_dropped

        if self._numeric_cols is not None and len(self._numeric_cols) > 0:
            X_numeric = X_null_dropped[self._numeric_cols]
            imputed = self._numeric_imputer.transform(X_numeric)
            imputed.index = X_null_dropped.index
            X_null_dropped[X_numeric.columns] = imputed

        if self._categorical_cols is not None and len(self._categorical_cols) > 0:
            X_categorical = X_null_dropped[self._categorical_cols]
            imputed = self._categorical_imputer.transform(X_categorical)
            imputed.index = X_null_dropped.index
            X_null_dropped[X_categorical.columns] = imputed

        return X_null_dropped
Example #19
    def transform(self, X, y=None):
        """Transforms input data by selecting features. If the component_obj does not have a transform method, will raise an MethodPropertyNotFoundError exception.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform.
            y (ww.DataColumn, pd.Series, optional): Target data. Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        self.input_feature_names = list(X.columns.values)

        try:
            X_t = self._component_obj.transform(X)
        except AttributeError:
            raise MethodPropertyNotFoundError(
                "Feature selector requires a transform method or a component_obj that implements transform"
            )

        X_dtypes = X.dtypes.to_dict()
        selected_col_names = self.get_names()
        col_types = {key: X_dtypes[key] for key in selected_col_names}
        features = pd.DataFrame(X_t, columns=selected_col_names,
                                index=X.index).astype(col_types)
        return _convert_to_woodwork_structure(features)
Example #20
    def transform(self, X, y=None):
        """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
        if (X.dtypes == bool).all():
            return _convert_to_woodwork_structure(X)

        X_null_dropped = X.copy()
        X_null_dropped.drop(self._all_null_cols, axis=1, errors='ignore', inplace=True)
        X_t = self._component_obj.transform(X)
        if X_null_dropped.empty:
            X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns)
            return _convert_to_woodwork_structure(X_t)

        X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns)
        X_t = X_t.infer_objects()
        X_t.index = X_null_dropped.index
        return _convert_to_woodwork_structure(X_t)
Example #21
    def fit(self, X, y=None):
        """Fits imputers on input data

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit.
            y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]. Ignored.

        Returns:
            self
        """
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        self.imputers = dict()
        for column in X.columns:
            strategy_dict = self.impute_strategies.get(column, dict())
            strategy = strategy_dict.get('impute_strategy',
                                         self.default_impute_strategy)
            fill_value = strategy_dict.get('fill_value', None)
            self.imputers[column] = SimpleImputer(impute_strategy=strategy,
                                                  fill_value=fill_value)

        for column, imputer in self.imputers.items():
            imputer.fit(X[[column]])

        return self
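A hypothetical example of the impute_strategies structure this fit() reads, with a per-column 'impute_strategy' and optional 'fill_value' entry (column names and strategies assumed):

impute_strategies = {
    "age": {"impute_strategy": "median"},
    "city": {"impute_strategy": "constant", "fill_value": "unknown"},
    # columns not listed here fall back to default_impute_strategy
}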
Example #22
    def fit(self, X, y):
        X = _convert_to_woodwork_structure(X)
        if not is_all_numeric(X):
            raise ValueError("LDA input must be all numeric")
        y = _convert_to_woodwork_structure(y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        n_features = X.shape[1]
        n_classes = y.nunique()
        n_components = self.parameters['n_components']
        if n_components is not None and n_components > min(
                n_classes, n_features):
            raise ValueError(f"n_components value {n_components} is too large")

        self._component_obj.fit(X, y)
        return self
Example #23
    def fit(self, X, y=None):
        """Fits imputer to data. 'None' values are converted to np.nan before imputation and are
            treated as the same.

        Arguments:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series, optional): The target training data of length [n_samples]

        Returns:
            self
        """
        X = _convert_to_woodwork_structure(X)
        cat_cols = list(X.select('category').columns)
        numeric_cols = list(X.select('numeric').columns)

        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        self._all_null_cols = set(X.columns) - set(X.dropna(axis=1, how='all').columns)
        X_copy = X.copy()
        X_null_dropped = X_copy.drop(self._all_null_cols, axis=1, errors='ignore')

        X_numerics = X_null_dropped[[col for col in numeric_cols if col not in self._all_null_cols]]
        if len(X_numerics.columns) > 0:
            self._numeric_imputer.fit(X_numerics, y)
            self._numeric_cols = X_numerics.columns

        X_categorical = X_null_dropped[[col for col in cat_cols if col not in self._all_null_cols]]
        if len(X_categorical.columns) > 0:
            self._categorical_imputer.fit(X_categorical, y)
            self._categorical_cols = X_categorical.columns
        return self
Example #24
 def fit(self, X, y=None):
     X = _convert_to_woodwork_structure(X)
     if not is_all_numeric(X):
         raise ValueError("PCA input must be all numeric")
     X = _convert_woodwork_types_wrapper(X.to_dataframe())
     self._component_obj.fit(X)
     return self
Example #25
def test_convert_woodwork_types_wrapper_dataframe():
    X = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="Int64"),
                      "Int array": pd.array([1, 2, 3], dtype="Int64"),
                      "Int series with nan": pd.Series([1, 2, None], dtype="Int64"),
                      "Int array with nan": pd.array([1, 2, None], dtype="Int64"),
                      "string series": pd.Series(["a", "b", "a"], dtype="string"),
                      "string array": pd.array(["a", "b", "a"], dtype="string"),
                      "string series with nan": pd.Series(["a", "b", None], dtype="string"),
                      "string array with nan": pd.array(["a", "b", None], dtype="string"),
                      "boolean series": pd.Series([True, False, True], dtype="boolean"),
                      "boolean array": pd.array([True, False, True], dtype="boolean"),
                      "boolean series with nan": pd.Series([True, False, None], dtype="boolean"),
                      "boolean array with nan": pd.array([True, False, None], dtype="boolean")
                      })
    X_expected = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="int64"),
                               "Int array": pd.array([1, 2, 3], dtype="int64"),
                               "Int series with nan": pd.Series([1, 2, np.nan], dtype="float64"),
                               "Int array with nan": pd.array([1, 2, np.nan], dtype="float64"),
                               "string series": pd.Series(["a", "b", "a"], dtype="object"),
                               "string array": pd.array(["a", "b", "a"], dtype="object"),
                               "string series with nan": pd.Series(["a", "b", np.nan], dtype="object"),
                               "string array with nan": pd.array(["a", "b", np.nan], dtype="object"),
                               "boolean series": pd.Series([True, False, True], dtype="bool"),
                               "boolean array": pd.array([True, False, True], dtype="bool"),
                               "boolean series with nan": pd.Series([True, False, np.nan], dtype="object"),
                               "boolean array with nan": pd.array([True, False, np.nan], dtype="object")
                               })
    pd.testing.assert_frame_equal(X_expected, _convert_woodwork_types_wrapper(X))
Example #26
    def transform(self, X, y=None):
        """Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are
            treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            pd.DataFrame: Transformed X
        """
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        # Convert None to np.nan, since None cannot be properly handled
        X = X.fillna(value=np.nan)

        # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
        if (X.dtypes == bool).all():
            return X
        X_null_dropped = X.copy()
        X_null_dropped.drop(self._all_null_cols, axis=1, errors='ignore', inplace=True)
        category_cols = X_null_dropped.select_dtypes(include=['category']).columns
        X_t = self._component_obj.transform(X)
        if X_null_dropped.empty:
            return pd.DataFrame(X_t, columns=X_null_dropped.columns)
        X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns)
        if len(category_cols) > 0:
            X_t[category_cols] = X_t[category_cols].astype('category')
        return X_t
Example #27
    def validate(self, X, y=None):
        """Checks if there are any columns in the input that are too unique in the case of classification
        problems or not unique enough in the case of regression problems.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.  Defaults to None.

        Returns:
            dict: dict with a DataCheckWarning if there are any too unique or not
                unique enough columns.

        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...    'regression_unique_enough': [float(x) for x in range(100)],
            ...    'regression_not_unique_enough': [float(1) for x in range(100)]
            ... })
            >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
            >>> assert uniqueness_check.validate(df) == {"errors": [],\
                                                         "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",\
                                                                 "data_check_name": "UniquenessDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "NOT_UNIQUE_ENOUGH",\
                                                                 "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}]}
        """
        messages = {"warnings": [], "errors": []}

        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        res = X.apply(UniquenessDataCheck.uniqueness_score)

        if is_regression(self.problem_type):
            not_unique_enough_cols = list(res.index[res < self.threshold])
            messages["warnings"].extend([
                DataCheckWarning(
                    message=warning_not_unique_enough.format(
                        col_name, self.problem_type),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
                    details={
                        "column": col_name,
                        "uniqueness_score": res.loc[col_name]
                    }).to_dict() for col_name in not_unique_enough_cols
            ])
        elif is_multiclass(self.problem_type):
            too_unique_cols = list(res.index[res > self.threshold])
            messages["warnings"].extend([
                DataCheckWarning(message=warning_too_unique.format(
                    col_name, self.problem_type),
                                 data_check_name=self.name,
                                 message_code=DataCheckMessageCode.TOO_UNIQUE,
                                 details={
                                     "column": col_name,
                                     "uniqueness_score": res.loc[col_name]
                                 }).to_dict() for col_name in too_unique_cols
            ])
        return messages
Example #28
    def fit(self, X, y=None):
        if X is None:
            X = pd.DataFrame()
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        self._num_features = X.shape[1]
        return self
Example #29
 def _encode_labels(self, y):
     y_encoded = _convert_to_woodwork_structure(y)
     y_encoded = _convert_woodwork_types_wrapper(y_encoded.to_series())
     # change only if dtype isn't int
     if not is_integer_dtype(y_encoded):
         self._label_encoder = LabelEncoder()
         y_encoded = pd.Series(self._label_encoder.fit_transform(y_encoded), dtype='int64')
     return y_encoded
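A minimal sketch of the encoding branch above: non-integer labels are mapped to integer codes with sklearn's LabelEncoder.

import pandas as pd
from sklearn.preprocessing import LabelEncoder

y = pd.Series(["cat", "dog", "cat"])
y_encoded = pd.Series(LabelEncoder().fit_transform(y), dtype='int64')   # [0, 1, 0]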
Example #30
    def fit(self, X, y=None):
        if y is None:
            raise ValueError("Cannot fit Baseline classifier if y is None")
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_to_woodwork_structure(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        vals, counts = np.unique(y, return_counts=True)
        self._classes = list(vals)
        self._percentage_freq = counts.astype(float) / len(y)
        self._num_unique = len(self._classes)
        self._num_features = X.shape[1]

        if self.parameters["strategy"] == "mode":
            self._mode = y.mode()[0]
        return self
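A minimal illustration (toy labels assumed) of what this fit() stores for the "mode" strategy: the per-class frequencies and the most frequent class.

import numpy as np
import pandas as pd

y = pd.Series(["a", "b", "a", "a"])
vals, counts = np.unique(y, return_counts=True)     # ['a', 'b'], [3, 1]
percentage_freq = counts.astype(float) / len(y)     # [0.75, 0.25]
mode = y.mode()[0]                                  # 'a'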