Exemplo n.º 1
0
    def transform(self, X, y=None):
        """Impute missing values in X. 'None' values are converted to np.nan before imputation and are
            treated as the same.

        Columns listed in ``self._all_null_cols`` are dropped first. Numeric and categorical
        columns are then imputed separately by their dedicated imputers.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        frame.drop(self._all_null_cols, axis=1, errors='ignore', inplace=True)

        # Nothing to impute when no data is left after dropping the all-null columns
        if not frame.empty:
            has_numeric = self._numeric_cols is not None and len(self._numeric_cols) > 0
            if has_numeric:
                numeric_part = frame[self._numeric_cols]
                numeric_imputed = self._numeric_imputer.transform(numeric_part)
                frame[numeric_part.columns] = numeric_imputed.to_dataframe()

            has_categorical = self._categorical_cols is not None and len(self._categorical_cols) > 0
            if has_categorical:
                categorical_part = frame[self._categorical_cols]
                categorical_imputed = self._categorical_imputer.transform(categorical_part)
                frame[categorical_part.columns] = categorical_imputed.to_dataframe()

        return _retain_custom_types_and_initalize_woodwork(X_ww, frame)
Exemplo n.º 2
0
    def transform(self, X, y):
        """Impute missing values in the target. 'None' and np.nan values are treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Features. Not imputed; only run through type inference when not None.
            y (ww.DataColumn, pd.Series): Target data to impute.

        Returns:
            (ww.DataTable, ww.DataColumn): The original X, transformed y (None when y is None)

        Raises:
            RuntimeError: If the underlying imputer returns data with zero columns.
        """
        X = infer_feature_types(X) if X is not None else X
        if y is None:
            return X, None

        y_ww = infer_feature_types(y)
        target = _convert_woodwork_types_wrapper(y_ww.to_series())
        target_frame = target.to_frame()

        # bool dtype doesn't support nans and sklearn errors if all cols are bool,
        # so an all-bool target is passed through without imputation
        if (target_frame.dtypes == bool).all():
            imputed_target = target
        else:
            imputed = self._component_obj.transform(target_frame)
            if imputed.shape[1] == 0:
                raise RuntimeError("Transformed data is empty")
            imputed_target = pd.Series(imputed[:, 0], index=target.index)
        return X, _retain_custom_types_and_initalize_woodwork(y_ww, imputed_target)
Exemplo n.º 3
0
    def transform(self, X, y=None):
        """Impute missing values in X. 'None' and np.nan values are treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

        # bool dtype doesn't support nans and sklearn errors if all cols are bool,
        # so an all-bool input is returned without imputation
        all_bool = (X.dtypes == bool).all()
        if all_bool:
            return infer_feature_types(X)

        # Used only for its column labels and index; the component itself sees the full X
        kept = X.drop(self._all_null_cols, axis=1, errors='ignore')
        imputed = self._component_obj.transform(X)
        result = pd.DataFrame(imputed, columns=kept.columns)
        if kept.empty:
            return infer_feature_types(result)

        result.index = kept.index
        return _retain_custom_types_and_initalize_woodwork(X_ww, result)
Exemplo n.º 4
0
Arquivo: lsa.py Projeto: sujala/evalml
    def transform(self, X, y=None):
        """Apply the fitted LSA pipeline to every text column of X.

        Also records, in ``self._provenance``, which output columns were derived from each text column.

        Arguments:
            X (ww.DataTable, pd.DataFrame): The data to transform.
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X. The original column is removed and replaced with two columns of the
                          format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1.
        """
        X_ww = infer_feature_types(X)
        if len(self._text_columns) == 0:
            # No text columns: hand back the type-inferred input untouched
            return X_ww

        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        output = X.copy()
        provenance = {}
        for text_col in self._text_columns:
            components = self._lsa_pipeline.transform(X[text_col])
            new_names = ['LSA({})[{}]'.format(text_col, i) for i in range(2)]
            for position, new_name in enumerate(new_names):
                output[new_name] = pd.Series(components[:, position], index=X.index)
            provenance[text_col] = new_names
        self._provenance = provenance

        without_text = output.drop(columns=self._text_columns)
        return _retain_custom_types_and_initalize_woodwork(X_ww, without_text)
Exemplo n.º 5
0
    def transform(self, X, y=None):
        """One-hot encode the input data.

        Columns not listed in ``self.features_to_encode`` are carried over unchanged and placed before the
        encoded columns. ``self._feature_names`` is updated when at least one feature is encoded.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Features to one-hot encode.
            y (ww.DataColumn, pd.Series): Ignored.

        Returns:
            ww.DataTable: Transformed data, where each categorical feature has been encoded into numerical columns using one-hot encoding.
        """
        X_ww = infer_feature_types(X)
        data = self._handle_parameter_handle_missing(
            _convert_woodwork_types_wrapper(X_ww.to_dataframe()))

        # Start with the columns that are not being encoded, untouched
        result = pd.DataFrame()
        for name in data.columns:
            if name in self.features_to_encode:
                continue
            result = pd.concat([result, data[name]], axis=1)
        # pd.concat above changes the type of the index, so restore the original one manually
        if not result.empty:
            result.index = data.index

        # Run sklearn's encoder on the categorical columns and append its output
        if len(self.features_to_encode) > 0:
            dense = self._encoder.transform(data[self.features_to_encode]).toarray()
            one_hot = pd.DataFrame(dense, index=data.index)
            one_hot.columns = self._get_feature_names()
            result = pd.concat([result, one_hot], axis=1)
            result = result.drop(columns=self._features_to_drop)
            self._feature_names = result.columns

        return _retain_custom_types_and_initalize_woodwork(X_ww, result)
Exemplo n.º 6
0
 def transform(self, X, y=None):
     """Transform X with the underlying component object, reusing X's column labels and index.

     Arguments:
         X (ww.DataTable, pd.DataFrame): Data to transform.
         y (ww.DataColumn, pd.Series, optional): Ignored.

     Returns:
         Transformed X as produced by `_retain_custom_types_and_initalize_woodwork`, called with
         the Integer and Categorical logical types passed as `ltypes_to_ignore`.
     """
     X_ww = infer_feature_types(X)
     X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
     transformed = pd.DataFrame(self._component_obj.transform(X),
                                columns=X.columns,
                                index=X.index)
     return _retain_custom_types_and_initalize_woodwork(
         X_ww, transformed, ltypes_to_ignore=[Integer, Categorical])
Exemplo n.º 7
0
    def transform(self, X, y=None):
        """Replace the text columns of X with features computed from them.

        The output holds the non-text columns, then the featuretools feature matrix (nulls filled with 0),
        then the LSA features, in that order.

        Arguments:
            X (ww.DataTable, pd.DataFrame): The data to transform.
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        if self._features is None or len(self._features) == 0:
            # No features available to compute: return the type-inferred input as is
            return X_ww

        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        entity_set = self._make_entity_set(X, self._text_columns)
        primitives = ft.calculate_feature_matrix(features=self._features,
                                                 entityset=entity_set)
        if primitives.isnull().any().any():
            primitives.fillna(0, inplace=True)

        lsa_features = self._lsa.transform(X[self._text_columns]).to_dataframe()
        # Align the feature matrix with X's index so the column-wise concat lines up
        primitives.set_index(X.index, inplace=True)
        non_text = X.drop(self._text_columns, axis=1)
        combined = pd.concat([non_text, primitives, lsa_features], axis=1)
        return _retain_custom_types_and_initalize_woodwork(X_ww, combined)
Exemplo n.º 8
0
    def transform(self, X, y=None):
        """Create new features from the DateTime columns of X, then drop those DateTime columns.

        Each new column is named `<column>_<feature>`. Any categories returned by a feature extractor are
        stored in ``self._categories`` under that name.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        requested = self.parameters["features_to_extract"]
        if len(requested) == 0:
            # Nothing requested: the DateTime columns are left in place
            return infer_feature_types(frame)

        for col_name in self._date_time_col_names:
            for feature in requested:
                extractor = self._function_mappings[feature]
                values, categories = extractor(frame[col_name],
                                               self.encode_as_categories)
                name = f"{col_name}_{feature}"
                frame[name] = values
                if categories:
                    self._categories[name] = categories

        frame = frame.drop(self._date_time_col_names, axis=1)
        return _retain_custom_types_and_initalize_woodwork(X_ww, frame)
Exemplo n.º 9
0
 def transform(self, X, y=None):
     """Transform all-numeric X with the underlying component and label the outputs `component_<i>`.

     Arguments:
         X (ww.DataTable, pd.DataFrame): Data to transform. Must be all numeric.
         y (ww.DataColumn, pd.Series, optional): Ignored.

     Returns:
         Transformed X as produced by `_retain_custom_types_and_initalize_woodwork`, with one
         column per output column of the component and X's original index.

     Raises:
         ValueError: If X is not all numeric.
     """
     X_ww = infer_feature_types(X)
     if not is_all_numeric(X_ww):
         raise ValueError("LDA input must be all numeric")

     X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
     projected = self._component_obj.transform(X)
     component_names = [f"component_{i}" for i in range(projected.shape[1])]
     result = pd.DataFrame(projected, index=X.index, columns=component_names)
     return _retain_custom_types_and_initalize_woodwork(X_ww, result)
    def transform(self, X, y=None):
        """Computes the delayed features for all features in X and y.

        When feature delaying is enabled and X has rows, every column of X gets one extra column per
        delay in the (inclusive) range [1, max_delay], named `<column>_delay_<t>`. A delayed column is the
        original column shifted forward by `t` rows, so the value at row n comes from row n - t.
        Categorical columns are encoded before being shifted.

        When target delaying is enabled and y is not None, columns named `target_delay_<t>` are added for
        each delay in the (inclusive) range [start_delay_for_target, max_delay].

        Arguments:
            X (ww.DataTable, pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
            y (ww.DataColumn, pd.Series, or None): Target.

        Returns:
            ww.DataTable: Transformed X.
        """
        # Normalize the data into pandas objects; a missing X becomes an empty frame
        X_ww = infer_feature_types(pd.DataFrame() if X is None else X)
        categorical_columns = self._get_categorical_columns(X_ww)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

        if self.delay_features and len(X) > 0:
            encoded = self._encode_X_while_preserving_index(
                X[categorical_columns])
            # Iterates over the columns present before any delayed column is added
            for col_name in X:
                if col_name in categorical_columns:
                    source = encoded[col_name]
                else:
                    source = X[col_name]
                delayed = {}
                for t in range(1, self.max_delay + 1):
                    delayed[f"{col_name}_delay_{t}"] = source.shift(t)
                X = X.assign(**delayed)

        # Handle cases where the target was passed in
        if self.delay_target and y is not None:
            y = infer_feature_types(y)
            if y.logical_type == logical_types.Categorical:
                y = self._encode_y_while_preserving_index(y)
            else:
                y = _convert_woodwork_types_wrapper(y.to_series())
            target_delays = {}
            for t in range(self.start_delay_for_target, self.max_delay + 1):
                target_delays[f"target_delay_{t}"] = y.shift(t)
            X = X.assign(**target_delays)

        return _retain_custom_types_and_initalize_woodwork(X_ww, X)
Exemplo n.º 11
0
    def transform(self, X, y=None):
        """Compute the feature matrix for X from the stored featuretools feature definitions.

        Column labels are converted to strings before the entity set is built.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data to transform. Has shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Feature matrix
        """
        X_ww = infer_feature_types(X)
        frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        frame.columns = frame.columns.astype(str)
        entity_set = self._make_entity_set(frame)
        matrix = calculate_feature_matrix(features=self.features,
                                          entityset=entity_set)
        return _retain_custom_types_and_initalize_woodwork(X_ww, matrix)
Exemplo n.º 12
0
    def transform(self, X, y=None):
        """Impute missing values column by column, using the imputer registered for each column.

        A column whose imputer returns an empty result is dropped from the output.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to transform.
            y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]. Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        result = X.copy()
        empty_columns = []
        for column, imputer in self.imputers.items():
            imputed = imputer.transform(X[[column]]).to_dataframe()
            if not imputed.empty:
                result[column] = imputed[column]
                continue
            empty_columns.append(column)
        result = result.drop(empty_columns, axis=1)
        return _retain_custom_types_and_initalize_woodwork(X_ww, result)
Exemplo n.º 13
0
    def fit_transform(self, X, y=None):
        """Fits on X and transforms X

        Uses the component object's own `fit_transform` when it is available. If that call raises
        AttributeError, falls back to `self.fit(X, y).transform(X, y)`.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to fit and transform
            y (ww.DataColumn, pd.Series, optional): Target data. May be None.

        Returns:
            ww.DataTable: Transformed X

        Raises:
            MethodPropertyNotFoundError: Propagated unchanged if raised by the fit/transform fallback.
        """
        X_ww = infer_feature_types(X)
        X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        # y is optional: bind y_pd up front so the call below never reads an unbound
        # local (which would surface as UnboundLocalError, not AttributeError).
        y_pd = None
        if y is not None:
            y_ww = infer_feature_types(y)
            y_pd = _convert_woodwork_types_wrapper(y_ww.to_series())
        try:
            X_t = self._component_obj.fit_transform(X_pd, y_pd)
            return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
        except AttributeError:
            # The component object has no usable fit_transform: fit, then transform.
            # Any error from the fallback (e.g. MethodPropertyNotFoundError) propagates as is.
            return self.fit(X, y).transform(X, y)
Exemplo n.º 14
0
    def transform(self, X, y=None):
        """Transform X with the underlying component object, reusing X's column labels and index.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform.
            y (ww.DataColumn, pd.Series, optional): Target data.

        Returns:
            ww.DataTable: Transformed X

        Raises:
            MethodPropertyNotFoundError: If calling the component object's transform raises AttributeError.
        """
        X_ww = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        if y is not None:
            y = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())

        try:
            transformed = self._component_obj.transform(X, y)
        except AttributeError:
            raise MethodPropertyNotFoundError(
                "Transformer requires a transform method or a component_obj that implements transform"
            )

        result = pd.DataFrame(transformed, columns=X.columns, index=X.index)
        return _retain_custom_types_and_initalize_woodwork(X_ww, result)
Exemplo n.º 15
0
    def transform(self, X, y=None):
        """Select features from X with the underlying component object.

        Records the input column names in ``self.input_feature_names``, and casts every selected column
        back to the dtype it had in the input.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform.
            y (ww.DataColumn, pd.Series, optional): Target data. Ignored.

        Returns:
            ww.DataTable: Transformed X

        Raises:
            MethodPropertyNotFoundError: If calling the component object's transform raises AttributeError.
        """
        X_ww = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        self.input_feature_names = list(X.columns.values)

        try:
            selected_values = self._component_obj.transform(X)
        except AttributeError:
            raise MethodPropertyNotFoundError("Feature selector requires a transform method or a component_obj that implements transform")

        original_dtypes = X.dtypes.to_dict()
        selected_names = self.get_names()
        dtype_by_column = {}
        for name in selected_names:
            dtype_by_column[name] = original_dtypes[name]

        features = pd.DataFrame(selected_values, columns=selected_names, index=X.index)
        features = features.astype(dtype_by_column)
        return _retain_custom_types_and_initalize_woodwork(X_ww, features)