Exemplo n.º 1
0
    def test_reverse_transform_single(self):
        # Setup
        ohet = OneHotEncodingTransformer()
        data = pd.Series(['a', 'a', 'a'])
        ohet.fit(data)

        # Run
        transformed = np.array([[1], [1], [1]])
        out = ohet.reverse_transform(transformed)

        # Assert
        expected = pd.Series(['a', 'a', 'a'])
        pd.testing.assert_series_equal(out, expected)
Exemplo n.º 2
0
    def _fit_discrete(self, column_name, raw_column_data):
        """Fit one hot encoder for discrete column."""
        ohe = OneHotEncodingTransformer()
        ohe.fit(raw_column_data)
        num_categories = len(ohe.dummies)

        return ColumnTransformInfo(
            column_name=column_name,
            column_type="discrete",
            transform=ohe,
            transform_aux=None,
            output_info=[SpanInfo(num_categories, 'softmax')],
            output_dimensions=num_categories)
Exemplo n.º 3
0
    def test__transform_nans_categorical(self):
        """Test the ``_transform`` method with nans.

        The values passed to ``_transform`` should be
        returned in a one-hot encoding representation using
        the categorical branch. Null values should be
        represented by the same encoding.

        Input:
        - Series with categorical values containing nans
        Output:
        - one-hot encoding of the input
        """
        # Setup
        ohet = OneHotEncodingTransformer()
        data = pd.Series([np.nan, None, 'a', 'b'])
        ohet.dummies = ['a', 'b']
        ohet.indexer = [0, 1]
        ohet.dummy_na = True
        ohet.num_dummies = 2
        ohet.dummy_encoded = True

        # Run
        out = ohet._transform(data)

        # Assert
        expected = np.array([[0, 0, 1], [0, 0, 1], [1, 0, 0], [0, 1, 0]])
        np.testing.assert_array_equal(out, expected)
Exemplo n.º 4
0
    def test__transform_zeros_categorical(self):
        """Test the ``_transform`` with unknown category.

        The values passed to ``_transform`` should be
        returned in a one-hot encoding representation
        using the categorical branch where it should
        be a column of zeros.

        Input:
        - Series with categorical and unknown values
        Output:
        - one-hot encoding of the input
        """
        # Setup
        ohet = OneHotEncodingTransformer()
        pd.Series(['a'])
        ohet.dummies = ['a']
        ohet.indexer = [0]
        ohet.num_dummies = 1
        ohet.dummy_encoded = True

        # Run
        out = ohet._transform(pd.Series(['b', 'b', 'b']))

        # Assert
        expected = np.array([[0], [0], [0]])
        np.testing.assert_array_equal(out, expected)
Exemplo n.º 5
0
    def test__transform_single_categorical(self):
        """Test the ``_transform`` with one category.

        The values passed to ``_transform`` should be
        returned in a one-hot encoding representation
        using the categorical branch where it should
        be a single column.

        Input:
        - Series with a single category
        Output:
        - one-hot encoding of the input
        """
        # Setup
        ohet = OneHotEncodingTransformer()
        data = pd.Series(['a', 'a', 'a'])
        ohet.dummies = ['a']
        ohet.indexer = [0]
        ohet.num_dummies = 1
        ohet.dummy_encoded = True

        # Run
        out = ohet._transform(data)

        # Assert
        expected = np.array([[1], [1], [1]])
        np.testing.assert_array_equal(out, expected)
Exemplo n.º 6
0
def test_single_category():
    ht = HyperTransformer(transformers={'a': OneHotEncodingTransformer()})
    data = pd.DataFrame({'a': ['a', 'a', 'a']})

    ht.fit(data)
    transformed = ht.transform(data)

    reverse = ht.reverse_transform(transformed)

    pd.testing.assert_frame_equal(data, reverse)
Exemplo n.º 7
0
    def test__transform_unknown_nan(self):
        """Test the ``_transform`` with unknown and nans.

        This is an edge case for ``_transform`` where
        unknowns should be zeros and nans should be
        the last entry in the column.

        Input:
        - Series with unknown and nans
        Output:
        - one-hot encoding of the input
        """
        # Setup
        ohet = OneHotEncodingTransformer()
        pd.Series(['a'])
        ohet.dummies = ['a']
        ohet.dummy_na = True
        ohet.num_dummies = 1

        # Run
        out = ohet._transform(pd.Series(['b', 'b', np.nan]))

        # Assert
        expected = np.array([[0, 0], [0, 0], [0, 1]])
        np.testing.assert_array_equal(out, expected)
Exemplo n.º 8
0
    def test_transform_nans(self):
        """Test the ``transform`` with nans.

        In this test ``transform`` should return an identity matrix
        representing each item in the input as well as nans.

        Input:
        - Series with categorical values and nans
        Output:
        - one-hot encoding of the input
        """
        # Setup
        ohet = OneHotEncodingTransformer()
        data = pd.Series(['a', 'b', None])
        ohet.fit(data)

        # Run
        out = ohet.transform(data)

        # Assert
        expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        np.testing.assert_array_equal(out, expected)
Exemplo n.º 9
0
    def _fit_discrete(self, data):
        """Fit one hot encoder for discrete column.

        Args:
            data (pd.DataFrame):
                A dataframe containing a column.

        Returns:
            namedtuple:
                A ``ColumnTransformInfo`` object.
        """
        column_name = data.columns[0]
        ohe = OneHotEncodingTransformer()
        ohe.fit(data, [column_name])
        num_categories = len(ohe.dummies)

        return ColumnTransformInfo(
            column_name=column_name,
            column_type='discrete',
            transform=ohe,
            output_info=[SpanInfo(num_categories, 'softmax')],
            output_dimensions=num_categories)
Exemplo n.º 10
0
    def test_transform_single(self):
        """Test the ``transform`` on a single category.

        In this test ``transform`` should return a column
        filled with ones.

        Input:
        - Series with a single categorical value
        Output:
        - one-hot encoding of the input
        """
        # Setup
        ohet = OneHotEncodingTransformer()
        data = pd.Series(['a', 'a', 'a'])
        ohet.fit(data)

        # Run
        out = ohet.transform(data)

        # Assert
        expected = np.array([[1], [1], [1]])
        np.testing.assert_array_equal(out, expected)
Exemplo n.º 11
0
    def test_fit_nans_numeric(self):
        """Test the ``fit`` method with nans.

        Check that the settings of the transformer
        are properly set based on the input. Encoding
        should be deactivated and NA activated.

        Input:
        - Series with containing nan values
        """

        # Setup
        ohet = OneHotEncodingTransformer()

        # Run
        data = pd.Series([1, 2, np.nan])
        ohet.fit(data)

        # Assert
        np.testing.assert_array_equal(ohet.dummies, [1, 2])
        np.testing.assert_array_equal(ohet.decoder, [1, 2, np.nan])
        assert not ohet.dummy_encoded
        assert ohet.dummy_na
Exemplo n.º 12
0
    def test_fit_no_nans(self):
        """Test the ``fit`` method without nans.

        Check that the settings of the transformer
        are properly set based on the input. Encoding
        should be activated

        Input:
        - Series with values
        """

        # Setup
        ohet = OneHotEncodingTransformer()

        # Run
        data = pd.Series(['a', 'b', 'c'])
        ohet.fit(data)

        # Assert
        np.testing.assert_array_equal(ohet.dummies, ['a', 'b', 'c'])
        np.testing.assert_array_equal(ohet.decoder, ['a', 'b', 'c'])
        assert ohet.dummy_encoded
        assert not ohet.dummy_na
Exemplo n.º 13
0
    def test__transform_no_nan(self):
        """Test the ``_transform`` method without nans.

        The values passed to ``_transform`` should be
        returned in a one-hot encoding representation.

        Input:
        - Series with values
        Output:
        - one-hot encoding of the input
        """
        # Setup
        ohet = OneHotEncodingTransformer()
        data = pd.Series(['a', 'b', 'c'])
        ohet.dummies = ['a', 'b', 'c']
        ohet.num_dummies = 3

        # Run
        out = ohet._transform(data)

        # Assert
        expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        np.testing.assert_array_equal(out, expected)
Exemplo n.º 14
0
    def compute(cls, real_data, synthetic_data, metadata=None):
        """Compute this metric.

        This builds a Machine Learning Classifier that learns to tell the synthetic
        data apart from the real data, which later on is evaluated using Cross Validation.

        The output of the metric is one minus the average ROC AUC score obtained.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict. If not passed, it is build based on the
                real_data fields and dtypes.

        Returns:
            float:
                One minus the ROC AUC Cross Validation Score obtained by the classifier.
        """
        metadata = cls._validate_inputs(real_data, synthetic_data, metadata)
        transformer = HyperTransformer(
            default_data_type_transformers={
                'categorical': OneHotEncodingTransformer(
                    error_on_unknown=False),
            })
        real_data = transformer.fit_transform(real_data).to_numpy()
        synthetic_data = transformer.transform(synthetic_data).to_numpy()

        X = np.concatenate([real_data, synthetic_data])
        y = np.hstack([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
        if np.isin(X, [np.inf, -np.inf]).any():
            X[np.isin(X, [np.inf, -np.inf])] = np.nan

        try:
            scores = []
            kf = StratifiedKFold(n_splits=3, shuffle=True)
            for train_index, test_index in kf.split(X, y):
                y_pred = cls._fit_predict(X[train_index], y[train_index],
                                          X[test_index])
                roc_auc = roc_auc_score(y[test_index], y_pred)

                scores.append(max(0.5, roc_auc) * 2 - 1)

            return 1 - np.mean(scores)
        except ValueError as err:
            raise IncomputableMetricError(
                f'DetectionMetric: Unable to be fit with error {err}')
Exemplo n.º 15
0
def test_one_hot_numerical_nans():
    """Ensure OneHotEncodingTransformer works on numerical + nan only columns."""

    data = pd.Series([1, 2, float('nan'), np.nan])

    transformer = OneHotEncodingTransformer()
    transformer.fit(data)
    transformed = transformer.transform(data)
    reverse = transformer.reverse_transform(transformed)

    pd.testing.assert_series_equal(reverse, data)
Exemplo n.º 16
0
    def test_transform_unknown(self):
        """Test the ``transform`` with unknown data.

        In this test ``transform`` should raise an error
        due to the attempt of transforming data with previously
        unseen categories.

        Input:
        - Series with unknown categorical values
        """
        # Setup
        ohet = OneHotEncodingTransformer()
        data = pd.Series(['a'])
        ohet.fit(data)

        # Assert
        with np.testing.assert_raises(ValueError):
            ohet.transform(['b'])
Exemplo n.º 17
0
class HyperTransformer:
    """HyperTransformer class.

    The ``HyperTransformer`` class contains a collection of ``transformers`` that can be
    used to transform and reverse transform one or more columns at once.

    Args:
        transformers (dict or None):
            dict associating column names with transformers, which can be either passed
            directly as an instance or as a dict specification. If ``None``, a simple
            ``transformers`` dict is built automatically from the data.
        copy (bool):
            Whether to make a copy of the input data or not. Defaults to ``True``.
        anonymize (dict or None):
            Dictionary specifying the names and ``faker`` categories of the categorical
            columns that need to be anonymized. Defaults to ``None``.
        dtypes (list or None):
            List of column data types to use when building the ``transformers`` dict
            automatically. If not passed, the ``DataFrame.dtypes`` are used.
        dtype_transformers (dict or None):
            Transformer templates to use for each dtype. Passed as a dictionary of
            dtype kinds ('i', 'f', 'O', 'b', 'M') and transformer names, classes
            or instances.

    Example:
        Create a simple ``HyperTransformer`` instance that will decide which transformers
        to use based on the fit data ``dtypes``.

        >>> ht = HyperTransformer()

        Create a ``HyperTransformer`` passing a list of dtypes.

        >>> ht = HyperTransformer(dtypes=[int, 'object', np.float64, 'datetime', 'bool'])

        Create a ``HyperTransformer`` passing a ``transformers`` dict.

        >>> transformers = {
        ...     'a': NumericalTransformer(dtype=float),
        ...     'b': {
        ...         'class': 'NumericalTransformer',
        ...         'kwargs': {
        ...             'dtype': int
        ...         }
        ...     }
        ... }
        >>> ht = HyperTransformer(transformers)
    """

    _TRANSFORMER_TEMPLATES = {
        'numerical': NumericalTransformer,
        'integer': NumericalTransformer(dtype=int),
        'float': NumericalTransformer(dtype=float),
        'categorical': CategoricalTransformer,
        'categorical_fuzzy': CategoricalTransformer(fuzzy=True),
        'one_hot_encoding': OneHotEncodingTransformer(error_on_unknown=False),
        'label_encoding': LabelEncodingTransformer,
        'boolean': BooleanTransformer,
        'datetime': DatetimeTransformer,
    }
    _DTYPE_TRANSFORMERS = {
        'i': 'numerical',
        'f': 'numerical',
        'O': 'categorical',
        'b': 'boolean',
        'M': 'datetime',
    }

    def __init__(self,
                 transformers=None,
                 copy=True,
                 anonymize=None,
                 dtypes=None,
                 dtype_transformers=None):
        self.transformers = transformers
        self._transformers = dict()
        self.copy = copy
        self.anonymize = anonymize or dict()
        self.dtypes = dtypes
        self.dtype_transformers = self._DTYPE_TRANSFORMERS.copy()
        if dtype_transformers:
            self.dtype_transformers.update(dtype_transformers)

    def _analyze(self, data):
        """Build a ``dict`` with column names and transformers from a given ``pandas.DataFrame``.

        When ``self.dtypes`` is ``None``, use the dtypes from the input data.

        When ``dtype`` is:
            - ``int``: a ``NumericalTransformer`` is created with ``dtype=int``.
            - ``float``: a ``NumericalTransformer`` is created with ``dtype=float``.
            - ``object`` or ``category``: a ``CategoricalTransformer`` is created.
            - ``bool``: a ``BooleanTransformer`` is created.
            - ``datetime``: a ``DatetimeTransformer`` is created.

        Any other ``dtype`` is not supported and raises a ``ValueError``.

        Args:
            data (pandas.DataFrame):
                Data used to analyze the ``pandas.DataFrame`` dtypes.

        Returns:
            dict:
                Mapping of column names and transformer instances.

        Raises:
            ValueError:
                if a ``dtype`` is not supported by the `HyperTransformer``.
        """
        transformers = dict()
        if self.dtypes:
            dtypes = self.dtypes
        else:
            dtypes = [
                data[column].dropna().infer_objects().dtype
                for column in data.columns
            ]

        for name, dtype in zip(data.columns, dtypes):
            try:
                kind = np.dtype(dtype).kind
            except TypeError:
                # probably category
                kind = 'O'

            transformer_template = self.dtype_transformers[kind]
            if not transformer_template:
                raise ValueError('Unsupported dtype: {}'.format(dtype))

            if isinstance(transformer_template, str):
                transformer_template = self._TRANSFORMER_TEMPLATES[
                    transformer_template]

            if not isinstance(transformer_template, type):
                transformer = deepcopy(transformer_template)
            elif self.anonymize and transformer_template == CategoricalTransformer:
                warnings.warn(
                    'Categorical anonymization is deprecated and will be removed from RDT soon.',
                    DeprecationWarning)
                transformer = CategoricalTransformer(anonymize=self.anonymize)
            else:
                transformer = transformer_template()

            transformers[name] = transformer

        return transformers

    def fit(self, data):
        """Fit the transformers to the data.

        Args:
            data (pandas.DataFrame):
                Data to fit the transformers to.
        """
        if self.transformers is not None:
            self._transformers = load_transformers(self.transformers)
        else:
            self._transformers = self._analyze(data)

        for column_name, transformer in self._transformers.items():
            column = data[column_name]
            transformer.fit(column)

    def transform(self, data):
        """Transform the data.

        If ``self.copy`` is ``True`` make a copy of the input data to avoid modifying it.

        Args:
            data (pandas.DataFrame):
                Data to transform.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.column_names = []
        if self.copy:
            data = data.copy()

        for column_name, transformer in self._transformers.items():
            column = data.pop(column_name)
            transformed = transformer.transform(column)
            self.column_names.append(column_name)

            shape = transformed.shape

            if len(shape) == 2:
                for index in range(shape[1]):
                    new_column = '{}#{}'.format(column_name, index)
                    data[new_column] = transformed[:, index]

            else:
                data[column_name] = transformed

        return data

    def fit_transform(self, data):
        """Fit the transformers to the data and then transform it.

        Args:
            data (pandas.DataFrame):
                Data to transform.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.fit(data)
        return self.transform(data)

    @staticmethod
    def _get_columns(data, column_name):
        """Get one or more columns that match a given name.

        Args:
            data (pandas.DataFrame):
                Table to perform the matching.
            column_name (str):
                Name to match the columns.

        Returns:
            numpy.ndarray:
                values of the matching columns

        Raises:
            ValueError:
                if no columns match.
        """
        regex = r'{}(#[0-9]+)?$'.format(re.escape(column_name))
        columns = data.columns[data.columns.str.match(regex)]
        if columns.empty:
            raise ValueError('No columns match_ {}'.format(column_name))

        values = [data.pop(column).values for column in columns]

        if len(values) == 1:
            return values[0]

        return np.column_stack(values)

    def reverse_transform(self, data):
        """Revert the transformations back to the original values.

        Args:
            data (pandas.DataFrame):
                Data to revert.

        Returns:
            pandas.DataFrame:
                reversed data.
        """
        if self.copy:
            data = data.copy()

        for column_name, transformer in self._transformers.items():
            columns = self._get_columns(data, column_name)
            data[column_name] = transformer.reverse_transform(columns)

        return data