def test_reverse_transform_single(self):
    """Check ``reverse_transform`` for an encoder fit on a single category.

    Input:
    - one-column array of ones

    Output:
    - Series holding the fitted category in every row
    """
    # Fit on a column that only ever contains 'a'.
    encoder = OneHotEncodingTransformer()
    encoder.fit(pd.Series(['a', 'a', 'a']))

    # Decode a single-column encoding.
    encoded = np.array([[1], [1], [1]])
    decoded = encoder.reverse_transform(encoded)

    # The decoded values must match the fitted data.
    pd.testing.assert_series_equal(decoded, pd.Series(['a', 'a', 'a']))
def _fit_discrete(self, column_name, raw_column_data):
    """Fit a one hot encoder on a discrete column and describe its output.

    Args:
        column_name: Name recorded in the returned ``ColumnTransformInfo``.
        raw_column_data: Values the ``OneHotEncodingTransformer`` is fit on.

    Returns:
        ColumnTransformInfo: Entry of type ``discrete`` holding the fitted
        encoder and a single ``softmax`` span whose size is the number of
        dummies learned by the encoder.
    """
    encoder = OneHotEncodingTransformer()
    encoder.fit(raw_column_data)

    # One output dimension per dummy learned during ``fit``.
    width = len(encoder.dummies)
    span = SpanInfo(width, 'softmax')

    return ColumnTransformInfo(
        column_name=column_name,
        column_type="discrete",
        transform=encoder,
        transform_aux=None,
        output_info=[span],
        output_dimensions=width,
    )
def test__transform_nans_categorical(self):
    """Check ``_transform`` on categorical data that contains nulls.

    The transformer state is set by hand (no ``fit`` call) with
    ``dummy_encoded`` and ``dummy_na`` enabled. Both ``np.nan`` and ``None``
    are expected to share one encoding: a one in the last column.

    Input:
    - Series with categorical values containing nans

    Output:
    - one-hot encoding of the input
    """
    # Configure the transformer manually instead of fitting it.
    encoder = OneHotEncodingTransformer()
    encoder.dummies = ['a', 'b']
    encoder.indexer = [0, 1]
    encoder.num_dummies = 2
    encoder.dummy_encoded = True
    encoder.dummy_na = True

    # Encode a series mixing both null flavours with known categories.
    values = pd.Series([np.nan, None, 'a', 'b'])
    encoded = encoder._transform(values)

    # Null rows map to the trailing column; 'a' and 'b' to columns 0 and 1.
    np.testing.assert_array_equal(
        encoded,
        np.array([
            [0, 0, 1],
            [0, 0, 1],
            [1, 0, 0],
            [0, 1, 0],
        ]),
    )
def test__transform_zeros_categorical(self):
    """Test the ``_transform`` with unknown category.

    The values passed to ``_transform`` should be returned in a one-hot
    encoding representation using the categorical branch where it should
    be a column of zeros.

    Input:
    - Series with categorical and unknown values

    Output:
    - one-hot encoding of the input
    """
    # Setup: the transformer state is set by hand, so no fit data is needed.
    ohet = OneHotEncodingTransformer()
    ohet.dummies = ['a']
    ohet.indexer = [0]
    ohet.num_dummies = 1
    ohet.dummy_encoded = True

    # Run: every value is a category that is not among the dummies.
    out = ohet._transform(pd.Series(['b', 'b', 'b']))

    # Assert: unknown values produce an all-zero column.
    expected = np.array([[0], [0], [0]])
    np.testing.assert_array_equal(out, expected)
def test__transform_single_categorical(self):
    """Check ``_transform`` on categorical data holding one category.

    The transformer state is set by hand with ``dummy_encoded`` enabled and
    a single dummy, so the encoding is expected to be one column of ones.

    Input:
    - Series with a single category

    Output:
    - one-hot encoding of the input
    """
    # Configure the transformer manually instead of fitting it.
    encoder = OneHotEncodingTransformer()
    encoder.dummies = ['a']
    encoder.indexer = [0]
    encoder.num_dummies = 1
    encoder.dummy_encoded = True

    # Encode a series where every row is the only known category.
    encoded = encoder._transform(pd.Series(['a', 'a', 'a']))

    # Each row must be flagged in the single output column.
    np.testing.assert_array_equal(encoded, np.array([[1], [1], [1]]))
def test_single_category():
    """Round-trip a single-category column through the ``HyperTransformer``.

    A frame whose column ``a`` holds one repeated value is fit, transformed
    and reverse transformed with a ``OneHotEncodingTransformer``; the result
    must equal the input frame.
    """
    frame = pd.DataFrame({'a': ['a', 'a', 'a']})
    hyper = HyperTransformer(transformers={'a': OneHotEncodingTransformer()})

    hyper.fit(frame)
    recovered = hyper.reverse_transform(hyper.transform(frame))

    pd.testing.assert_frame_equal(frame, recovered)
def test__transform_unknown_nan(self):
    """Test the ``_transform`` with unknown and nans.

    This is an edge case for ``_transform`` where unknown values should
    produce all-zero rows and nans should be flagged in the last column.

    Input:
    - Series with unknown and nans

    Output:
    - one-hot encoding of the input
    """
    # Setup: the transformer state is set by hand, so no fit data is needed.
    ohet = OneHotEncodingTransformer()
    ohet.dummies = ['a']
    ohet.dummy_na = True
    ohet.num_dummies = 1

    # Run: two unknown values followed by a nan.
    out = ohet._transform(pd.Series(['b', 'b', np.nan]))

    # Assert: unknown rows are all zeros, the nan row is one in the last column.
    expected = np.array([[0, 0], [0, 0], [0, 1]])
    np.testing.assert_array_equal(out, expected)
def test_transform_nans(self):
    """Check ``transform`` on fitted data that includes a null value.

    After fitting on two categories plus ``None``, transforming the same
    series is expected to give an identity matrix: one column per category
    and one for the null.

    Input:
    - Series with categorical values and nans

    Output:
    - one-hot encoding of the input
    """
    # Fit on the same series that is transformed afterwards.
    values = pd.Series(['a', 'b', None])
    encoder = OneHotEncodingTransformer()
    encoder.fit(values)

    encoded = encoder.transform(values)

    # Each row is flagged in its own column.
    identity = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    np.testing.assert_array_equal(encoded, identity)
def _fit_discrete(self, data):
    """Fit a one hot encoder on a single discrete column.

    Args:
        data (pd.DataFrame):
            A dataframe containing a column; only the first column name
            is used.

    Returns:
        namedtuple:
            A ``ColumnTransformInfo`` object of type ``discrete`` holding the
            fitted encoder and one ``softmax`` span sized by the number of
            dummies learned by the encoder.
    """
    name = data.columns[0]

    encoder = OneHotEncodingTransformer()
    encoder.fit(data, [name])

    # One output dimension per dummy learned during ``fit``.
    width = len(encoder.dummies)
    span = SpanInfo(width, 'softmax')

    return ColumnTransformInfo(
        column_name=name,
        column_type='discrete',
        transform=encoder,
        output_info=[span],
        output_dimensions=width,
    )
def test_transform_single(self):
    """Check ``transform`` after fitting on a single category.

    The encoding is expected to be a single column filled with ones.

    Input:
    - Series with a single categorical value

    Output:
    - one-hot encoding of the input
    """
    # Fit on the same series that is transformed afterwards.
    values = pd.Series(['a', 'a', 'a'])
    encoder = OneHotEncodingTransformer()
    encoder.fit(values)

    encoded = encoder.transform(values)

    # Every row belongs to the only category.
    np.testing.assert_array_equal(encoded, np.array([[1], [1], [1]]))
def test_fit_nans_numeric(self):
    """Check the state set by ``fit`` on numeric data containing a nan.

    After fitting, ``dummies`` must hold only the non-null values,
    ``decoder`` must additionally hold the nan, ``dummy_encoded`` must be
    off and ``dummy_na`` must be on.

    Input:
    - Series with containing nan values
    """
    encoder = OneHotEncodingTransformer()

    # Fit on two numbers and one missing value.
    encoder.fit(pd.Series([1, 2, np.nan]))

    # Learned categories, with and without the null entry.
    np.testing.assert_array_equal(encoder.dummies, [1, 2])
    np.testing.assert_array_equal(encoder.decoder, [1, 2, np.nan])

    # Flags: no category encoding, NA handling enabled.
    assert not encoder.dummy_encoded
    assert encoder.dummy_na
def test_fit_no_nans(self):
    """Check the state set by ``fit`` on string data without nulls.

    After fitting, ``dummies`` and ``decoder`` must both hold the three
    categories, ``dummy_encoded`` must be on and ``dummy_na`` must be off.

    Input:
    - Series with values
    """
    encoder = OneHotEncodingTransformer()

    # Fit on three distinct string categories.
    encoder.fit(pd.Series(['a', 'b', 'c']))

    # Both lookups hold exactly the fitted categories.
    np.testing.assert_array_equal(encoder.dummies, ['a', 'b', 'c'])
    np.testing.assert_array_equal(encoder.decoder, ['a', 'b', 'c'])

    # Flags: category encoding enabled, NA handling disabled.
    assert encoder.dummy_encoded
    assert not encoder.dummy_na
def test__transform_no_nan(self):
    """Check ``_transform`` on data without nulls.

    The transformer state is set by hand with three dummies; each input
    value is expected to be flagged in its own column.

    Input:
    - Series with values

    Output:
    - one-hot encoding of the input
    """
    # Configure the transformer manually instead of fitting it.
    encoder = OneHotEncodingTransformer()
    encoder.dummies = ['a', 'b', 'c']
    encoder.num_dummies = 3

    encoded = encoder._transform(pd.Series(['a', 'b', 'c']))

    # One row per value, one column per dummy.
    identity = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    np.testing.assert_array_equal(encoded, identity)
def compute(cls, real_data, synthetic_data, metadata=None):
    """Compute this metric.

    This builds a Machine Learning Classifier that learns to tell the synthetic
    data apart from the real data, which later on is evaluated using a 3-fold
    stratified Cross Validation.

    For each fold the ROC AUC is clipped to a minimum of 0.5 and rescaled as
    ``max(0.5, roc_auc) * 2 - 1``. The output of the metric is one minus the
    mean of these per-fold scores.

    Args:
        real_data (Union[numpy.ndarray, pandas.DataFrame]):
            The values from the real dataset.
        synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
            The values from the synthetic dataset.
        metadata (dict):
            Table metadata dict. If not passed, it is build based on the
            real_data fields and dtypes.

    Returns:
        float:
            One minus the mean rescaled ROC AUC Cross Validation score
            obtained by the classifier.

    Raises:
        IncomputableMetricError:
            If a ``ValueError`` is raised while splitting, fitting or scoring.
            The original error is attached as the cause.
    """
    metadata = cls._validate_inputs(real_data, synthetic_data, metadata)

    transformer = HyperTransformer(
        default_data_type_transformers={
            'categorical': OneHotEncodingTransformer(error_on_unknown=False),
        })
    real_data = transformer.fit_transform(real_data).to_numpy()
    synthetic_data = transformer.transform(synthetic_data).to_numpy()

    # Real rows are labelled 1 and synthetic rows 0.
    X = np.concatenate([real_data, synthetic_data])
    y = np.hstack([np.ones(len(real_data)), np.zeros(len(synthetic_data))])

    # Replace infinite values with NaN; the mask is computed once and reused.
    infinite = np.isin(X, [np.inf, -np.inf])
    if infinite.any():
        X[infinite] = np.nan

    try:
        scores = []
        kf = StratifiedKFold(n_splits=3, shuffle=True)
        for train_index, test_index in kf.split(X, y):
            y_pred = cls._fit_predict(X[train_index], y[train_index], X[test_index])
            roc_auc = roc_auc_score(y[test_index], y_pred)

            # Clip the AUC at 0.5, then stretch the 0.5-1 range onto 0-1.
            scores.append(max(0.5, roc_auc) * 2 - 1)

        return 1 - np.mean(scores)
    except ValueError as err:
        # Chain explicitly so the original failure is kept as ``__cause__``.
        raise IncomputableMetricError(
            f'DetectionMetric: Unable to be fit with error {err}') from err
def test_one_hot_numerical_nans():
    """Ensure OneHotEncodingTransformer round-trips numerical + nan only columns."""
    values = pd.Series([1, 2, float('nan'), np.nan])

    encoder = OneHotEncodingTransformer()
    encoder.fit(values)

    # Encode and immediately decode; the result must equal the input.
    recovered = encoder.reverse_transform(encoder.transform(values))

    pd.testing.assert_series_equal(recovered, values)
def test_transform_unknown(self):
    """Check that ``transform`` rejects categories not seen during ``fit``.

    The transformer is fit on a single category and then asked to transform
    a different one, which is expected to raise a ``ValueError``.

    Input:
    - Series with unknown categorical values
    """
    # Fit on a column that only contains 'a'.
    encoder = OneHotEncodingTransformer()
    encoder.fit(pd.Series(['a']))

    # Transforming an unseen value must fail.
    with np.testing.assert_raises(ValueError):
        encoder.transform(['b'])
class HyperTransformer:
    """HyperTransformer class.

    The ``HyperTransformer`` class contains a collection of ``transformers`` that can be
    used to transform and reverse transform one or more columns at once.

    Args:
        transformers (dict or None):
            dict associating column names with transformers, which can be either passed
            directly as an instance or as a dict specification. If ``None``, a simple
            ``transformers`` dict is built automatically from the data.
        copy (bool):
            Whether to make a copy of the input data or not. Defaults to ``True``.
        anonymize (dict or None):
            Dictionary specifying the names and ``faker`` categories of the categorical
            columns that need to be anonymized. Defaults to ``None``.
        dtypes (list or None):
            List of column data types to use when building the ``transformers`` dict
            automatically. If not passed, the ``DataFrame.dtypes`` are used.
        dtype_transformers (dict or None):
            Transformer templates to use for each dtype. Passed as a dictionary of
            dtype kinds ('i', 'f', 'O', 'b', 'M') and transformer names, classes
            or instances.

    Example:
        Create a simple ``HyperTransformer`` instance that will decide which transformers
        to use based on the fit data ``dtypes``.

        >>> ht = HyperTransformer()

        Create a ``HyperTransformer`` passing a list of dtypes.

        >>> ht = HyperTransformer(dtypes=[int, 'object', np.float64, 'datetime', 'bool'])

        Create a ``HyperTransformer`` passing a ``transformers`` dict.

        >>> transformers = {
        ...     'a': NumericalTransformer(dtype=float),
        ...     'b': {
        ...         'class': 'NumericalTransformer',
        ...         'kwargs': {
        ...             'dtype': int
        ...         }
        ...     }
        ... }
        >>> ht = HyperTransformer(transformers)
    """

    # Named templates: either a transformer class (instantiated without
    # arguments when used) or a pre-configured instance (deep-copied when used).
    _TRANSFORMER_TEMPLATES = {
        'numerical': NumericalTransformer,
        'integer': NumericalTransformer(dtype=int),
        'float': NumericalTransformer(dtype=float),
        'categorical': CategoricalTransformer,
        'categorical_fuzzy': CategoricalTransformer(fuzzy=True),
        'one_hot_encoding': OneHotEncodingTransformer(error_on_unknown=False),
        'label_encoding': LabelEncodingTransformer,
        'boolean': BooleanTransformer,
        'datetime': DatetimeTransformer,
    }

    # Default mapping from numpy dtype kind to a template name.
    _DTYPE_TRANSFORMERS = {
        'i': 'numerical',
        'f': 'numerical',
        'O': 'categorical',
        'b': 'boolean',
        'M': 'datetime',
    }

    def __init__(self, transformers=None, copy=True, anonymize=None, dtypes=None,
                 dtype_transformers=None):
        self.transformers = transformers
        self._transformers = {}
        self.copy = copy
        self.anonymize = anonymize or {}
        self.dtypes = dtypes

        # Start from a copy of the defaults so the class-level mapping is never mutated.
        self.dtype_transformers = self._DTYPE_TRANSFORMERS.copy()
        if dtype_transformers:
            self.dtype_transformers.update(dtype_transformers)

    def _analyze(self, data):
        """Build a ``dict`` with column names and transformers from a given ``pandas.DataFrame``.

        When ``self.dtypes`` is ``None``, the dtype of each column is inferred from
        the input data after dropping null values.

        Each dtype is reduced to its numpy ``kind`` (falling back to ``'O'`` when
        numpy cannot interpret it, e.g. ``category``) and looked up in
        ``self.dtype_transformers``. With the default mapping:

            - ``'i'`` and ``'f'``: a ``NumericalTransformer`` is created.
            - ``'O'``: a ``CategoricalTransformer`` is created.
            - ``'b'``: a ``BooleanTransformer`` is created.
            - ``'M'``: a ``DatetimeTransformer`` is created.

        Any kind that is missing from the mapping, or mapped to a falsy value,
        is not supported and raises a ``ValueError``.

        Args:
            data (pandas.DataFrame):
                Data used to analyze the ``pandas.DataFrame`` dtypes.

        Returns:
            dict:
                Mapping of column names and transformer instances.

        Raises:
            ValueError:
                if a ``dtype`` is not supported by the ``HyperTransformer``.
        """
        transformers = {}
        if self.dtypes:
            dtypes = self.dtypes
        else:
            dtypes = [
                data[column].dropna().infer_objects().dtype
                for column in data.columns
            ]

        for name, dtype in zip(data.columns, dtypes):
            try:
                kind = np.dtype(dtype).kind
            except TypeError:
                # probably category
                kind = 'O'

            # ``.get`` makes a kind missing from the mapping take the same
            # ``ValueError`` path as a kind explicitly mapped to a falsy value,
            # instead of leaking a ``KeyError``.
            transformer_template = self.dtype_transformers.get(kind)
            if not transformer_template:
                raise ValueError('Unsupported dtype: {}'.format(dtype))

            # Template names are resolved to the class or instance they refer to.
            if isinstance(transformer_template, str):
                transformer_template = self._TRANSFORMER_TEMPLATES[transformer_template]

            if not isinstance(transformer_template, type):
                # Instances are copied so columns never share transformer state.
                transformer = deepcopy(transformer_template)
            elif self.anonymize and transformer_template == CategoricalTransformer:
                warnings.warn(
                    'Categorical anonymization is deprecated and will be removed from RDT soon.',
                    DeprecationWarning)
                transformer = CategoricalTransformer(anonymize=self.anonymize)
            else:
                transformer = transformer_template()

            transformers[name] = transformer

        return transformers

    def fit(self, data):
        """Fit the transformers to the data.

        If ``self.transformers`` was given it is loaded through
        ``load_transformers``; otherwise the transformers are built from the
        data by ``_analyze``.

        Args:
            data (pandas.DataFrame):
                Data to fit the transformers to.
        """
        if self.transformers is not None:
            self._transformers = load_transformers(self.transformers)
        else:
            self._transformers = self._analyze(data)

        for column_name, transformer in self._transformers.items():
            column = data[column_name]
            transformer.fit(column)

    def transform(self, data):
        """Transform the data.

        If ``self.copy`` is ``True`` make a copy of the input data to avoid modifying it.

        Each transformed column is removed and re-added to the table. When a
        transformer returns a 2-dimensional array, one column per array column
        is added, named ``<column_name>#<index>``. The names of the transformed
        columns are stored in ``self.column_names``.

        Args:
            data (pandas.DataFrame):
                Data to transform.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.column_names = []
        if self.copy:
            data = data.copy()

        for column_name, transformer in self._transformers.items():
            column = data.pop(column_name)
            transformed = transformer.transform(column)
            self.column_names.append(column_name)

            shape = transformed.shape
            if len(shape) == 2:
                # Spread a multi-column output over numbered columns.
                for index in range(shape[1]):
                    new_column = '{}#{}'.format(column_name, index)
                    data[new_column] = transformed[:, index]
            else:
                data[column_name] = transformed

        return data

    def fit_transform(self, data):
        """Fit the transformers to the data and then transform it.

        Args:
            data (pandas.DataFrame):
                Data to transform.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.fit(data)
        return self.transform(data)

    @staticmethod
    def _get_columns(data, column_name):
        """Get one or more columns that match a given name.

        A column matches when its name is exactly ``column_name`` or
        ``column_name`` followed by ``#`` and digits. Matching columns are
        removed from ``data``.

        Args:
            data (pandas.DataFrame):
                Table to perform the matching.
            column_name (str):
                Name to match the columns.

        Returns:
            numpy.ndarray:
                values of the matching columns; the values of the column itself
                when a single column matches, otherwise the matching columns
                stacked side by side.

        Raises:
            ValueError:
                if no columns match.
        """
        # The name is escaped so regex metacharacters in it are matched literally.
        regex = r'{}(#[0-9]+)?$'.format(re.escape(column_name))
        columns = data.columns[data.columns.str.match(regex)]
        if columns.empty:
            raise ValueError('No columns match_ {}'.format(column_name))

        values = [data.pop(column).values for column in columns]

        if len(values) == 1:
            return values[0]

        return np.column_stack(values)

    def reverse_transform(self, data):
        """Revert the transformations back to the original values.

        If ``self.copy`` is ``True`` make a copy of the input data to avoid modifying it.

        Args:
            data (pandas.DataFrame):
                Data to revert.

        Returns:
            pandas.DataFrame:
                reversed data.
        """
        if self.copy:
            data = data.copy()

        for column_name, transformer in self._transformers.items():
            # The transformed column(s) are removed and replaced by the reverted one.
            columns = self._get_columns(data, column_name)
            data[column_name] = transformer.reverse_transform(columns)

        return data