Пример #1
0
def test_hypertransformer_with_transformers(faker_mock):
    """Round-trip the input data through a ``HyperTransformer`` with explicit transformers.

    The transformed output must match the expected fixture, every column other
    than ``names`` must be restored exactly by ``reverse_transform``, and none
    of the original names may appear among the reversed names.

    Args:
        faker_mock:
            Mock whose ``return_value.first_name`` yields the replacement names.
            Presumably injected by a ``patch`` decorator outside this view.
    """
    faker_mock.return_value.first_name.side_effect = [
        'Jaime', 'Cersei', 'Tywin', 'Tyrion'
    ]
    data = get_input_data()
    transformers = get_transformers()

    ht = HyperTransformer(transformers)
    ht.fit(data)
    transformed = ht.transform(data)

    expected = get_transformed_data()

    # Column order is not part of the contract, so compare with sorted columns.
    np.testing.assert_allclose(
        transformed.sort_index(axis=1).values,
        expected.sort_index(axis=1).values)

    reversed_data = ht.reverse_transform(transformed)

    # ``names`` is checked separately, so take it out of both frames first.
    original_names = data.pop('names')
    reversed_names = reversed_data.pop('names')

    pd.testing.assert_frame_equal(data.sort_index(axis=1),
                                  reversed_data.sort_index(axis=1))

    # ``x in series`` tests the index labels, not the values, so the check
    # must be made against the values explicitly or it can never fail.
    reversed_values = reversed_names.tolist()
    for name in original_names:
        assert name not in reversed_values
Пример #2
0
    def compute(cls, real_data, synthetic_data, metadata=None):
        """Compute this metric.

        The selected fields of both tables are transformed with a
        ``HyperTransformer`` fitted on the real data, the single column
        metric is computed for every transformed column, and the scores
        are averaged ignoring NaN values.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        metadata = cls._validate_inputs(real_data, synthetic_data, metadata)
        transformer = HyperTransformer()
        fields = cls._select_fields(metadata, cls.field_types)

        # Fit on the real data only; the synthetic data reuses the same mapping.
        real_transformed = transformer.fit_transform(real_data[fields])
        synthetic_transformed = transformer.transform(synthetic_data[fields])

        column_scores = [
            cls.single_column_metric.compute(
                real_series.values,
                synthetic_transformed[name].values,
            )
            for name, real_series in real_transformed.items()
        ]

        return np.nanmean(column_scores)
Пример #3
0
def test_single_category():
    """A column holding a single category must survive a one-hot round trip."""
    frame = pd.DataFrame({'a': ['a', 'a', 'a']})
    hyper = HyperTransformer(transformers={'a': OneHotEncodingTransformer()})

    hyper.fit(frame)
    encoded = hyper.transform(frame)
    recovered = hyper.reverse_transform(encoded)

    pd.testing.assert_frame_equal(frame, recovered)
Пример #4
0
def test_dtype_category():
    """A ``category`` dtype column must be restored unchanged after a round trip."""
    frame = pd.DataFrame({'a': ['a', 'b', 'c']}, dtype='category')

    hyper = HyperTransformer()
    hyper.fit(frame)

    encoded = hyper.transform(frame)
    recovered = hyper.reverse_transform(encoded)

    pd.testing.assert_frame_equal(frame, recovered)
Пример #5
0
def test_empty_transformers_nan_data():
    """If transformers is an empty dict, do nothing."""
    data = get_input_data_with_nan()

    ht = HyperTransformer(transformers={})
    ht.fit(data)

    transformed = ht.transform(data)
    reverse = ht.reverse_transform(transformed)

    # Both directions must leave the data exactly as it was.
    for result in (transformed, reverse):
        pd.testing.assert_frame_equal(data, result)
Пример #6
0
    def compute(cls, real_data, synthetic_data, metadata=None):
        """Compute this metric.

        This builds a Machine Learning Classifier that learns to tell the synthetic
        data apart from the real data, which later on is evaluated using Cross Validation.

        For every fold the ROC AUC is clipped from below at 0.5 and rescaled as
        ``2 * auc - 1``; the output of the metric is one minus the average of
        these rescaled scores.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict. If not passed, it is build based on the
                real_data fields and dtypes.

        Returns:
            float:
                One minus the ROC AUC Cross Validation Score obtained by the classifier.

        Raises:
            IncomputableMetricError:
                If a ``ValueError`` is raised while splitting, fitting or scoring.
        """
        metadata = cls._validate_inputs(real_data, synthetic_data, metadata)
        transformer = HyperTransformer(
            default_data_type_transformers={
                'categorical': OneHotEncodingTransformer(
                    error_on_unknown=False),
            })
        real_data = transformer.fit_transform(real_data).to_numpy()
        synthetic_data = transformer.transform(synthetic_data).to_numpy()

        X = np.concatenate([real_data, synthetic_data])
        # Real rows are labelled 1 and synthetic rows 0.
        y = np.hstack([np.ones(len(real_data)), np.zeros(len(synthetic_data))])

        # Replace infinities by NaN; the mask is computed only once.
        infinite = np.isin(X, [np.inf, -np.inf])
        if infinite.any():
            X[infinite] = np.nan

        try:
            scores = []
            kf = StratifiedKFold(n_splits=3, shuffle=True)
            for train_index, test_index in kf.split(X, y):
                y_pred = cls._fit_predict(X[train_index], y[train_index],
                                          X[test_index])
                roc_auc = roc_auc_score(y[test_index], y_pred)

                scores.append(max(0.5, roc_auc) * 2 - 1)

            return 1 - np.mean(scores)
        except ValueError as err:
            # Chain explicitly so the original failure is reported as the cause.
            raise IncomputableMetricError(
                f'DetectionMetric: Unable to be fit with error {err}') from err
Пример #7
0
def test_subset_of_columns_nan_data():
    """HyperTransform should be able to transform a subset of the training columns.

    The transformer is fitted on the full table and then applied to a frame
    that only contains the first column.

    See https://github.com/sdv-dev/RDT/issues/152
    """
    data = get_input_data_with_nan()

    ht = HyperTransformer()
    ht.fit(data)

    first_column = data.columns[0]
    subset = data[[first_column]]
    reverse = ht.reverse_transform(ht.transform(subset))

    pd.testing.assert_frame_equal(subset, reverse)
Пример #8
0
    def _compute_auroc(self, real_table, synthetic_table):
        """Return the mean cross-validated AUROC of telling real from synthetic rows.

        Both tables are transformed with a ``HyperTransformer`` fitted on the
        real table, NaN values are replaced by ``0.0`` and a 3-fold stratified
        cross validation is run using this object's ``fit`` and
        ``predict_proba``. Fold scores below 0.5 are mirrored to ``1 - score``.

        Args:
            real_table (pandas.DataFrame):
                Real data.
            synthetic_table (pandas.DataFrame):
                Synthetic data.

        Returns:
            float:
                Mean of the per-fold AUROC scores.
        """
        hyper_transformer = HyperTransformer()
        real_values = hyper_transformer.fit_transform(real_table).values
        synthetic_values = hyper_transformer.transform(synthetic_table).values

        features = np.concatenate([real_values, synthetic_values])
        # Real rows are labelled 1 and synthetic rows 0.
        labels = np.hstack([np.ones(len(real_values)), np.zeros(len(synthetic_values))])
        missing = np.isnan(features)
        features[missing] = 0.0

        # NOTE(review): the message says the tests are skipped, but execution
        # continues below regardless - confirm which behavior is intended.
        if len(features) < 20:
            warnings.warn("Not enough data, skipping the detection tests.")

        fold_scores = []
        splitter = StratifiedKFold(n_splits=3, shuffle=True)
        for train_index, test_index in splitter.split(features, labels):
            self.fit(features[train_index], labels[train_index])
            predicted = self.predict_proba(features[test_index])
            auroc = roc_auc_score(labels[test_index], predicted)
            fold_scores.append(1.0 - auroc if auroc < 0.5 else auroc)

        return np.mean(fold_scores)
Пример #9
0
def test_hypertransformer_without_transformers_nan_data():
    """Round-trip NaN-containing data through a default ``HyperTransformer``.

    The transformed output must match the expected fixture and every column
    other than ``names`` must be restored exactly by ``reverse_transform``.
    """
    data = get_input_data_with_nan()

    ht = HyperTransformer()
    ht.fit(data)
    transformed = ht.transform(data)

    expected = get_transformed_nan_data()

    # Column order is not part of the contract, so compare with sorted columns.
    actual_values = transformed.sort_index(axis=1).values
    expected_values = expected.sort_index(axis=1).values
    np.testing.assert_allclose(actual_values, expected_values)

    reversed_data = ht.reverse_transform(transformed)

    original_names = data.pop('names')
    reversed_names = reversed_data.pop('names')

    sorted_original = data.sort_index(axis=1)
    sorted_reversed = reversed_data.sort_index(axis=1)
    pd.testing.assert_frame_equal(sorted_original, sorted_reversed)

    # NOTE(review): ``name not in reversed_names`` tests the Series index
    # labels rather than its values, so this loop cannot fail for string
    # names. Confirm whether the names are really expected to differ here
    # before tightening the check.
    for name in original_names:
        assert name not in reversed_names
Пример #10
0
class ColumnsModel:
    """ColumnsModel class.

    The ``ColumnsModel`` class enables the usage of conditional sampling when a column is a
    ``constraint``.

    Args:
        constraint:
            Object whose ``is_valid(table_data)`` method is used to filter
            the sampled rows.
        constraint_columns (str or list[str]):
            Name or names of the columns to model. A single value is wrapped
            in a list.
    """

    _columns_model = None

    def __init__(self, constraint, constraint_columns):
        if isinstance(constraint_columns, list):
            self.constraint_columns = constraint_columns
        else:
            self.constraint_columns = [constraint_columns]

        self.constraint = constraint

    def fit(self, table_data):
        """Fit the ``ColumnsModel``.

        Fit a ``GaussianMultivariate`` model with ``GaussianUnivariate``
        marginals to the ``self.constraint_columns`` columns in the
        ``table_data`` in order to sample those columns when missing.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        data_to_model = table_data[self.constraint_columns]
        self._hyper_transformer = HyperTransformer(
            default_data_type_transformers={
                'categorical': 'OneHotEncodingTransformer'
            })
        transformed_data = self._hyper_transformer.fit_transform(data_to_model)
        self._model = GaussianMultivariate(distribution=GaussianUnivariate)
        self._model.fit(transformed_data)

    def _sample_valid(self, num_rows, conditions):
        """Sample ``num_rows`` rows and keep only those the constraint accepts."""
        sampled = self._model.sample(num_rows=num_rows, conditions=conditions)
        sampled = self._hyper_transformer.reverse_transform(sampled)
        return sampled[self.constraint.is_valid(sampled)]

    def _reject_sample(self, num_rows, conditions):
        """Sample until ``num_rows`` valid rows are available.

        After 100 unsuccessful rounds the valid rows found so far are
        repeated to reach ``num_rows``.

        Args:
            num_rows (int):
                Number of valid rows to return.
            conditions (dict):
                Conditions passed to the model's ``sample`` method.

        Returns:
            pandas.DataFrame:
                The first ``num_rows`` valid rows.

        Raises:
            ValueError:
                If no valid row at all was found within 100 rounds.
        """
        valid_rows = self._sample_valid(num_rows, conditions)
        counter = 0
        total_sampled = num_rows

        while len(valid_rows) < num_rows:
            num_valid = len(valid_rows)
            if counter >= 100:
                if num_valid == 0:
                    raise ValueError(
                        'Could not get enough valid rows within 100 trials.')

                # Give up sampling: tile the valid rows to fill the request.
                multiplier = num_rows // num_valid
                num_rows_missing = num_rows % num_valid
                remainder_rows = valid_rows.iloc[0:num_rows_missing, :]
                valid_rows = pd.concat([valid_rows] * multiplier +
                                       [remainder_rows],
                                       ignore_index=True)
                break

            remaining = num_rows - num_valid
            # Smoothed estimate of the acceptance rate, used to size the next
            # batch; the batch is capped at ten times the requested rows.
            valid_probability = (num_valid + 1) / (total_sampled + 1)
            max_rows = num_rows * 10
            num_to_sample = min(int(remaining / valid_probability), max_rows)
            total_sampled += num_to_sample
            new_valid_rows = self._sample_valid(num_to_sample, conditions)
            valid_rows = pd.concat([valid_rows, new_valid_rows],
                                   ignore_index=True)
            counter += 1

        return valid_rows.iloc[0:num_rows, :]

    def sample(self, table_data):
        """Sample any missing columns.

        Sample any missing columns, ``self.constraint_columns``, that ``table_data``
        does not contain.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Sampled ``constraint_columns``, with one row per row of
                ``table_data``.
        """
        condition_columns = [
            c for c in self.constraint_columns if c in table_data.columns
        ]
        grouped_conditions = table_data[condition_columns].groupby(
            condition_columns)

        # NOTE(review): rows are returned in group order with a fresh
        # RangeIndex, not in the row order of ``table_data`` - confirm that
        # callers do not rely on positional alignment with the input.
        all_sampled_rows = []
        for _, df in grouped_conditions:
            # Every row of a group shares the same condition values, so the
            # first transformed row describes the whole group.
            transformed_condition = self._hyper_transformer.transform(
                df).iloc[0].to_dict()
            sampled_rows = self._reject_sample(
                num_rows=df.shape[0], conditions=transformed_condition)
            all_sampled_rows.append(sampled_rows)

        sampled_data = pd.concat(all_sampled_rows, ignore_index=True)
        return sampled_data
Пример #11
0
    def compute(cls, real_data, synthetic_data, metadata=None, dtypes=None):
        """Compute this metric.

        This builds a Machine Learning Classifier that learns to tell the synthetic
        data apart from the real data, which later on is evaluated using Cross Validation.

        For every fold the ROC AUC is clipped from below at 0.5 and rescaled as
        ``2 * auc - 1``; the output of the metric is one minus the average of
        these rescaled scores.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict. If not passed, it is build based on the
                real_data fields and dtypes.
            dtypes:
                Forwarded unchanged as the ``dtypes`` argument of the
                ``HyperTransformer``.

        Returns:
            float:
                One minus the ROC AUC Cross Validation Score obtained by the
                classifier, or ``numpy.nan`` if a ``ValueError`` is raised
                while splitting, fitting or scoring.
        """
        metadata = cls._validate_inputs(real_data, synthetic_data, metadata)

        transformer = HyperTransformer(
            dtype_transformers={'O': 'one_hot_encoding'}, dtypes=dtypes)
        real_data = transformer.fit_transform(real_data).values
        synthetic_data = transformer.transform(synthetic_data).values

        X = np.concatenate([real_data, synthetic_data])
        # Real rows are labelled 1 and synthetic rows 0.
        y = np.hstack([np.ones(len(real_data)), np.zeros(len(synthetic_data))])

        # Replace infinities by NaN; the mask is computed only once.
        infinite = np.isin(X, [np.inf, -np.inf])
        if infinite.any():
            X[infinite] = np.nan

        try:
            scores = []
            kf = StratifiedKFold(n_splits=3, shuffle=True)
            for train_index, test_index in kf.split(X, y):
                # ``_fit_predict`` returns a pair here; only the predictions
                # are needed.
                y_pred, _ = cls._fit_predict(X[train_index], y[train_index],
                                             X[test_index])
                roc_auc = roc_auc_score(y[test_index], y_pred)
                scores.append(max(0.5, roc_auc) * 2 - 1)

            return 1 - np.mean(scores)

        except ValueError as err:
            # Best effort: an incomputable metric is reported as NaN.
            LOGGER.info('DetectionMetric: Skipping due to %s', err)
            return np.nan
Пример #12
0
class Constraint(metaclass=ConstraintMeta):
    """Constraint base class.

    This class is not intended to be used directly and should rather be
    subclassed to create different types of constraints.

    If ``handling_strategy`` is passed with the value ``transform``
    or ``reject_sampling``, the ``filter_valid`` or ``transform`` and
    ``reverse_transform`` methods will be replaced respectively by a simple
    identity function.

    Attributes:
        constraint_columns (tuple[str]):
            The names of the columns used by this constraint.
        rebuild_columns (tuple[str]):
            The names of the columns that this constraint will rebuild during
            ``reverse_transform``.
    Args:
        handling_strategy (str):
            How this Constraint should be handled, which can be ``transform``,
            ``reject_sampling`` or ``all``.
        fit_columns_model (bool):
            If False, reject sampling will be used to handle conditional sampling.
            Otherwise, a model will be trained and used to sample other columns
            based on the conditioned column.
    """

    constraint_columns = ()
    rebuild_columns = ()
    _hyper_transformer = None
    _columns_model = None

    def _identity(self, table_data):
        """Return ``table_data`` unchanged."""
        return table_data

    def __init__(self, handling_strategy, fit_columns_model=False):
        self.fit_columns_model = fit_columns_model
        # Disable the half of the behavior that the strategy does not use by
        # shadowing the corresponding methods on the instance.
        if handling_strategy == 'transform':
            self.filter_valid = self._identity
        elif handling_strategy == 'reject_sampling':
            self.rebuild_columns = ()
            self.transform = self._identity
            self.reverse_transform = self._identity
        elif handling_strategy != 'all':
            raise ValueError(
                'Unknown handling strategy: {}'.format(handling_strategy))

    def _fit(self, table_data):
        """Do nothing; hook to be overwritten by subclasses."""
        del table_data

    def fit(self, table_data):
        """Fit ``Constraint`` class to data.

        If ``fit_columns_model`` is True and there is more than one
        constraint column, this method will also fit a ``GaussianMultivariate``
        model to the relevant columns in ``table_data``.
        Subclasses can overwrite this method, or overwrite the ``_fit`` method
        if they will not be needing the model to handle conditional sampling.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        self._fit(table_data)

        if self.fit_columns_model and len(self.constraint_columns) > 1:
            data_to_model = table_data[list(self.constraint_columns)]
            self._hyper_transformer = HyperTransformer(dtype_transformers={
                'O': 'one_hot_encoding',
            })
            transformed_data = self._hyper_transformer.fit_transform(
                data_to_model)
            self._columns_model = GaussianMultivariate(
                distribution=GaussianUnivariate)
            self._columns_model.fit(transformed_data)

    def _transform(self, table_data):
        """Return ``table_data`` unchanged; hook to be overwritten by subclasses."""
        return table_data

    def _reject_sample(self, num_rows, conditions):
        """Sample from the columns model until ``num_rows`` valid rows are available.

        After 100 unsuccessful rounds the valid rows found so far are
        repeated to reach ``num_rows``.

        Args:
            num_rows (int):
                Number of valid rows to return.
            conditions (dict):
                Conditions passed to the columns model ``sample`` method.

        Returns:
            pandas.DataFrame:
                The first ``num_rows`` valid rows.

        Raises:
            ValueError:
                If no valid row at all was found within 100 rounds.
        """
        sampled = self._columns_model.sample(num_rows=num_rows,
                                             conditions=conditions)
        sampled = self._hyper_transformer.reverse_transform(sampled)
        valid_rows = sampled[self.is_valid(sampled)]
        counter = 0
        total_sampled = num_rows

        while len(valid_rows) < num_rows:
            num_valid = len(valid_rows)
            if counter >= 100:
                if num_valid == 0:
                    error = 'Could not get enough valid rows within 100 trials.'
                    raise ValueError(error)
                else:
                    # Give up sampling: tile the valid rows to fill the request.
                    multiplier = num_rows // num_valid
                    num_rows_missing = num_rows % num_valid
                    remainder_rows = valid_rows.iloc[0:num_rows_missing, :]
                    valid_rows = pd.concat([valid_rows] * multiplier +
                                           [remainder_rows],
                                           ignore_index=True)
                    break

            remaining = num_rows - num_valid
            # Smoothed estimate of the acceptance rate, used to size the next
            # batch; the batch is capped at ten times the requested rows.
            valid_probability = (num_valid + 1) / (total_sampled + 1)
            max_rows = num_rows * 10
            num_to_sample = min(int(remaining / valid_probability), max_rows)
            total_sampled += num_to_sample
            new_sampled = self._columns_model.sample(num_rows=num_to_sample,
                                                     conditions=conditions)
            new_sampled = self._hyper_transformer.reverse_transform(
                new_sampled)
            new_valid_rows = new_sampled[self.is_valid(new_sampled)]
            valid_rows = pd.concat([valid_rows, new_valid_rows],
                                   ignore_index=True)
            counter += 1

        return valid_rows.iloc[0:num_rows, :]

    def _sample_constraint_columns(self, table_data):
        """Sample the constraint columns conditioned on those present in ``table_data``.

        Args:
            table_data (pandas.DataFrame):
                Table data containing a subset of the constraint columns.

        Returns:
            pandas.DataFrame:
                Sampled rows, one per row of ``table_data``, concatenated in
                group order with a fresh index.
        """
        condition_columns = [
            c for c in self.constraint_columns if c in table_data.columns
        ]
        grouped_conditions = table_data[condition_columns].groupby(
            condition_columns)
        all_sampled_rows = []
        for _, df in grouped_conditions:
            # Every row of a group shares the same condition values, so the
            # first transformed row describes the whole group.
            transformed_condition = self._hyper_transformer.transform(
                df).iloc[0].to_dict()
            sampled_rows = self._reject_sample(
                num_rows=df.shape[0], conditions=transformed_condition)
            all_sampled_rows.append(sampled_rows)

        sampled_data = pd.concat(all_sampled_rows, ignore_index=True)
        return sampled_data

    def _validate_constraint_columns(self, table_data):
        """Validate the columns in ``table_data``.

        If ``fit_columns_model`` is False and any columns in ``constraint_columns``
        are not present in ``table_data``, this method will raise a
        ``MissingConstraintColumnError``. Otherwise it will return the ``table_data``
        unchanged. If ``fit_columns_model`` is True, then this method will sample
        any missing ``constraint_columns`` from its model conditioned on the
        ``constraint_columns`` that ``table_data`` does contain. If ``table_data``
        doesn't contain any of the ``constraint_columns`` then a
        ``MissingConstraintColumnError`` will be raised.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                ``table_data`` itself if no constraint column is missing,
                otherwise the sampled constraint columns together with the
                remaining columns of ``table_data``.

        Raises:
            MissingConstraintColumnError:
                If columns are missing and there is no fitted columns model,
                or if all the constraint columns are missing.
        """
        missing_columns = [
            col for col in self.constraint_columns
            if col not in table_data.columns
        ]
        if missing_columns:
            if self._columns_model is None:
                warning_message = (
                    'When `fit_columns_model` is False and we are conditioning on a subset '
                    'of the constraint columns, conditional sampling uses reject sampling '
                    'which can be slow. Changing `fit_columns_model` to True can improve '
                    'the performance.')
                warnings.warn(warning_message, UserWarning)

            all_columns_missing = len(missing_columns) == len(
                self.constraint_columns)
            if self._columns_model is None or all_columns_missing:
                raise MissingConstraintColumnError()

            else:
                sampled_data = self._sample_constraint_columns(table_data)
                other_columns = [
                    c for c in table_data.columns
                    if c not in self.constraint_columns
                ]
                # NOTE(review): this assignment aligns on the index, while the
                # sampled rows carry a fresh RangeIndex in group order -
                # confirm the inputs always make both line up.
                sampled_data[other_columns] = table_data[other_columns]
                return sampled_data

        return table_data

    def transform(self, table_data):
        """Perform necessary transformations needed by constraint.

        Subclasses can optionally overwrite this method. If the transformation
        requires certain columns to be present in ``table_data``, then the subclass
        should overwrite the ``_transform`` method instead. This method raises a
        ``MissingConstraintColumnError`` if the ``table_data`` is missing any columns
        needed to do the transformation. If columns are present, this method will call
        the ``_transform`` method.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Output of ``_transform`` applied to the validated data.
        """
        table_data = self._validate_constraint_columns(table_data)
        return self._transform(table_data)

    def fit_transform(self, table_data):
        """Fit this Constraint to the data and then transform it.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.fit(table_data)
        return self.transform(table_data)

    def reverse_transform(self, table_data):
        """Identity method for completion. To be optionally overwritten by subclasses.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Input data unmodified.
        """
        return table_data

    def is_valid(self, table_data):
        """Say whether the given table rows are valid.

        This is a dummy version of the method that returns a series of ``True``
        values to avoid dropping any rows. This should be overwritten by all
        the subclasses that have a way to decide which rows are valid and which
        are not.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.Series:
                Series of ``True`` values
        """
        return pd.Series(True, index=table_data.index)

    def filter_valid(self, table_data):
        """Get only the rows that are valid.

        The filtering is done by calling the method ``is_valid``, which should
        be overwritten by subclasses, while this method should stay untouched.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                The rows of ``table_data`` that ``is_valid`` marks as valid.
        """
        valid = self.is_valid(table_data)
        invalid = sum(~valid)
        if invalid:
            LOGGER.debug('%s: %s invalid rows out of %s.',
                         self.__class__.__name__, invalid, len(valid))

        # Use the raw boolean values so that the mask is applied by position
        # rather than aligned on the index.
        if isinstance(valid, pd.Series):
            return table_data[valid.values]

        return table_data[valid]

    @classmethod
    def from_dict(cls, constraint_dict):
        """Build a Constraint object from a dict.

        Args:
            constraint_dict (dict):
                Dict containing the keyword ``constraint`` alongside
                any additional arguments needed to create the instance.

        Returns:
            Constraint:
                New constraint instance.
        """
        # Work on a copy so that the caller's dict keeps its ``constraint`` key.
        constraint_dict = constraint_dict.copy()
        constraint_class = constraint_dict.pop('constraint')
        subclasses = get_subclasses(cls)
        if isinstance(constraint_class, str):
            # A dotted name is imported; a bare name is looked up among the
            # subclasses of this class.
            if '.' in constraint_class:
                constraint_class = import_object(constraint_class)
            else:
                constraint_class = subclasses[constraint_class]

        return constraint_class(**constraint_dict)

    def to_dict(self):
        """Return a dict representation of this Constraint.

        The dictionary will contain the Qualified Name of the constraint
        class in the key ``constraint``, as well as any other arguments
        that were passed to the constructor when the instance was created.

        Returns:
            dict:
                Dict representation of this Constraint.
        """
        constraint_dict = {
            'constraint': _get_qualified_name(self.__class__),
        }

        # ``__kwargs__`` is presumably recorded by the metaclass at
        # construction time - confirm against ``ConstraintMeta``.
        for key, obj in copy.deepcopy(self.__kwargs__).items():
            if callable(obj) and _module_contains_callable_name(obj):
                constraint_dict[key] = _get_qualified_name(obj)
            else:
                constraint_dict[key] = obj

        return constraint_dict