Example #1
    def test__transform_constraints_raises_error(self):
        """Test that method raises error when specified.

        The ``_transform_constraints`` method is expected to raise ``MissingConstraintColumnError``
        if the constraint transform raises one and ``on_missing_column`` is set to error.

        Input:
        - Table data
        Side Effects:
        - MissingConstraintColumnError
        """
        # Setup
        data = pd.DataFrame({
            'item 0': [0, 1, 2],
            'item 1': [3, 4, 5]
        }, index=[0, 1, 2])
        constraint_mock = Mock()
        constraint_mock.transform.side_effect = MissingConstraintColumnError
        table_mock = Mock()
        table_mock._constraints = [constraint_mock]

        # Run/Assert
        with pytest.raises(MissingConstraintColumnError):
            Table._transform_constraints(table_mock, data, 'error')
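For context, a minimal sketch of the control flow this test and Examples 8 and 25 rely on, assuming the older ``_transform_constraints(data, on_missing_column)`` signature; this is an illustration, not the SDV source, and the import path assumes the pre-1.0 package layout.

from sdv.constraints.errors import MissingConstraintColumnError


def _transform_constraints(self, data, on_missing_column='error'):
    """Illustrative only: apply each constraint, honouring ``on_missing_column``."""
    for constraint in self._constraints:
        try:
            data = constraint.transform(data)
        except MissingConstraintColumnError:
            if on_missing_column == 'error':
                # Example 1: propagate the error to the caller.
                raise

            # Example 8: 'drop' removes the columns the constraint needs.
            columns_to_drop = [
                column for column in constraint.constraint_columns
                if column in data.columns
            ]
            data = data.drop(columns_to_drop, axis=1)

    return data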
Example #2
File: base.py Project: sdv-dev/SDV
    def __init__(self, field_names=None, field_types=None, anonymize_fields=None,
                 primary_key=None, entity_columns=None, context_columns=None,
                 sequence_index=None, segment_size=None, context_model=None,
                 table_metadata=None):
        if table_metadata is None:
            self._metadata = Table(
                field_names=field_names,
                primary_key=primary_key,
                field_types=field_types,
                anonymize_fields=anonymize_fields,
                dtype_transformers=self._DTYPE_TRANSFORMERS,
                sequence_index=sequence_index,
                entity_columns=entity_columns,
                context_columns=context_columns,
            )
            self._metadata_fitted = False
        else:
            null_args = {
                'field_names': field_names,
                'primary_key': primary_key,
                'field_types': field_types,
                'anonymize_fields': anonymize_fields,
                'sequence_index': sequence_index,
                'entity_columns': entity_columns,
                'context_columns': context_columns,
            }
            for arg_name, arg_value in null_args.items():
                if arg_value:
                    raise ValueError(
                        'If table_metadata is given {} must be None'.format(arg_name))

            if isinstance(table_metadata, dict):
                table_metadata = Table.from_dict(
                    table_metadata,
                    dtype_transformers=self._DTYPE_TRANSFORMERS,
                )

            self._metadata = table_metadata
            self._metadata_fitted = table_metadata.fitted

        # Validate arguments
        if segment_size is not None and not isinstance(segment_size, int):
            if sequence_index is None:
                raise TypeError(
                    '`segment_size` must be of type `int` if '
                    'no `sequence_index` is given.'
                )

            segment_size = pd.to_timedelta(segment_size)

        self._context_columns = self._metadata._context_columns
        self._entity_columns = self._metadata._entity_columns
        self._sequence_index = self._metadata._sequence_index
        self._segment_size = segment_size

        context_model = context_model or 'gaussian_copula'
        if isinstance(context_model, str):
            context_model = self._CONTEXT_MODELS[context_model]

        self._context_model_template = context_model
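A hedged usage sketch of the two construction paths above, assuming the pre-1.0 SDV timeseries API where ``PAR`` builds on this base class; the column names are made up and the private ``_metadata`` attribute is accessed only for illustration.

from sdv.timeseries import PAR

# Path 1: no table_metadata, so a Table is assembled from the keyword arguments
# and marked as not yet fitted.
model = PAR(
    entity_columns=['store_id'],
    context_columns=['region'],
    sequence_index='date',
)

# Path 2: pass an existing Table (or metadata dict) instead; every field-related
# argument must then be left as None, otherwise the constructor raises ValueError.
model_from_metadata = PAR(table_metadata=model._metadata)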
Example #3
    def test_fit_constraint_transform_errors(self):
        """Test the ``fit`` method when constraints error on transform.

        The ``fit`` method should loop through all the constraints and try to fit them. Then it
        should loop through again and try to transform. If any errors are raised, they should be
        caught and surfaced together.

        Setup:
            - Set the ``_constraints`` to be a list of mocked constraints.
            - Set constraint mocks to raise Exceptions when calling transform.

        Input:
            - A ``pandas.DataFrame``.

        Side effect:
            - A ``MultipleConstraintsErrors`` error should be raised.
        """
        # Setup
        data = pd.DataFrame({'a': [1, 2, 3]})
        instance = Table()
        constraint1 = Mock()
        constraint2 = Mock()
        constraint1.transform.side_effect = Exception('error 1')
        constraint2.transform.side_effect = Exception('error 2')
        instance._constraints = [constraint1, constraint2]

        # Run / Assert
        error_message = re.escape('\nerror 1\n\nerror 2')
        with pytest.raises(MultipleConstraintsErrors, match=error_message):
            instance.fit(data)

        constraint1.fit.assert_called_once_with(data)
        constraint2.fit.assert_called_once_with(data)
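A rough sketch of the aggregation this test expects from ``fit``, written as an assumption rather than the SDV source and wrapped in a hypothetical helper; the import path assumes the pre-1.0 layout.

from sdv.constraints.errors import MultipleConstraintsErrors


def _fit_and_transform_constraints(self, data):
    """Illustrative only: fit every constraint, then transform and collect errors."""
    for constraint in self._constraints:
        constraint.fit(data)

    errors = []
    for constraint in self._constraints:
        try:
            data = constraint.transform(data)
        except Exception as error:
            errors.append(error)

    if errors:
        # Builds a message like '\nerror 1\n\nerror 2', matching the test above.
        raise MultipleConstraintsErrors(
            '\n' + '\n\n'.join(str(error) for error in errors))

    return data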
Example #4
    def test__transform_constraints_is_condition_false_returns_data(self):
        """Test that method returns data unchanged when necessary.

        The ``_transform_constraints`` method is expected to return data unchanged
        when the constraint transform raises a ``MissingConstraintColumnError`` and the
        ``is_condition`` flag is False.

        Input:
            - Table data
        Output:
            - Table data, unchanged
        """
        # Setup
        data = pd.DataFrame({
            'item 0': [0, 1, 2],
            'item 1': [3, 4, 5]
        }, index=[0, 1, 2])
        constraint_mock = Mock()
        constraint_mock.transform.side_effect = MissingConstraintColumnError(missing_columns=[])
        constraint_mock.constraint_columns = ['item 0']
        table_instance = Table()
        table_instance._constraints = [constraint_mock]
        table_instance._constraints_to_reverse = [constraint_mock]

        # Run
        result = table_instance._transform_constraints(data, False)

        # Assert
        expected_result = pd.DataFrame({
            'item 0': [0, 1, 2],
            'item 1': [3, 4, 5]
        }, index=[0, 1, 2])
        assert result.equals(expected_result)
        assert table_instance._constraints_to_reverse == []
Example #5
    def test__prepare_constraints_invalid_order_raises_exception(
            self, from_dict_mock):
        """Test the ``_prepare_constraints`` method validates the constraint order.

        If one constraint has ``rebuild_columns`` that are in a later
        constraint's ``constraint_columns``, an exception should be raised.

        Input:
        - List of constraints with some having ``rebuild_columns``
        that are in a later constraint's ``constraint_columns``.
        Side Effect:
        - Exception should be raised.
        """
        # Setup
        constraint1 = Constraint(handling_strategy='reject_sampling')
        constraint2 = Constraint(handling_strategy='reject_sampling')
        constraint3 = Constraint(handling_strategy='transform')
        constraint4 = Constraint(handling_strategy='transform')
        constraints = [constraint1, constraint2, constraint3, constraint4]
        constraint3.rebuild_columns = ['a', 'd']
        constraint4.constraint_columns = ['a', 'b', 'c']
        constraint4.rebuild_columns = ['a']
        from_dict_mock.side_effect = [
            constraint1, constraint2, constraint3, constraint4
        ]

        # Run
        with pytest.raises(Exception):
            Table._prepare_constraints(constraints)
Example #6
    def __init__(self,
                 field_names=None,
                 field_types=None,
                 field_transformers=None,
                 anonymize_fields=None,
                 primary_key=None,
                 constraints=None,
                 table_metadata=None):
        if table_metadata is None:
            self._metadata = Table(
                field_names=field_names,
                primary_key=primary_key,
                field_types=field_types,
                anonymize_fields=anonymize_fields,
                constraints=constraints,
                dtype_transformers=self._DTYPE_TRANSFORMERS,
            )
            self._metadata_fitted = False
        else:
            null_args = {
                'field_names': field_names,
                'primary_key': primary_key,
                'field_types': field_types,
                'anonymize_fields': anonymize_fields,
                'constraints': constraints,
            }
            for arg_name, arg_value in null_args.items():
                if arg_value:
                    raise ValueError(
                        'If table_metadata is given {} must be None'.format(arg_name))

            if isinstance(table_metadata, dict):
                table_metadata = Table.from_dict(table_metadata)

            self._metadata = table_metadata
            self._metadata_fitted = table_metadata.fitted
Example #7
    def test__make_anonymization_mappings(self, mock_table):
        """Test that ``_make_anonymization_mappings`` creates the expected mappings.

        The ``_make_anonymization_mappings`` method should map values in the original
        data to fake values for non-id fields that are labeled pii.

        Setup:
        - Create a Table that has metadata about three fields (one pii field, one id field,
          and one non-pii field).
        Input:
        - Data that contains a pii field, an id field, and a non-pii field.
        Side Effects:
        - Expect ``_get_fake_values`` to be called with the number of unique values of the
          pii field.
        - Expect the resulting `_ANONYMIZATION_MAPPINGS` field to contain the pii field, with
          the correct number of mappings and keys.
        """
        # Setup
        metadata = Mock()
        metadata._ANONYMIZATION_MAPPINGS = {}
        foo_metadata = {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'email',
        }
        metadata._fields_metadata = {
            'foo': foo_metadata,
            'bar': {
                'type': 'categorical',
            },
            'baz': {
                'type': 'id',
            }
        }
        foo_values = [
            '*****@*****.**', '*****@*****.**', '*****@*****.**'
        ]
        data = pd.DataFrame({
            'foo': foo_values,
            'bar': ['a', 'b', 'c'],
            'baz': [1, 2, 3],
        })

        # Run
        Table._make_anonymization_mappings(metadata, data)

        # Assert
        mock_table._get_fake_values.assert_called_once_with(foo_metadata, 3)

        mappings = metadata._ANONYMIZATION_MAPPINGS[id(metadata)]
        assert len(mappings) == 1

        foo_mappings = mappings['foo']
        assert len(foo_mappings) == 3
        assert list(foo_mappings.keys()) == foo_values
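A minimal sketch that is consistent with these assertions, though not taken from the SDV source: only non-id fields flagged as pii are mapped, keyed by the unique original values.

def _make_anonymization_mappings(self, data):
    """Illustrative only: map unique original pii values to faked values."""
    mappings = {}
    for name, field_metadata in self._fields_metadata.items():
        if field_metadata['type'] != 'id' and field_metadata.get('pii'):
            uniques = data[name].unique()
            fake_values = self._get_fake_values(field_metadata, len(uniques))
            mappings[name] = dict(zip(uniques, fake_values))

    self._ANONYMIZATION_MAPPINGS[id(self)] = mappings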
Example #8
    def test__transform_constraints_drops_columns(self):
        """Test that method drops columns when specified.

        The ``_transform_constraints`` method is expected to drop the columns associated with
        a constraint when its transform raises a ``MissingConstraintColumnError`` and
        ``on_missing_column`` is set to ``'drop'``.

        Input:
        - Table data
        Output:
        - Table with dropped columns
        """
        # Setup
        data = pd.DataFrame({
            'item 0': [0, 1, 2],
            'item 1': [3, 4, 5]
        }, index=[0, 1, 2])
        constraint_mock = Mock()
        constraint_mock.transform.side_effect = MissingConstraintColumnError
        constraint_mock.constraint_columns = ['item 0']
        table_mock = Mock()
        table_mock._constraints = [constraint_mock]

        # Run
        result = Table._transform_constraints(table_mock, data, 'drop')

        # Assert
        expected_result = pd.DataFrame({'item 1': [3, 4, 5]}, index=[0, 1, 2])
        assert result.equals(expected_result)
Example #9
    def test__make_ids_unique_field_index_out_of_order(self):
        """Test that updated id column is unique even if index is out of order."""
        metadata_dict = {
            'fields': {
                'item 0': {
                    'type': 'id',
                    'subtype': 'integer'
                },
                'item 1': {
                    'type': 'boolean'
                }
            },
            'primary_key': 'item 0'
        }
        metadata = Table.from_dict(metadata_dict)
        data = pd.DataFrame(
            {
                'item 0': [0, 1, 1, 2, 3, 5, 5, 6],
                'item 1': [True, True, False, False, True, False, False, True]
            },
            index=[0, 1, 1, 2, 3, 5, 5, 6])

        new_data = metadata.make_ids_unique(data)

        assert new_data['item 1'].equals(data['item 1'])
        assert new_data['item 0'].is_unique
Example #10
    def test___init__(self, transformer_mock):
        """Test that ``__init__`` method passes parameters.

        The ``__init__`` method should pass the custom parameters
        to the ``NumericalTransformer``.

        Input:
        - rounding set to an int
        - max_value set to an int
        - min_value set to an int
        Side Effects:
        - ``NumericalTransformer`` should receive the correct parameters
        """
        # Run
        Table(rounding=-1, max_value=100, min_value=-50)

        # Asserts
        assert len(transformer_mock.mock_calls) == 2
        transformer_mock.assert_any_call(dtype=int,
                                         rounding=-1,
                                         max_value=100,
                                         min_value=-50)
        transformer_mock.assert_any_call(dtype=float,
                                         rounding=-1,
                                         max_value=100,
                                         min_value=-50)
Example #11
    def test__prepare_constraints_validates_constraint_order(
            self, from_dict_mock):
        """Test the ``_prepare_constraints`` method validates the constraint order.

        If no constraint has ``rebuild_columns`` that are in a later
        constraint's ``constraint_columns``, no exception should be raised.

        Input:
        - List of constraints with none having ``rebuild_columns``
        that are in a later constraint's ``constraint_columns``.
        Output:
        - Sorted list of constraints.
        """
        # Setup
        constraint1 = Constraint(handling_strategy='reject_sampling')
        constraint2 = Constraint(handling_strategy='reject_sampling')
        constraint3 = Constraint(handling_strategy='transform')
        constraint4 = Constraint(handling_strategy='transform')
        constraints = [constraint1, constraint2, constraint3, constraint4]
        constraint3.rebuild_columns = ['e', 'd']
        constraint4.constraint_columns = ['a', 'b', 'c']
        constraint4.rebuild_columns = ['a']
        from_dict_mock.side_effect = [
            constraint1, constraint2, constraint3, constraint4
        ]

        # Run
        sorted_constraints = Table._prepare_constraints(constraints)

        # Assert
        assert sorted_constraints == constraints
Example #12
    def test__prepare_constraints_sorts_constraints_none_rebuild_columns(
            self, from_dict_mock):
        """Test that ``_prepare_constraints`` method sorts constraints.

        The ``_prepare_constraints`` method should sort constraints with None as
        ``rebuild_columns`` before those that have them.

        Input:
        - list of constraints with some having None as ``rebuild_columns``
        listed after those with ``rebuild_columns``.
        Output:
        - List of constraints sorted properly.
        """
        # Setup
        constraint1 = Constraint(handling_strategy='transform')
        constraint2 = Constraint(handling_strategy='transform')
        constraint3 = Constraint(handling_strategy='reject_sampling')
        constraints = [constraint1, constraint2, constraint3]
        constraint1.rebuild_columns = ['a']
        constraint2.rebuild_columns = ['b']
        constraint3.rebuild_columns = None
        from_dict_mock.side_effect = [constraint1, constraint2, constraint3]

        # Run
        sorted_constraints = Table._prepare_constraints(constraints)

        # Asserts
        assert sorted_constraints == [constraint3, constraint1, constraint2]
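A minimal sketch of the ordering rule that Examples 5, 11 and 12 exercise, written as an assumption rather than the actual implementation; the dict-to-``Constraint`` conversion handled by the mocked ``Constraint.from_dict`` is omitted.

def _prepare_constraints(constraints):
    """Illustrative only: sort constraints and validate their order."""
    # Constraints without rebuild_columns go first; sorted() is stable, so the
    # relative order within each group is preserved.
    constraints = sorted(constraints, key=lambda c: bool(c.rebuild_columns))

    for index, constraint in enumerate(constraints):
        rebuilt = set(constraint.rebuild_columns or [])
        for later in constraints[index + 1:]:
            overlap = rebuilt & set(later.constraint_columns or [])
            if overlap:
                raise Exception(
                    'Invalid constraint order: columns {} are rebuilt before '
                    'a later constraint uses them.'.format(sorted(overlap)))

    return constraints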
Example #13
    def test__validate_data_on_constraints(self):
        """Test the ``Table._validate_data_on_constraints`` method.

        Expect that the method passes silently (returns ``None``) when the constraint columns
        are in the given data and the ``constraint.is_valid`` method returns ``True`` for all rows.

        Input:
        - Table data
        Output:
        - None
        Side Effects:
        - No error
        """
        # Setup
        data = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}, index=[0, 1, 2])
        constraint_mock = Mock()
        constraint_mock.is_valid.return_value = pd.Series([True, True, True])
        constraint_mock.constraint_columns = ['a', 'b']
        table_mock = Mock()
        table_mock._constraints = [constraint_mock]

        # Run
        result = Table._validate_data_on_constraints(table_mock, data)

        # Assert
        assert result is None
Example #14
    def test__validate_data_on_constraints_missing_cols(self):
        """Test the ``Table._validate_data_on_constraints`` method.

        Expect that the method passes silently (returns ``None``) when the constraint
        columns are not all present in the given data.

        Input:
        - Table data that is missing a constraint column
        Output:
        - None
        Side Effects:
        - No error
        """
        # Setup
        data = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}, index=[0, 1, 2])
        constraint_mock = Mock()
        constraint_mock.constraint_columns = ['a', 'b', 'c']
        table_mock = Mock()
        table_mock._constraints = [constraint_mock]

        # Run
        result = Table._validate_data_on_constraints(table_mock, data)

        # Assert
        assert result is None
Example #15
    def __init__(self, field_names=None, field_types=None, field_transformers=None,
                 anonymize_fields=None, primary_key=None, constraints=None, table_metadata=None,
                 distribution=None, default_distribution=None, categorical_transformer=None):

        if isinstance(table_metadata, dict):
            table_metadata = Table.from_dict(table_metadata)

        if table_metadata:
            model_kwargs = table_metadata.get_model_kwargs(self.__class__.__name__)
            if model_kwargs:
                if distribution is None:
                    distribution = model_kwargs['distribution']

                if categorical_transformer is None:
                    categorical_transformer = model_kwargs['categorical_transformer']

        self._distribution = distribution
        self._default_distribution = default_distribution or 'parametric'

        categorical_transformer = categorical_transformer or self._DEFAULT_TRANSFORMER
        self._categorical_transformer = categorical_transformer
        self._DTYPE_TRANSFORMERS = {'O': categorical_transformer}

        super().__init__(
            field_names=field_names,
            primary_key=primary_key,
            field_types=field_types,
            anonymize_fields=anonymize_fields,
            constraints=constraints,
            table_metadata=table_metadata
        )
Example #16
    def __init__(self,
                 field_names=None,
                 field_types=None,
                 field_transformers=None,
                 anonymize_fields=None,
                 primary_key=None,
                 constraints=None,
                 table_metadata=None,
                 field_distributions=None,
                 default_distribution=None,
                 categorical_transformer=None):

        if isinstance(table_metadata, dict):
            table_metadata = Table.from_dict(table_metadata)

        if table_metadata:
            model_kwargs = table_metadata.get_model_kwargs(
                self.__class__.__name__)
            if model_kwargs:
                if field_distributions is None:
                    field_distributions = model_kwargs['field_distributions']

                if default_distribution is None:
                    default_distribution = model_kwargs['default_distribution']

                if categorical_transformer is None:
                    categorical_transformer = model_kwargs[
                        'categorical_transformer']

        if field_distributions and not isinstance(field_distributions, dict):
            raise TypeError(
                'field_distributions can only be None or a dict instance')

        self._field_distributions = {
            field: self._validate_distribution(distribution)
            for field, distribution in (field_distributions or {}).items()
        }
        self._default_distribution = (
            self._validate_distribution(default_distribution)
            or self._DEFAULT_DISTRIBUTION)

        self._categorical_transformer = categorical_transformer or self._DEFAULT_TRANSFORMER
        self._DTYPE_TRANSFORMERS = {'O': self._categorical_transformer}

        super().__init__(
            field_names=field_names,
            field_types=field_types,
            field_transformers=field_transformers,
            anonymize_fields=anonymize_fields,
            primary_key=primary_key,
            constraints=constraints,
            table_metadata=table_metadata,
        )

        self._metadata.set_model_kwargs(
            self.__class__.__name__, {
                'field_distributions': field_distributions,
                'default_distribution': default_distribution,
                'categorical_transformer': categorical_transformer,
            })
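A hedged usage sketch for the constructor above, assuming the pre-1.0 ``sdv.tabular.GaussianCopula`` API; the field name, distribution choices and the private ``_metadata`` access are illustrative only.

from sdv.tabular import GaussianCopula

# The distributions are validated and stored, then echoed back into the metadata
# via set_model_kwargs so they survive being rebuilt from that metadata.
model = GaussianCopula(
    field_distributions={'age': 'gamma'},
    default_distribution='gaussian',
    categorical_transformer='label_encoding',
)

# Passing the resulting metadata back recreates an equivalent model: the stored
# model kwargs fill in any constructor arguments left as None.
same_settings = GaussianCopula(table_metadata=model._metadata)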
Example #17
    def test__get_faker_specified_locales_list(self):
        """Test that ``_get_faker`` with locales parameter sets localization correctly.

        The ``_get_faker`` should return a Faker object localized to the specified locales.

        Input:
        - Field metadata from metadata dict.
        Output:
        - Faker object with specified list of localizations.
        """
        # Setup
        metadata_dict = {
            'fields': {
                'foo': {
                    'type': 'categorical',
                    'pii': True,
                    'pii_category': 'company',
                    'pii_locales': ['en_US', 'sv_SE']
                }
            }
        }

        # Run
        faker = Table.from_dict(metadata_dict)._get_faker(metadata_dict['fields']['foo'])

        # Assert
        assert isinstance(faker, Faker)
        assert faker.locales == ['en_US', 'sv_SE']
Example #18
    def test__get_faker_default_locale(self):
        """Test that ``_get_faker`` without locales parameter has default locale.

        The ``_get_faker`` method should return a Faker object localized to the default
        locale when no locales are specified explicitly.

        Input:
        - Field metadata from metadata dict.
        Output:
        - Faker object with default localization.
        """
        # Setup
        metadata_dict = {
            'fields': {
                'foo': {
                    'type': 'categorical',
                    'pii': True,
                    'pii_category': 'company'
                }
            }
        }

        # Run
        faker = Table.from_dict(metadata_dict)._get_faker(metadata_dict['fields']['foo'])

        # Assert
        assert isinstance(faker, Faker)
        assert faker.locales == [DEFAULT_LOCALE]
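A minimal sketch, consistent with Examples 17 and 18 but not taken from the SDV source, of the locale handling being tested: ``pii_locales`` selects the Faker localization and ``None`` falls back to Faker's default.

from faker import Faker


def _get_faker(field_metadata):
    """Illustrative only: build a Faker localized according to the field metadata."""
    return Faker(locale=field_metadata.get('pii_locales', None))


faker = _get_faker({'pii_locales': ['en_US', 'sv_SE']})
assert faker.locales == ['en_US', 'sv_SE']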
Example #19
    def test_from_dict_min_max(self):
        """Test the ``Table.from_dict`` method.

        Expect that when ``min_value``, ``max_value`` and ``rounding`` are not provided,
        they default to ``'auto'`` in the numerical transformer templates.

        Input:
        - A dictionary representing a table's metadata
        Output:
        - A Table object
        """
        # Setup
        metadata_dict = {
            'fields': {
                'item 0': {'type': 'id', 'subtype': 'integer'},
                'item 1': {'type': 'boolean'}
            },
            'primary_key': 'item 0'
        }

        # Run
        metadata = Table.from_dict(metadata_dict)

        # Assert
        assert metadata._transformer_templates['integer'].max_value == 'auto'
        assert metadata._transformer_templates['integer'].min_value == 'auto'
        assert metadata._transformer_templates['integer'].rounding == 'auto'
        assert metadata._transformer_templates['float'].max_value == 'auto'
        assert metadata._transformer_templates['float'].min_value == 'auto'
        assert metadata._transformer_templates['float'].rounding == 'auto'
Example #20
    def test__transform_constraints_is_condition_drops_columns(self):
        """Test that method drops columns when necessary.

        The ``_transform_constraints`` method is expected to drop columns associated with
        a constraint when its transform raises a ``MissingConstraintColumnError`` and the
        ``is_condition`` flag is True.

        Input:
            - Table data
            - ``is_condition`` set to True
        Output:
            - Table with dropped columns
        """
        # Setup
        data = pd.DataFrame({
            'item 0': [0, 1, 2],
            'item 1': [3, 4, 5]
        }, index=[0, 1, 2])
        constraint_mock = Mock()
        constraint_mock.transform.side_effect = MissingConstraintColumnError(missing_columns=[])
        constraint_mock.constraint_columns = ['item 0']
        table_mock = Mock()
        table_mock._constraints = [constraint_mock]

        # Run
        result = Table._transform_constraints(table_mock, data, True)

        # Assert
        expected_result = pd.DataFrame({
            'item 1': [3, 4, 5]
        }, index=[0, 1, 2])
        assert result.equals(expected_result)
Example #21
    def test__get_faker_method_pass_args(self):
        """Test that ``_get_faker_method`` method utilizes parameters passed in category argument.

        The ``_get_faker_method`` method uses the parameters passed to it in the category argument.

        Input:
        - Faker object to create faked values with.
        - Category tuple of category name and parameters passed to the method creating fake values.
        Output:
        - Fake values created with the specified method from the Faker object.
        Utilizing the arguments given to it.
        """
        # Setup
        metadata_dict = {
            'fields': {
                'foo': {
                    'type': 'categorical',
                    'pii': True,
                    'pii_category': 'ean'
                }
            }
        }
        metadata = Table.from_dict(metadata_dict)

        # Run
        fake_8_ean = metadata._get_faker_method(Faker(), ('ean', 8))
        ean_8 = fake_8_ean()

        fake_13_ean = metadata._get_faker_method(Faker(), ('ean', 13))
        ean_13 = fake_13_ean()

        # Assert
        assert len(ean_8) == 8
        assert len(ean_13) == 13
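A rough sketch, assumed rather than taken from the SDV source, of the lookup these assertions depend on: when the category is a tuple, the first item names the Faker method and the remaining items become its arguments.

from faker import Faker


def _get_faker_method(faker, category):
    """Illustrative only: resolve a pii category into a callable producing fake values."""
    if isinstance(category, (tuple, list)):
        return lambda: getattr(faker, category[0])(*category[1:])

    return lambda: getattr(faker, category)()


assert len(_get_faker_method(Faker(), ('ean', 8))()) == 8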
Example #22
    def test___init__calls_prepare_constraints(self,
                                               _prepare_constraints_mock):
        """Test that ``__init__`` method calls ``_prepare_constraints"""
        # Run
        Table(constraints=[])

        # Assert
        _prepare_constraints_mock.assert_called_once_with([])
Example #23
    def test_fit_constraint_transform_missing_columns_error(self, warnings_mock):
        """Test the ``fit`` method when transform raises a errors.

        The ``fit`` method should loop through all the constraints and try to fit them. Then it
        should loop through again and try to transform. If a ``MissingConstraintColumnError`` or
        ``FunctionError`` is raised, a warning should be raised and reject sampling should be used.

        Setup:
            - Set the ``_constraints`` to be a list of mocked constraints.
            - Set constraint mocks to raise ``MissingConstraintColumnError`` and ``FunctionError``
            when calling transform.
            - Mock warnings module.

        Input:
            - A ``pandas.DataFrame``.

        Side effect:
            - ``MissingConstraintColumnError`` and ``FunctionError`` warning messages.
        """
        # Setup
        data = pd.DataFrame({'a': [1, 2, 3]})
        instance = Table()
        constraint1 = Mock()
        constraint2 = Mock()
        constraint3 = Mock()
        constraint1.transform.return_value = data
        constraint2.transform.side_effect = MissingConstraintColumnError(['column'])
        constraint3.transform.side_effect = FunctionError()
        instance._constraints = [constraint1, constraint2, constraint3]

        # Run
        instance.fit(data)

        # Assert
        constraint1.fit.assert_called_once_with(data)
        constraint2.fit.assert_called_once_with(data)
        constraint3.fit.assert_called_once_with(data)
        assert warnings_mock.warn.call_count == 2
        warning_message1 = (
            "Mock cannot be transformed because columns: ['column'] were not found. Using the "
            'reject sampling approach instead.'
        )
        warning_message2 = 'Error transforming Mock. Using the reject sampling approach instead.'
        warnings_mock.warn.assert_has_calls([call(warning_message1), call(warning_message2)])
Example #24
    def _fit_metadata(self, data):
        """Generate a new Table metadata and fit it to the data.

        The information provided will be used to create the Table instance
        and then the rest of the information will be learned from the given
        data.

        Args:
            data (pandas.DataFrame):
                Data to learn from.
        """
        metadata = Table(
            field_names=self._field_names,
            primary_key=self._primary_key,
            field_types=self._field_types,
            anonymize_fields=self._anonymize_fields,
            transformer_templates=self.TRANSFORMER_TEMPLATES,
        )
        metadata.fit(data)

        self._metadata = metadata
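A hedged illustration of the same flow used directly, assuming the pre-1.0 ``sdv.metadata.Table`` API; the column names are made up.

import pandas as pd

from sdv.metadata import Table

data = pd.DataFrame({'user_id': [1, 2, 3], 'age': [31, 45, 28]})

# Only the hints below are given up front; the field types and the rest of the
# metadata are learned from the data during fit.
metadata = Table(field_names=['user_id', 'age'], primary_key='user_id')
metadata.fit(data)
print(metadata.to_dict())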
Example #25
    def test_transform_calls__transform_constraints(self):
        """Test that the `transform` method calls `_transform_constraints` with right parameters

        The ``transform`` method is expected to call the ``_transform_constraints`` method
        with the data and correct value for ``on_missing_column``.

        Input:
        - Table data
        Side Effects:
        - Calls _transform_constraints
        """
        # Setup
        data = pd.DataFrame({
            'item 0': [0, 1, 2],
            'item 1': [True, True, False]
        }, index=[0, 1, 2])
        dtypes = {'item 0': 'int', 'item 1': 'bool'}
        table_mock = Mock()
        table_mock.get_dtypes.return_value = dtypes
        table_mock._transform_constraints.return_value = data
        table_mock._anonymize.return_value = data
        table_mock._hyper_transformer.transform.return_value = data

        # Run
        Table.transform(table_mock, data, 'error')

        # Assert
        expected_data = pd.DataFrame({
            'item 0': [0, 1, 2],
            'item 1': [True, True, False]
        }, index=[0, 1, 2])
        mock_calls = table_mock._transform_constraints.mock_calls
        args = mock_calls[0][1]
        assert len(mock_calls) == 1
        assert args[0].equals(expected_data)
        assert args[1] == 'error'
Example #26
    def test__make_anonymization_mappings_unique_faked_value_in_field(
            self, mock_table):
        """Test that ``_make_anonymization_mappings`` method creates mappings for anonymized values.

        The ``_make_anonymization_mappings`` method should map equal values in the original data
        to the same faked value.

        Input:
        - DataFrame with a field that should be anonymized based on the metadata description.
        Side Effect:
        - Mappings are created from the original values to faked values.
        """
        # Setup
        metadata = Mock()
        metadata._ANONYMIZATION_MAPPINGS = {}
        foo_metadata = {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'email'
        }
        metadata._fields_metadata = {'foo': foo_metadata}
        data = pd.DataFrame({
            'foo': ['*****@*****.**', '*****@*****.**', '*****@*****.**']
        })

        # Run
        Table._make_anonymization_mappings(metadata, data)

        # Assert
        mock_table._get_fake_values.assert_called_once_with(foo_metadata, 2)

        mappings = metadata._ANONYMIZATION_MAPPINGS[id(metadata)]
        assert len(mappings) == 1

        foo_mappings = mappings['foo']
        assert len(foo_mappings) == 2
        assert list(foo_mappings.keys()) == ['*****@*****.**', '*****@*****.**']
Example #27
File: base.py Project: sdv-dev/SDV
    def __init__(self,
                 field_names=None,
                 field_types=None,
                 field_transformers=None,
                 anonymize_fields=None,
                 primary_key=None,
                 constraints=None,
                 table_metadata=None,
                 rounding='auto',
                 min_value='auto',
                 max_value='auto'):
        if table_metadata is None:
            self._metadata = Table(field_names=field_names,
                                   primary_key=primary_key,
                                   field_types=field_types,
                                   field_transformers=field_transformers,
                                   anonymize_fields=anonymize_fields,
                                   constraints=constraints,
                                   dtype_transformers=self._DTYPE_TRANSFORMERS,
                                   rounding=rounding,
                                   min_value=min_value,
                                   max_value=max_value)
            self._metadata_fitted = False
        else:
            table_metadata = deepcopy(table_metadata)
            null_args = {
                'field_names': field_names,
                'primary_key': primary_key,
                'field_types': field_types,
                'anonymize_fields': anonymize_fields,
                'constraints': constraints,
            }
            for arg_name, arg_value in null_args.items():
                if arg_value:
                    raise ValueError(
                        'If table_metadata is given {} must be None'.format(arg_name))

            if isinstance(table_metadata, dict):
                table_metadata = Table.from_dict(table_metadata)

            table_metadata._dtype_transformers.update(self._DTYPE_TRANSFORMERS)

            self._metadata = table_metadata
            self._metadata_fitted = table_metadata.fitted
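A hedged usage sketch for this constructor, assuming the pre-1.0 ``sdv.tabular`` API in which models such as ``CTGAN`` build on this base class; the values are illustrative.

from sdv.tabular import CTGAN

# rounding, min_value and max_value are forwarded into the Table metadata and from
# there into the numerical transformers (compare Examples 10 and 19).
model = CTGAN(rounding=2, min_value=0, max_value=1000)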
Example #28
    def test__transform_constraints(self):
        """Test that method correctly transforms data based on constraints

        The ``_transform_constraints`` method is expected to loop through constraints
        and call each constraint's ``transform`` method on the data.

        Input:
            - Table data
        Output:
            - Transformed data
        """
        # Setup
        data = pd.DataFrame({
            'item 0': [0, 1, 2],
            'item 1': [3, 4, 5]
        }, index=[0, 1, 2])
        transformed_data = pd.DataFrame({
            'item 0': [0, 0.5, 1],
            'item 1': [6, 8, 10]
        }, index=[0, 1, 2])
        first_constraint_mock = Mock()
        second_constraint_mock = Mock()
        first_constraint_mock.transform.return_value = transformed_data
        second_constraint_mock.transform.return_value = transformed_data
        table_instance = Table()
        table_instance._constraints = [first_constraint_mock, second_constraint_mock]

        # Run
        result = table_instance._transform_constraints(data)

        # Assert
        assert result.equals(transformed_data)
        first_constraint_mock.transform.assert_called_once_with(data)
        second_constraint_mock.transform.assert_called_once_with(transformed_data)
        assert table_instance._constraints_to_reverse == [
            first_constraint_mock,
            second_constraint_mock
        ]
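A minimal sketch of the newer variant that Examples 4, 20 and 28 test, again an assumption rather than the SDV source: ``is_condition`` replaces ``on_missing_column``, and constraints that transform successfully are remembered so they can be reversed later. The import path assumes the pre-1.0 layout.

from sdv.constraints.errors import MissingConstraintColumnError


def _transform_constraints(self, data, is_condition=False):
    """Illustrative only: transform data and track reversible constraints."""
    self._constraints_to_reverse = []
    for constraint in self._constraints:
        try:
            data = constraint.transform(data)
            self._constraints_to_reverse.append(constraint)
        except MissingConstraintColumnError:
            if is_condition:
                # Conditioning data: drop the columns the constraint needs (Example 20).
                columns_to_drop = [
                    column for column in constraint.constraint_columns
                    if column in data.columns
                ]
                data = data.drop(columns_to_drop, axis=1)
            # Otherwise the data is left unchanged for this constraint (Example 4).

    return data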
Example #29
    def test_fit_fits_and_transforms_constraints(self):
        """Test the ``fit`` method.

        The ``fit`` method should loop through all the constraints, fit them,
        and then call ``transform`` for all of them.

        Setup:
            - Set the ``_constraints`` to be a list of mocked constraints.

        Input:
            - A ``pandas.DataFrame``.

        Output:
            - Same ``pandas.DataFrame``.

        Side effect:
            - Each constraint should be fit and transform the data.
        """
        # Setup
        data = pd.DataFrame({'a': [1, 2, 3]})
        transformed_data = pd.DataFrame({'a': [4, 5, 6]})
        instance = Table()
        constraint1 = Mock()
        constraint2 = Mock()
        constraint1.transform.return_value = transformed_data
        constraint2.transform.return_value = data
        instance._constraints = [constraint1, constraint2]

        # Run
        instance.fit(data)

        # Assert
        constraint1.fit.assert_called_once_with(data)
        constraint2.fit.assert_called_once_with(data)
        constraint1.transform.assert_called_once_with(data)
        constraint2.transform.assert_called_once_with(transformed_data)
Example #30
    def test__validate_data_on_constraints_invalid_input(self):
        """Test the ``Table._validate_data_on_constraints`` method.

        Expect that the method raises a ``ConstraintsNotMetError`` when the constraint columns
        are in the given data and the ``constraint.is_valid`` method returns ``False`` for any row.

        Input:
        - Table data contains an invalid row
        Output:
        - None
        Side Effects:
        - A ``ConstraintsNotMetError`` is raised
        """
        # Setup
        data = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}, index=[0, 1, 2])
        constraint_mock = Mock()
        constraint_mock.is_valid.return_value = pd.Series([True, False, True])
        constraint_mock.constraint_columns = ['a', 'b']
        table_mock = Mock()
        table_mock._constraints = [constraint_mock]

        # Run and assert
        with pytest.raises(ConstraintsNotMetError):
            Table._validate_data_on_constraints(table_mock, data)
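Finally, a rough sketch of the validation behavior that Examples 13, 14 and 30 describe, written as an assumption rather than the actual implementation: a constraint is only checked when all of its columns are present, and any invalid row raises ``ConstraintsNotMetError`` (import path assumed for the pre-1.0 layout).

from sdv.errors import ConstraintsNotMetError


def _validate_data_on_constraints(self, data):
    """Illustrative only: raise if any applicable constraint marks a row invalid."""
    for constraint in self._constraints:
        if set(constraint.constraint_columns).issubset(data.columns):
            if not constraint.is_valid(data).all():
                raise ConstraintsNotMetError(
                    'Data is not valid for the given constraints')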