예제 #1
0
def test_gaussian_copula():
    """Fit a ``GaussianCopula`` on the demo ``users`` table and rebuild it.

    The test fits one model, extracts its parameters, loads them into a
    second model built from the first model's metadata, and then checks
    the data sampled from the second model as well as the metadata
    reported by the first one.
    """
    real_users = load_demo(metadata=False)['users']

    # Fit the source model with explicit field configuration.
    fitted_model = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types={
            'age': {'type': 'numerical', 'subtype': 'integer'},
            'country': {'type': 'categorical'},
        },
        primary_key='user_id',
        anonymize_fields={'country': 'country_code'},
        categorical_transformer='one_hot_encoding',
    )
    fitted_model.fit(real_users)

    # Rebuild an equivalent model from the fitted parameters and metadata.
    fitted_parameters = fitted_model.get_parameters()
    rebuilt_model = GaussianCopula(
        table_metadata=fitted_model.get_metadata(),
        categorical_transformer='one_hot_encoding',
    )
    rebuilt_model.set_parameters(fitted_parameters)

    synthetic_users = rebuilt_model.sample()

    # The sampled table has the same dimensions as the real one.
    assert synthetic_users.shape == real_users.shape

    # The primary key is expected to be a fresh 0..N-1 sequence.
    expected_ids = list(range(len(real_users)))
    assert list(synthetic_users['user_id']) == expected_ids

    # Anonymization must yield a different set of country codes.
    synthetic_countries = set(synthetic_users['country'].unique())
    real_countries = set(real_users['country'].unique())
    assert synthetic_countries != real_countries

    # The fitted model reports the expected field definitions.
    metadata_dict = fitted_model.get_metadata().to_dict()
    expected_fields = {
        'user_id': {'type': 'id', 'subtype': 'integer'},
        'country': {'type': 'categorical'},
        'gender': {'type': 'categorical'},
        'age': {'type': 'numerical', 'subtype': 'integer'},
    }
    assert metadata_dict['fields'] == expected_fields

    assert 'model_kwargs' in metadata_dict
예제 #2
0
    def test_set_parameters_negative_max_rows(self):
        """Test ``set_parameters`` when ``num_rows`` is negative.

        A negative ``num_rows`` value is expected to be clamped to zero.

        ``GaussianCopula.set_parameters`` is expected to:
        - Unflatten the flat parameters dict back into its nested form.
        - Pass the unflattened dict to ``self._rebuild_gaussian_copula``.
        - Store ``0`` in the ``self._num_rows`` attribute.
        - Build a ``GaussianMultivariate`` instance from the rebuilt
          parameters and store it in the ``self._model`` attribute.

        Input:
        - flat parameters dict

        Output:
        - None

        Side Effects:
        - ``_rebuild_gaussian_copula`` is called once with the unflattened dict.
        - ``self._num_rows`` is set to ``0``.
        - ``self._model`` holds a ``GaussianMultivariate`` instance.
        """
        # Setup
        # NOTE(review): ``autospec`` is not a ``Mock`` constructor option, so this
        # only sets an ``autospec`` attribute on a plain mock -- confirm whether
        # ``create_autospec`` / ``spec=`` was intended.
        instance = Mock(autospec=GaussianCopula)
        rebuilt_params = {
            'univariates': [
                {
                    'scale': 1.0,
                    'loc': 5,
                    'type': 'copulas.univariate.gaussian.GaussianUnivariate',
                },
            ],
            'columns': ['foo'],
            'num_rows': -3,
            'covariance': [[0.4, 0.17], [0.17, 0.07]],
        }
        instance._rebuild_gaussian_copula.return_value = rebuilt_params

        flat_params = {
            'univariates__foo__scale': 0.0,
            'univariates__foo__loc': 5,
            'covariance__0__0': 0.1,
            'covariance__1__0': 0.4,
            'covariance__1__1': 0.1,
            'num_rows': -3,
        }

        # Run
        GaussianCopula.set_parameters(instance, flat_params)

        # Asserts
        expected_unflattened = {
            'covariance': [[0.1], [0.4, 0.1]],
            'num_rows': -3,
            'univariates': {'foo': {'loc': 5, 'scale': 0.0}},
        }
        instance._rebuild_gaussian_copula.assert_called_once_with(expected_unflattened)
        assert instance._num_rows == 0
        assert isinstance(instance._model, GaussianMultivariate)
예제 #3
0
def test_gaussian_copula():
    """Check parameter extraction and model rebuilding for ``GaussianCopula``.

    Two models are fitted on the demo ``users`` table:

    - With ``default_distribution='gaussian_kde'``, ``get_parameters`` is
      expected to raise ``NonParametricError``.
    - With ``default_distribution='bounded'``, the parameters are extracted
      and loaded into a new model created from the fitted model's metadata.
      The data sampled from that new model and the metadata of the fitted
      model are then validated.
    """
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical'
        }
    }
    anonymize_fields = {'country': 'country_code'}

    # If distribution is non parametric, get_parameters fails
    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        field_distributions={'age': 'gamma'},
        default_distribution='gaussian_kde',
    )
    gc.fit(users)
    with pytest.raises(NonParametricError):
        # The call is expected to raise, so its result is never bound.
        gc.get_parameters()

    # If distribution is parametric, copula can be recreated
    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        field_distributions={'age': 'gamma'},
        default_distribution='bounded',
    )
    gc.fit(users)

    parameters = gc.get_parameters()
    new_gc = GaussianCopula(table_metadata=gc.get_metadata())
    new_gc.set_parameters(parameters)

    # Validate sampled data
    sampled = new_gc.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    # country codes have been replaced with new ones
    assert set(sampled.country.unique()) != set(users.country.unique())

    # Validate metadata
    metadata = gc.get_metadata().to_dict()
    assert metadata['fields'] == {
        'user_id': {
            'type': 'id',
            'subtype': 'integer',
            'transformer': 'integer',
        },
        'country': {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'country_code',
            'transformer': 'one_hot_encoding',
        },
        'gender': {
            'type': 'categorical',
            'transformer': 'one_hot_encoding',
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
            'transformer': 'integer',
        }
    }

    assert 'model_kwargs' in metadata
    assert 'GaussianCopula' in metadata['model_kwargs']