def test_gaussian_copula():
    """Round-trip a fitted ``GaussianCopula`` through its parameters.

    A model is fitted on the demo ``users`` table, its parameters and
    metadata are used to build a second model, and the sample drawn from
    that second model is checked for shape, generated primary keys and
    anonymized country codes. The metadata of the fitted model is checked
    last.
    """
    demo_users = load_demo(metadata=False)['users']

    column_types = {
        'country': {'type': 'categorical'},
        'age': {'type': 'numerical', 'subtype': 'integer'},
    }
    fitted = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=column_types,
        primary_key='user_id',
        anonymize_fields={'country': 'country_code'},
        categorical_transformer='one_hot_encoding',
    )
    fitted.fit(demo_users)
    learned = fitted.get_parameters()

    # Rebuild an equivalent model from metadata + parameters only.
    rebuilt = GaussianCopula(
        table_metadata=fitted.get_metadata(),
        categorical_transformer='one_hot_encoding',
    )
    rebuilt.set_parameters(learned)
    synthetic = rebuilt.sample()

    # Same number of rows and columns as the training table.
    assert synthetic.shape == demo_users.shape

    # The primary key is regenerated as a consecutive integer ID.
    expected_ids = list(range(len(demo_users)))
    assert list(synthetic['user_id']) == expected_ids

    # Anonymization must have swapped the original country codes.
    original_codes = set(demo_users['country'].unique())
    synthetic_codes = set(synthetic['country'].unique())
    assert synthetic_codes != original_codes

    expected_fields = {
        'user_id': {'type': 'id', 'subtype': 'integer'},
        'country': {'type': 'categorical'},
        'gender': {'type': 'categorical'},
        'age': {'type': 'numerical', 'subtype': 'integer'},
    }
    described = fitted.get_metadata().to_dict()
    assert described['fields'] == expected_fields
    assert 'model_kwargs' in described
def test_set_parameters_negative_max_rows(self):
    """Test ``set_parameters`` when the flat dict carries a negative ``num_rows``.

    A negative ``num_rows`` is expected to be stored as zero.

    Setup:
        - A ``Mock`` stands in for the ``GaussianCopula`` instance.
        - Its ``_rebuild_gaussian_copula`` returns a parameters dict whose
          ``num_rows`` is ``-3``.

    Input:
        - Flat parameters dict with ``num_rows`` set to ``-3``.

    Output:
        - None

    Side Effects:
        - ``_rebuild_gaussian_copula`` is called exactly once with the
          unflattened version of the input dict.
        - ``_num_rows`` is set to ``0``.
        - ``_model`` holds a ``GaussianMultivariate`` instance.
    """
    # Setup
    # NOTE(review): ``Mock`` has no ``autospec`` option (that belongs to
    # ``patch``/``create_autospec``); this keyword only sets an attribute
    # named ``autospec`` on the mock. Possibly ``spec=`` was intended -- confirm.
    instance = Mock(autospec=GaussianCopula)
    rebuilt_params = {
        'univariates': [
            {
                'scale': 1.0,
                'loc': 5,
                'type': 'copulas.univariate.gaussian.GaussianUnivariate',
            },
        ],
        'columns': ['foo'],
        'num_rows': -3,
        'covariance': [[0.4, 0.17], [0.17, 0.07]],
    }
    instance._rebuild_gaussian_copula.return_value = rebuilt_params

    flat_params = {
        'univariates__foo__scale': 0.0,
        'univariates__foo__loc': 5,
        'covariance__0__0': 0.1,
        'covariance__1__0': 0.4,
        'covariance__1__1': 0.1,
        'num_rows': -3,
    }

    # Run
    GaussianCopula.set_parameters(instance, flat_params)

    # Asserts
    unflattened = {
        'univariates': {
            'foo': {'scale': 0.0, 'loc': 5},
        },
        'covariance': [[0.1], [0.4, 0.1]],
        'num_rows': -3,
    }
    instance._rebuild_gaussian_copula.assert_called_once_with(unflattened)
    assert instance._num_rows == 0
    assert isinstance(instance._model, GaussianMultivariate)
def test_gaussian_copula():
    """Check parameter export/import of ``GaussianCopula`` on the demo users.

    Two models are fitted on the demo ``users`` table:

    - with ``default_distribution='gaussian_kde'``, ``get_parameters`` is
      expected to raise ``NonParametricError``;
    - with ``default_distribution='bounded'``, the parameters are exported,
      loaded into a new model built from the metadata, and the sample drawn
      from that new model is validated together with the metadata.
    """
    demo_users = load_demo(metadata=False)['users']

    column_types = {
        'country': {'type': 'categorical'},
        'age': {'type': 'numerical', 'subtype': 'integer'},
    }
    pii_fields = {'country': 'country_code'}

    def build_model(default_distribution):
        # Both scenarios differ only in the default distribution.
        return GaussianCopula(
            field_names=['user_id', 'country', 'gender', 'age'],
            field_types=column_types,
            primary_key='user_id',
            anonymize_fields=pii_fields,
            field_distributions={'age': 'gamma'},
            default_distribution=default_distribution,
        )

    # Non-parametric distribution: parameters cannot be extracted.
    kde_model = build_model('gaussian_kde')
    kde_model.fit(demo_users)
    with pytest.raises(NonParametricError):
        kde_model.get_parameters()

    # Parametric distribution: the copula can be recreated from parameters.
    fitted = build_model('bounded')
    fitted.fit(demo_users)
    learned = fitted.get_parameters()

    rebuilt = GaussianCopula(table_metadata=fitted.get_metadata())
    rebuilt.set_parameters(learned)

    # Validate the sampled data.
    synthetic = rebuilt.sample()

    # Same number of rows and columns as the training table.
    assert synthetic.shape == demo_users.shape

    # The primary key is regenerated as a consecutive integer ID.
    expected_ids = list(range(len(demo_users)))
    assert list(synthetic['user_id']) == expected_ids

    # Anonymization must have swapped the original country codes.
    original_codes = set(demo_users['country'].unique())
    synthetic_codes = set(synthetic['country'].unique())
    assert synthetic_codes != original_codes

    # Validate the metadata of the fitted model.
    expected_fields = {
        'user_id': {
            'type': 'id',
            'subtype': 'integer',
            'transformer': 'integer',
        },
        'country': {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'country_code',
            'transformer': 'one_hot_encoding',
        },
        'gender': {
            'type': 'categorical',
            'transformer': 'one_hot_encoding',
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
            'transformer': 'integer',
        },
    }
    described = fitted.get_metadata().to_dict()
    assert described['fields'] == expected_fields

    assert 'model_kwargs' in described
    assert 'GaussianCopula' in described['model_kwargs']