def test_gaussian_copula_one_hot_encoding():
    """End-to-end fit/recreate/sample with the one-hot categorical transformer.

    Renamed from ``test_gaussian_copula``: another test of that exact name is
    defined later in this module, so this one was silently shadowed and never
    collected by pytest. The distinct name restores it while keeping the
    ``test_`` prefix for discovery.

    Checks that:
        - a model fitted with explicit field names/types, a primary key and
          anonymized fields can round-trip its parameters into a fresh model,
        - the sample has the original shape, a sequential ``user_id`` ID
          column, and anonymized country codes,
        - the exported metadata dict reports the declared field types and
          carries ``model_kwargs``.
    """
    users = load_demo(metadata=False)['users']

    # Explicit field typing for a subset of columns; the rest are inferred.
    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical'
        }
    }
    anonymize_fields = {'country': 'country_code'}

    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        categorical_transformer='one_hot_encoding',
    )
    gc.fit(users)

    # Recreate an equivalent model purely from metadata + learned parameters.
    parameters = gc.get_parameters()
    new_gc = GaussianCopula(
        table_metadata=gc.get_metadata(),
        categorical_transformer='one_hot_encoding',
    )
    new_gc.set_parameters(parameters)
    sampled = new_gc.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    # country codes have been replaced with new ones
    assert set(sampled.country.unique()) != set(users.country.unique())

    metadata = gc.get_metadata().to_dict()
    assert metadata['fields'] == {
        'user_id': {
            'type': 'id',
            'subtype': 'integer'
        },
        'country': {
            'type': 'categorical'
        },
        'gender': {
            'type': 'categorical'
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer'
        }
    }

    assert 'model_kwargs' in metadata
def test___init___copies_metadata():
    """Test the ``__init__`` method.

    This test assures that the metadata provided to the model is copied,
    so that any modifications don't change the input.

    Setup:
        - Initialize two models with the same metadata and data.

    Expected behavior:
        - The metadata for each model and the provided metadata should all
          be different objects.
    """
    # Setup
    metadata, data = load_tabular_demo('student_placements', metadata=True)

    # Run: fit two models sharing the same input metadata but different
    # default distributions.
    gamma_model = GaussianCopula(
        table_metadata=metadata,
        categorical_transformer='label_encoding',
        default_distribution='gamma',
    )
    gamma_model.fit(data)

    beta_model = GaussianCopula(
        table_metadata=metadata,
        categorical_transformer='label_encoding',
        default_distribution='beta',
    )
    beta_model.fit(data)

    # Assert: each model holds its own metadata copy.
    assert gamma_model._metadata != metadata
    assert gamma_model._metadata != beta_model._metadata
    assert beta_model._metadata != metadata

    # Assert: each model kept its own default distribution.
    gamma = 'copulas.univariate.gamma.GammaUnivariate'
    beta = 'copulas.univariate.beta.BetaUnivariate'
    for distribution in gamma_model.get_distributions().values():
        assert distribution == gamma

    for distribution in beta_model.get_distributions().values():
        assert distribution == beta
def test_conditional_sampling_constraint_uses_reject_sampling( gm_mock, isinstance_mock): """Test that the ``sample`` method handles constraints with conditions. The ``sample`` method is expected to properly apply constraint transformations by dropping columns that cannot be conditonally sampled on due to them being part of a constraint. Setup: - The model is being passed a ``UniqueCombination`` constraint and then asked to sample with two conditions, one of which the constraint depends on. The constraint is expected to skip its transformations since only some of the columns are provided by the conditions and the model will use reject sampling to meet the constraint instead. Input: - Conditions Side Effects: - Correct columns to condition on are passed to underlying sample method """ # Setup isinstance_mock.side_effect = _isinstance_side_effect constraint = FixedCombinations(column_names=['city', 'state']) data = pd.DataFrame({ 'city': ['LA', 'SF', 'CHI', 'LA', 'LA'], 'state': ['CA', 'CA', 'IL', 'CA', 'CA'], 'age': [27, 28, 26, 21, 30] }) model = GaussianCopula(constraints=[constraint], categorical_transformer='label_encoding') sampled_numeric_data = [ pd.DataFrame({ 'city#state.value': [0, 1, 2, 0, 0], 'age.value': [30, 30, 30, 30, 30] }), pd.DataFrame({ 'city#state.value': [1], 'age.value': [30] }) ] gm_mock.return_value.sample.side_effect = sampled_numeric_data model.fit(data) # Run conditions = [Condition({'age': 30, 'state': 'CA'}, num_rows=5)] sampled_data = model.sample_conditions(conditions=conditions) # Assert expected_transformed_conditions = {'age.value': 30} expected_data = pd.DataFrame({ 'city': ['LA', 'SF', 'LA', 'LA', 'SF'], 'state': ['CA', 'CA', 'CA', 'CA', 'CA'], 'age': [30, 30, 30, 30, 30] }) sample_calls = model._model.sample.mock_calls assert len(sample_calls) == 2 model._model.sample.assert_any_call( 50, conditions=expected_transformed_conditions) pd.testing.assert_frame_equal(sampled_data, expected_data)
def test_sample_empty_transformed_conditions(): """Test that None is passed to ``_sample_batch`` if transformed conditions are empty. The ``Sample`` method is expected to: - Return sampled data and pass None to ``sample_batch`` as the ``transformed_conditions``. Input: - Number of rows to sample - Conditions Output: - Sampled data """ # Setup model = GaussianCopula() data = pd.DataFrame({ 'column1': list(range(100)), 'column2': list(range(100)), 'column3': list(range(100)) }) conditions = {'column1': 25} conditions_series = pd.Series([25, 25, 25, 25, 25], name='column1') model._sample_batch = Mock() sampled = pd.DataFrame({ 'column1': [28, 28], 'column2': [37, 37], 'column3': [93, 93], }) model._sample_batch.return_value = sampled model.fit(data) model._metadata = Mock() model._metadata.get_fields.return_value = ['column1', 'column2', 'column3'] model._metadata.transform.return_value = pd.DataFrame() model._metadata.make_ids_unique.side_effect = lambda x: x # Run output = model.sample(5, conditions=conditions, graceful_reject_sampling=True) # Assert expected_output = pd.DataFrame({ 'column1': [28, 28], 'column2': [37, 37], 'column3': [93, 93], }) _, args, kwargs = model._metadata.transform.mock_calls[0] pd.testing.assert_series_equal(args[0]['column1'], conditions_series) assert kwargs['on_missing_column'] == 'drop' model._metadata.transform.assert_called_once() model._sample_batch.assert_called_with(5, 100, 10, conditions, None, 0.01) pd.testing.assert_frame_equal(output, expected_output)
def test_fit_with_unique_constraint_on_data_which_has_index_column():
    """``fit`` works when the data contains a column literally named ``index``.

    The model is given a ``Unique`` constraint on ``test_column`` plus a
    primary key, and fitted on a frame that also has an ``index`` column.

    Github Issue:
    - Tests that https://github.com/sdv-dev/SDV/issues/616 does not occur
    """
    # Setup
    test_df = pd.DataFrame({
        'key': [1, 2, 3, 4, 5],
        'index': ['A', 'B', 'C', 'D', 'E'],
        'test_column': ['A1', 'B2', 'C3', 'D4', 'E5'],
    })
    unique = Unique(column_names=['test_column'])
    model = GaussianCopula(primary_key='key', constraints=[unique])

    # Run
    model.fit(test_df)
    samples = model.sample(2)

    # Assert: sampling succeeds and the constrained column stays unique.
    assert len(samples) == 2
    assert samples['test_column'].is_unique
def test_conditional_sampling_two_conditions_legacy():
    """Condition on two columns via the ``sample(conditions=...)`` kwarg.

    Renamed from ``test_conditional_sampling_two_conditions``: a later test
    of the same name (using ``sample_conditions``) shadowed this one, so
    pytest never collected it. This variant exercises the older
    ``conditions`` keyword of ``sample`` and is kept under a distinct name.
    """
    data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
        "column3": ["d", "e", "f"] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = {"column2": "b", "column3": "f"}
    samples = model.sample(5, conditions=conditions)

    # Every sampled row must honor both conditioned values.
    assert list(samples.column2) == ['b'] * 5
    assert list(samples.column3) == ['f'] * 5
def test_conditional_sampling_dataframe_legacy():
    """Condition via a DataFrame passed to the ``sample(conditions=...)`` kwarg.

    Renamed from ``test_conditional_sampling_dataframe``: a later test of the
    same name (using ``sample_remaining_columns``) shadowed this one, so
    pytest never collected it. This variant exercises the older
    ``conditions`` keyword of ``sample`` and is kept under a distinct name.
    """
    data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = pd.DataFrame({"column2": ["b", "b", "b", "c", "c"]})
    sampled = model.sample(conditions=conditions)

    # One output row per condition row, with the conditioned values in order.
    assert sampled.shape[0] == len(conditions["column2"])
    assert (sampled["column2"] == np.array(["b", "b", "b", "c", "c"])).all()
def test_conditional_sampling_dict_legacy():
    """Condition via a dict passed to the ``sample(conditions=...)`` kwarg.

    Renamed from ``test_conditional_sampling_dict``: a later test of the same
    name (using ``sample_conditions``) shadowed this one, so pytest never
    collected it. This variant exercises the older ``conditions`` keyword of
    ``sample`` and is kept under a distinct name.
    """
    data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = {"column2": "b"}
    sampled = model.sample(30, conditions=conditions)

    # 30 rows sampled (same shape as training data), all with column2 == 'b'.
    assert sampled.shape == data.shape
    assert set(sampled["column2"].unique()) == set(["b"])
def test_conditional_sampling_dict():
    """Condition on one categorical value through ``sample_conditions``."""
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
    })
    model = GaussianCopula()
    model.fit(data)

    condition = Condition({'column2': 'b'}, num_rows=30)
    sampled = model.sample_conditions(conditions=[condition])

    # 30 rows come back (matching the training shape), all 'b' in column2.
    assert sampled.shape == data.shape
    assert set(sampled['column2'].unique()) == {'b'}
def test_conditional_sampling_two_conditions():
    """Condition on two categorical columns through ``sample_conditions``."""
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })
    model = GaussianCopula()
    model.fit(data)

    condition = Condition({'column2': 'b', 'column3': 'f'}, num_rows=5)
    samples = model.sample_conditions(conditions=[condition])

    # All five rows must satisfy both conditioned values.
    assert samples['column2'].tolist() == ['b'] * 5
    assert samples['column3'].tolist() == ['f'] * 5
def test_conditional_sampling_dataframe():
    """Condition on a DataFrame through ``sample_remaining_columns``."""
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
    })
    model = GaussianCopula()
    model.fit(data)

    conditions = pd.DataFrame({'column2': ['b', 'b', 'b', 'c', 'c']})
    sampled = model.sample_remaining_columns(conditions)

    # One output row per condition row, preserving the conditioned values.
    assert len(sampled) == len(conditions['column2'])
    assert (sampled['column2'] == np.array(['b', 'b', 'b', 'c', 'c'])).all()
def test_conditional_sampling_numerical_legacy():
    """Condition on a numerical value via the ``sample(conditions=...)`` kwarg.

    Renamed from ``test_conditional_sampling_numerical``: a later test of the
    same name (using ``sample_conditions``) shadowed this one, so pytest
    never collected it. This variant exercises the older ``conditions``
    keyword of ``sample`` and is kept under a distinct name.
    """
    data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
        "column3": ["d", "e", "f"] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = {
        "column1": 1.0,
    }
    sampled = model.sample(5, conditions=conditions)

    # Every sampled row must carry the exact conditioned numerical value.
    assert list(sampled.column1) == [1.0] * 5
def test_conditional_sampling_numerical():
    """Condition on a numerical column through ``sample_conditions``."""
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10,
    })
    model = GaussianCopula()
    model.fit(data)

    condition = Condition({'column1': 1.0}, num_rows=5)
    sampled = model.sample_conditions(conditions=[condition])

    # Every sampled row carries the exact conditioned value.
    assert sampled['column1'].tolist() == [1.0] * 5
def test_fit_with_unique_constraint_on_data_subset():
    """``fit`` works with a ``Unique`` constraint on a subset of the data.

    Subdividing the data leaves gaps in the index relative to the original
    frame; fitting with a ``Unique`` constraint must still succeed and the
    constrained column must remain unique in the samples.

    Github Issue:
    - Tests that https://github.com/sdv-dev/SDV/issues/610 does not occur
    """
    # Setup: full frame, then keep a non-contiguous subset of rows.
    test_df = pd.DataFrame({
        'key': [1, 2, 3, 4, 5],
        'test_column': ['A', 'B', 'C', 'D', 'E'],
    })
    test_df = test_df.iloc[[1, 3, 4]]

    unique = Unique(column_names=['test_column'])
    model = GaussianCopula(primary_key='key', constraints=[unique])

    # Run
    model.fit(test_df)
    samples = model.sample(2)

    # Assert
    assert len(samples) == 2
    assert samples['test_column'].is_unique
def test_ids_only():
    """Ensure that tables that do not contain anything other than id fields can be modeled."""
    ids_only = pd.DataFrame({
        'id': range(10),
        'other_id': range(10),
    })
    field_types = {
        'id': {'type': 'id'},
        'other_id': {'type': 'id'},
    }

    model = GaussianCopula(field_types=field_types)
    model.fit(ids_only)
    sampled = model.sample()

    # ID-only tables are reproduced exactly: same shape, same contents.
    assert sampled.shape == ids_only.shape
    assert ids_only.equals(sampled)
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    The origin of this tests can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194
    """
    users = load_demo(metadata=False)['users']

    gc = GaussianCopula(
        field_types={'age': {'type': 'categorical'}},
        categorical_transformer='categorical',
    )
    gc.fit(users)
    sampled = gc.sample()

    # The integer dtype survives the categorical round-trip.
    assert users['age'].dtype == np.int64
    assert sampled['age'].dtype == np.int64
def test_recreate():
    """Refit models recreated from learned metadata (object and dict forms)."""
    data = load_demo(metadata=False)['users']

    def assert_valid_sample(frame):
        # Same shape and dtypes as the training data, no fully-null rows.
        assert frame.shape == data.shape
        assert (frame.dtypes == data.dtypes).all()
        assert (frame.notnull().sum(axis=1) != 0).all()

    # If distribution is non parametric, get_parameters fails
    model = GaussianCopula()
    model.fit(data)
    assert_valid_sample(model.sample())

    # Metadata: recreate from the Metadata object.
    model_meta = GaussianCopula(table_metadata=model.get_metadata())
    model_meta.fit(data)
    assert_valid_sample(model_meta.sample())

    # Metadata dict: recreate from the plain-dict form of the metadata.
    model_meta_dict = GaussianCopula(
        table_metadata=model.get_metadata().to_dict())
    model_meta_dict.fit(data)
    assert_valid_sample(model_meta_dict.sample())
def test_sample_batches_transform_conditions_correctly(): """Test that transformed conditions are batched correctly. The ``Sample`` method is expected to: - Return sampled data and call ``_sample_batch`` for every unique transformed condition group. Input: - Number of rows to sample - Conditions Output: - Sampled data """ # Setup model = GaussianCopula() data = pd.DataFrame({ 'column1': list(range(100)), 'column2': list(range(100)), 'column3': list(range(100)) }) conditions = {'column1': [25, 25, 25, 30, 30]} conditions_series = pd.Series([25, 25, 25, 30, 30], name='column1') model._sample_batch = Mock() expected_outputs = [ pd.DataFrame({ 'column1': [25, 25, 25], 'column2': [37, 37, 37], 'column3': [93, 93, 93], }), pd.DataFrame({ 'column1': [30], 'column2': [37], 'column3': [93], }), pd.DataFrame({ 'column1': [30], 'column2': [37], 'column3': [93], }) ] model._sample_batch.side_effect = expected_outputs model.fit(data) model._metadata = Mock() model._metadata.get_fields.return_value = ['column1', 'column2', 'column3'] model._metadata.transform.return_value = pd.DataFrame( [[50], [50], [50], [60], [70]], columns=['transformed_column']) # Run model.sample(5, conditions=conditions, graceful_reject_sampling=True) # Assert _, args, kwargs = model._metadata.transform.mock_calls[0] pd.testing.assert_series_equal(args[0]['column1'], conditions_series) assert kwargs['on_missing_column'] == 'drop' model._metadata.transform.assert_called_once() model._sample_batch.assert_any_call(3, 100, 10, {'column1': 25}, {'transformed_column': 50}, 0.01) model._sample_batch.assert_any_call(1, 100, 10, {'column1': 30}, {'transformed_column': 60}, 0.01) model._sample_batch.assert_any_call(1, 100, 10, {'column1': 30}, {'transformed_column': 70}, 0.01)
def test_conditional_sampling_constraint_uses_columns_model_reject_sampling( column_model_mock): """Test that the ``sample`` method handles constraints with conditions. The ``sample`` method is expected to properly apply constraint transformations by sampling the missing columns for the constraint if ``fit_columns_model`` is True. All values sampled by the column model should be valid because reject sampling is used on any that aren't. Setup: - The model is being passed a ``GreaterThan`` constraint and then asked to sample with one condition. One of the constraint columns is the conditioned column. The ``GaussianMultivariate`` class is mocked so that the constraint's ``_column_model`` returns some invalid rows in order to test that the reject sampling is used. Input: - Conditions Side Effects: - Correct columns to condition on are passed to underlying sample method """ # Setup constraint = GreaterThan(low='age_joined', high='age', handling_strategy='transform', fit_columns_model=True, drop='high') data = pd.DataFrame({ 'age_joined': [22.0, 21.0, 15.0, 18.0, 29.0], 'age': [27.0, 28.0, 26.0, 21.0, 30.0], 'experience_years': [6.0, 7.0, 11.0, 3.0, 7.0], }) model = GaussianCopula(constraints=[constraint]) sampled_conditions = [ pd.DataFrame({ 'age_joined': [26.0, 18.0, 31.0, 29.0, 32.0], 'age': [30.0, 30.0, 30.0, 30.0, 30.0] }), pd.DataFrame({ 'age_joined': [28.0, 33.0, 31.0], 'age': [30.0, 30.0, 30.0] }), pd.DataFrame({ 'age_joined': [27.0], 'age': [30.0] }) ] column_model_mock.return_value.sample.side_effect = sampled_conditions model.fit(data) # Run conditions = {'age': 30.0} sampled_data = model.sample(5, conditions=conditions) # Assert assert len(column_model_mock.return_value.sample.mock_calls) == 3 expected_result = pd.DataFrame({ 'age_joined': [26.0, 18.0, 29.0, 28.0, 27.0], 'age': [30.0, 30.0, 30.0, 30.0, 30.0] }) pd.testing.assert_frame_equal( sampled_data[['age_joined', 'age']], expected_result[['age_joined', 'age']], )
def test_gaussian_copula():
    """End-to-end fit/recreate/sample with per-field distributions.

    Fits one model with a non-parametric default distribution (expected to
    fail ``get_parameters``), then a parametric one whose parameters can be
    round-tripped into a fresh model, and validates the sample and the
    exported metadata (including per-field transformers and pii settings).
    """
    users = load_demo(metadata=False)['users']
    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical'
        }
    }
    anonymize_fields = {'country': 'country_code'}

    # If distribution is non parametric, get_parameters fails
    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        field_distributions={'age': 'gamma'},
        default_distribution='gaussian_kde',
    )
    gc.fit(users)
    with pytest.raises(NonParametricError):
        parameters = gc.get_parameters()

    # If distribution is parametric, copula can be recreated
    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        field_distributions={'age': 'gamma'},
        default_distribution='bounded',
    )
    gc.fit(users)
    parameters = gc.get_parameters()
    new_gc = GaussianCopula(table_metadata=gc.get_metadata(), )
    new_gc.set_parameters(parameters)

    # Validate sampled data
    sampled = new_gc.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    # country codes have been replaced with new ones
    assert set(sampled.country.unique()) != set(users.country.unique())

    # Validate metadata
    metadata = gc.get_metadata().to_dict()
    assert metadata['fields'] == {
        'user_id': {
            'type': 'id',
            'subtype': 'integer',
            'transformer': 'integer',
        },
        'country': {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'country_code',
            'transformer': 'one_hot_encoding',
        },
        'gender': {
            'type': 'categorical',
            'transformer': 'one_hot_encoding',
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
            'transformer': 'integer',
        }
    }

    assert 'model_kwargs' in metadata
    assert 'GaussianCopula' in metadata['model_kwargs']
def test_conditional_sampling_constraint_uses_columns_model(gm_mock):
    """Test that the ``sample`` method handles constraints with conditions.

    The ``sample`` method is expected to properly apply constraint
    transformations by sampling the missing columns for the constraint if
    ``fit_columns_model`` is True.

    Setup:
    - The model is being passed a ``UniqueCombination`` constraint and then
    asked to sample with two conditions, one of which the constraint depends
    on. The constraint will sample the columns it needs that are not present
    in the conditions and will then use constraint transformations to meet
    the requirements.

    Input:
    - Conditions

    Side Effects:
    - Correct columns to condition on are passed to underlying sample method
    """
    # NOTE(review): expects ``gm_mock`` to be injected by a ``@patch``
    # decorator above this function — confirm it is present.
    # Setup
    constraint = UniqueCombinations(
        columns=['city', 'state'],
        handling_strategy='transform',
        fit_columns_model=True,
    )
    data = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30]
    })
    model = GaussianCopula(constraints=[constraint],
                           categorical_transformer='label_encoding')
    # Batches returned by the mocked GaussianMultivariate: the first fills in
    # the constraint column, the rest are the model's conditional samples.
    sampled_numeric_data = [
        pd.DataFrame({
            'city#state': [2],
            'age': [30]
        }),
        pd.DataFrame({
            'city#state': [1, 1, 0, 0, 0],
            'age': [30, 30, 30, 30, 30]
        }),
        pd.DataFrame({
            'city#state': [0, 0, 1, 1, 1],
            'age': [30, 30, 30, 30, 30]
        })
    ]
    gm_mock.return_value.sample.side_effect = sampled_numeric_data
    model.fit(data)

    # Run
    conditions = {'age': 30, 'state': 'CA'}
    sampled_data = model.sample(5, conditions=conditions)

    # Assert
    expected_states = pd.Series(['CA', 'CA', 'CA', 'CA', 'CA'], name='state')
    expected_ages = pd.Series([30, 30, 30, 30, 30], name='age')
    sample_calls = model._model.sample.mock_calls
    # Between 2 and 3 underlying sample calls, every one conditioned on both
    # the given age and the constraint-derived city#state column.
    assert len(sample_calls) >= 2 and len(sample_calls) <= 3
    assert all(c[2]['conditions']['age'] == 30 for c in sample_calls)
    assert all('city#state' in c[2]['conditions'] for c in sample_calls)
    pd.testing.assert_series_equal(sampled_data['age'], expected_ages)
    pd.testing.assert_series_equal(sampled_data['state'], expected_states)
    # Cities must be drawn only from the combinations valid for state 'CA'.
    assert all(c in ('SF', 'LA') for c in sampled_data['city'])