def test_sample_conditions(self):
    """Test the ``sample_conditions`` method.

    Expect the arguments to be forwarded to ``_sample_conditions``.

    Input:
        - valid conditions
        - explicit ``batch_size``, ``randomize_samples`` and ``output_file_path``

    Side Effects:
        - ``_sample_conditions`` is called exactly once with the expected arguments.

    Output:
        - whatever ``_sample_conditions`` returns
    """
    # Setup
    model = Mock(spec_set=GaussianCopula)
    conditions = [Condition({'column1': 'b'}, num_rows=5)]
    sampling_kwargs = {
        'batch_size': 1,
        'randomize_samples': False,
        'output_file_path': 'test.csv',
    }

    # Run
    result = GaussianCopula.sample_conditions(model, conditions, **sampling_kwargs)

    # Assert
    # The ``100`` is never supplied by this test, so it has to originate inside
    # ``sample_conditions`` (presumably a default value -- confirm against the
    # implementation if it changes).
    model._sample_conditions.assert_called_once_with(
        conditions,
        100,
        sampling_kwargs['batch_size'],
        sampling_kwargs['randomize_samples'],
        sampling_kwargs['output_file_path'],
    )
    assert result == model._sample_conditions.return_value
def test_conditional_sampling_constraint_uses_reject_sampling(
        gm_mock, isinstance_mock):
    """Test that ``sample_conditions`` handles conditions on constrained columns.

    Columns that belong to a constraint cannot be conditionally sampled on
    directly, so they are expected to be dropped from the conditions that
    reach the underlying model.

    Setup:
        - The model is given a ``FixedCombinations`` constraint over ``city``
          and ``state`` and is asked to sample with two conditions, only one
          of which (``state``) is part of the constraint. The constraint is
          expected to skip its transformation because only some of its
          columns are provided, and the model is expected to fall back to
          reject sampling to satisfy the condition.
        - The underlying model's ``sample`` is mocked to return two
          pre-built batches of numeric data.

    Input:
        - Conditions

    Side Effects:
        - The underlying ``sample`` method is called twice, and only the
          ``age`` condition is passed down to it.

    Output:
        - Five rows that all satisfy both conditions.
    """
    # Setup
    isinstance_mock.side_effect = _isinstance_side_effect
    training_data = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30]
    })
    model = GaussianCopula(
        constraints=[FixedCombinations(column_names=['city', 'state'])],
        categorical_transformer='label_encoding',
    )

    # Batches handed back, in order, by the mocked underlying model.
    first_batch = pd.DataFrame({
        'city#state.value': [0, 1, 2, 0, 0],
        'age.value': [30, 30, 30, 30, 30]
    })
    second_batch = pd.DataFrame({
        'city#state.value': [1],
        'age.value': [30]
    })
    gm_mock.return_value.sample.side_effect = [first_batch, second_batch]
    model.fit(training_data)

    # Run
    sampled_data = model.sample_conditions(
        conditions=[Condition({'age': 30, 'state': 'CA'}, num_rows=5)],
    )

    # Assert
    underlying_sample = model._model.sample
    assert len(underlying_sample.mock_calls) == 2
    # ``state`` must not appear here: only ``age`` is forwarded as a condition.
    underlying_sample.assert_any_call(50, conditions={'age.value': 30})

    expected_data = pd.DataFrame({
        'city': ['LA', 'SF', 'LA', 'LA', 'SF'],
        'state': ['CA', 'CA', 'CA', 'CA', 'CA'],
        'age': [30, 30, 30, 30, 30]
    })
    pd.testing.assert_frame_equal(sampled_data, expected_data)
def test_conditional_sampling_two_conditions():
    """Sample with two categorical columns fixed in a single condition.

    Every one of the five sampled rows must carry the requested value in
    both ``column2`` and ``column3``.
    """
    # Setup
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10
    })
    model = GaussianCopula()
    model.fit(training_data)

    # Run
    condition = Condition({'column2': 'b', 'column3': 'f'}, num_rows=5)
    sampled = model.sample_conditions(conditions=[condition])

    # Assert
    assert sampled['column2'].tolist() == ['b'] * 5
    assert sampled['column3'].tolist() == ['f'] * 5
def test_conditional_sampling_dict():
    """Sample as many rows as the training data with one column fixed.

    The result must have the same shape as the training data, and
    ``column2`` must contain nothing but the requested value.
    """
    # Setup
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10
    })
    model = GaussianCopula()
    model.fit(training_data)

    # Run
    condition = Condition({'column2': 'b'}, num_rows=30)
    result = model.sample_conditions(conditions=[condition])

    # Assert
    assert result.shape == training_data.shape
    assert set(result['column2'].unique()) == {'b'}
def test_conditional_sampling_numerical():
    """Sample with a numerical column fixed to a single value.

    All five sampled rows must have ``column1`` equal to the requested value.
    """
    # Setup
    training_data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10
    })
    model = GaussianCopula()
    model.fit(training_data)

    # Run
    condition = Condition({'column1': 1.0}, num_rows=5)
    result = model.sample_conditions(conditions=[condition])

    # Assert
    assert result['column1'].tolist() == [1.0] * 5