def test__validate_constraint_columns_warning(self):
    """Check that ``Constraint._validate_constraint_columns`` emits a warning.

    The constraint lists more columns than the table provides and has no
    columns model, so a single ``UserWarning`` is expected.

    Setup:
        - Mocked constraint whose ``constraint_columns`` are ``['a', 'b']``.
        - ``_columns_model`` set to ``False``.
        - ``_sample_constraint_columns`` mocked to return a dataframe.

    Input:
        - table_data holding only column ``'a'``.

    Side Effects:
        - Exactly one warning is recorded and it is a ``UserWarning``.
    """
    # Setup
    frame = pd.DataFrame({'a': [0, 1, 2]})
    mocked_constraint = Mock()
    mocked_constraint.constraint_columns = ['a', 'b']
    mocked_constraint._columns_model = False
    mocked_constraint._sample_constraint_columns.return_value = pd.DataFrame({'a': [0, 1, 2]})

    # Run and assert
    with warnings.catch_warnings(record=True) as caught:
        # Make sure no warning is swallowed by a previously installed filter.
        warnings.simplefilter('always')
        Constraint._validate_constraint_columns(mocked_constraint, frame)

        assert len(caught) == 1
        assert issubclass(caught[0].category, UserWarning)
def test_fit_trains_column_model(self, ht_mock, gm_mock):
    """Check that ``Constraint.fit`` trains the columns model.

    With ``fit_columns_model=True`` and two ``constraint_columns``, the test
    expects the mocked model (``gm_mock``) to be fitted once and the mocked
    hyper transformer (``ht_mock``) to receive the table in a single
    ``fit_transform`` call.

    Input:
        - Table data (pandas.DataFrame)
    """
    # Setup
    constraint = Constraint(handling_strategy='transform', fit_columns_model=True)
    constraint.constraint_columns = ('a', 'b')
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

    # Run
    constraint.fit(frame)

    # Assert
    gm_mock.return_value.fit.assert_called_once()
    fit_transform_calls = ht_mock.return_value.fit_transform.mock_calls
    # Each mock call unpacks as (name, args, kwargs); index 1 is the positional args.
    positional_args = fit_transform_calls[0][1]
    assert len(fit_transform_calls) == 1
    pd.testing.assert_frame_equal(positional_args[0], frame)
def test_transform__transform_errors(self):
    """Check that errors raised by ``_transform`` surface from ``transform``.

    Setup:
        - ``_transform`` is replaced by a mock that raises an ``Exception``.

    Input:
        - ``pandas.DataFrame``.

    Side effects:
        - An ``Exception`` is raised by ``transform``.
    """
    # Setup
    frame = pd.DataFrame({'a': [1, 2, 3]})
    constraint = Constraint()
    constraint._transform = Mock(side_effect=Exception())

    # Run / Assert
    with pytest.raises(Exception):
        constraint.transform(frame)
def test__prepare_constraints_invalid_order_raises_exception(self, from_dict_mock):
    """Check that ``_prepare_constraints`` rejects a conflicting constraint order.

    The third constraint rebuilds column ``'a'``, which is also one of the
    fourth constraint's ``constraint_columns``, so an exception is expected.

    Input:
        - List of constraints where an earlier constraint's
          ``rebuild_columns`` overlap a later constraint's
          ``constraint_columns``.

    Side Effect:
        - Exception should be raised.
    """
    # Setup
    first = Constraint(handling_strategy='reject_sampling')
    second = Constraint(handling_strategy='reject_sampling')
    third = Constraint(handling_strategy='transform')
    fourth = Constraint(handling_strategy='transform')
    third.rebuild_columns = ['a', 'd']
    fourth.constraint_columns = ['a', 'b', 'c']
    fourth.rebuild_columns = ['a']
    given = [first, second, third, fourth]
    # ``from_dict`` hands the very same instances back, one per call.
    from_dict_mock.side_effect = [first, second, third, fourth]

    # Run
    with pytest.raises(Exception):
        Table._prepare_constraints(given)
def test__prepare_constraints_sorts_constraints_none_rebuild_columns(self, from_dict_mock):
    """Check that ``_prepare_constraints`` moves ``None`` rebuild columns first.

    A constraint whose ``rebuild_columns`` is ``None`` must be placed before
    the constraints that do declare ``rebuild_columns``.

    Input:
        - List of constraints where the one with ``None`` as
          ``rebuild_columns`` comes last.

    Output:
        - The same constraints, with that one moved to the front.
    """
    # Setup
    rebuilds_a = Constraint(handling_strategy='transform')
    rebuilds_b = Constraint(handling_strategy='transform')
    rebuilds_nothing = Constraint(handling_strategy='reject_sampling')
    rebuilds_a.rebuild_columns = ['a']
    rebuilds_b.rebuild_columns = ['b']
    rebuilds_nothing.rebuild_columns = None
    given = [rebuilds_a, rebuilds_b, rebuilds_nothing]
    from_dict_mock.side_effect = [rebuilds_a, rebuilds_b, rebuilds_nothing]

    # Run
    result = Table._prepare_constraints(given)

    # Asserts
    assert result == [rebuilds_nothing, rebuilds_a, rebuilds_b]
def test__prepare_constraints_validates_constraint_order(self, from_dict_mock):
    """Check that ``_prepare_constraints`` accepts a non-conflicting order.

    No constraint rebuilds a column that a later constraint lists in its
    ``constraint_columns``, so no exception is expected and the order is kept.

    Input:
        - List of constraints without overlapping ``rebuild_columns`` and
          later ``constraint_columns``.

    Output:
        - List of constraints equal to the input list.
    """
    # Setup
    first = Constraint(handling_strategy='reject_sampling')
    second = Constraint(handling_strategy='reject_sampling')
    third = Constraint(handling_strategy='transform')
    fourth = Constraint(handling_strategy='transform')
    third.rebuild_columns = ['e', 'd']
    fourth.constraint_columns = ['a', 'b', 'c']
    fourth.rebuild_columns = ['a']
    given = [first, second, third, fourth]
    from_dict_mock.side_effect = [first, second, third, fourth]

    # Run
    result = Table._prepare_constraints(given)

    # Assert
    assert result == given
def test_fit(self):
    """Smoke-test the ``Constraint.fit`` interface.

    Nothing is asserted: the test only verifies that ``fit`` can be called
    with a table without raising.

    Input:
        - Table data (pandas.DataFrame)
    """
    # Setup
    frame = pd.DataFrame({'a': [1, 2, 3]})
    constraint = Constraint(handling_strategy='transform')

    # Run
    constraint.fit(frame)
def test__identity(self):
    """Test the ``Constraint._identity`` method.

    ``_identity`` is expected to hand back exactly what it receives.

    Input:
        - anything

    Output:
        - Input
    """
    # Setup
    constraint = Constraint('all')

    # Run
    result = constraint._identity('input')

    # Asserts
    assert result == 'input'
def test_transform(self):
    """Test the default ``Constraint.transform`` behaviour.

    The base implementation is expected to act as an identity function that
    subclasses may override.

    Input:
        - Anything

    Output:
        - Input
    """
    # Setup
    constraint = Constraint(handling_strategy='transform')

    # Run
    result = constraint.transform('input')

    # Assert
    assert result == 'input'
def _prepare_constraints(constraints):
    """Normalize the given constraints and order them for application.

    Every entry is turned into a ``Constraint`` through
    ``Constraint.from_dict``. Classes are instantiated and serialized with
    ``to_dict`` first, ``Constraint`` instances are serialized with
    ``to_dict``, and any other value is handed to ``from_dict`` unchanged.

    Constraints without ``rebuild_columns`` are returned first, followed by
    the ones that declare ``rebuild_columns``; relative order is preserved
    inside each group.

    Args:
        constraints (list or None):
            Constraint classes, instances or values accepted by
            ``Constraint.from_dict``. A falsy value is treated as an empty list.

    Returns:
        list:
            The prepared ``Constraint`` objects in the order described above.

    Raises:
        Exception:
            If the ``constraint_columns`` of a constraint that declares
            ``rebuild_columns`` overlap the ``rebuild_columns`` collected from
            the constraints that came before it.
    """
    without_rebuild = []
    with_rebuild = []
    seen_rebuild_columns = set()

    for entry in constraints or []:
        if isinstance(entry, type):
            entry = entry().to_dict()
        elif isinstance(entry, Constraint):
            entry = entry.to_dict()

        prepared = Constraint.from_dict(entry)
        if not prepared.rebuild_columns:
            without_rebuild.append(prepared)
            continue

        # Columns this constraint reads that an earlier constraint rebuilds.
        intersecting_columns = seen_rebuild_columns & set(prepared.constraint_columns)
        if intersecting_columns:
            raise Exception(
                'Multiple constraints will modify the same column(s): '
                f'"{intersecting_columns}", which may lead to the constraint '
                'being unenforceable. Please use "reject_sampling" as the '
                '"handling_strategy" instead.')

        with_rebuild.append(prepared)
        seen_rebuild_columns.update(prepared.rebuild_columns)

    return without_rebuild + with_rebuild
def test_fit_transform(self):
    """Test the ``Constraint.fit_transform`` method.

    ``fit_transform`` is expected to call ``fit`` and ``transform`` with the
    input and to return whatever ``transform`` returns.

    Input:
        - Anything

    Output:
        - self.transform output

    Side Effects:
        - self.fit is called with input
        - self.transform is called with input
    """
    # Setup
    payload = 'my_data'
    mocked_constraint = Mock()
    mocked_constraint.transform.return_value = 'the_transformed_data'

    # Run
    result = Constraint.fit_transform(mocked_constraint, payload)

    # Assert
    assert result == 'the_transformed_data'
    mocked_constraint.fit.assert_called_once_with('my_data')
    mocked_constraint.transform.assert_called_once_with('my_data')
def test_filter_valid_with_invalid_index(self):
    """Test ``Constraint.filter_valid`` when ``is_valid`` has a mismatched index.

    The mocked ``is_valid`` returns a Series indexed ``[0, 1, 5]`` while the
    table uses the default index. Such an index can appear, for example, when
    the Series comes from a subset of a larger table.

    Input:
        - Table data (pandas.DataFrame)

    Output:
        - Table data, with only the valid rows (pandas.DataFrame)
    """
    # Setup
    validity = pd.Series([True, True, False], index=[0, 1, 5])
    mocked_constraint = Mock()
    mocked_constraint.is_valid.return_value = validity
    frame = pd.DataFrame({'a': [1, 2, 3]})

    # Run
    result = Constraint.filter_valid(mocked_constraint, frame)

    # Assert
    expected = pd.DataFrame({'a': [1, 2]})
    pd.testing.assert_frame_equal(expected, result)
def test_transform_model_enabled_reject_sampling_error(self):
    """Check that ``Constraint.transform`` fails when no valid rows can be sampled.

    The columns model and the reverse transform always produce empty
    dataframes, so the missing column can never be filled in and a
    ``ValueError`` is expected.

    Setup:
        - ``_columns_model.sample`` always returns an empty ``DataFrame``.
        - ``_hyper_transformer.reverse_transform`` always returns an empty
          ``DataFrame``.

    Input:
        - Table with some missing columns.

    Side Effect:
        - ``ValueError`` raised.
    """
    # Setup
    constraint = Constraint(handling_strategy='transform')
    constraint.constraint_columns = ('a', 'b')

    columns_model = Mock()
    columns_model.sample.return_value = pd.DataFrame()
    constraint._columns_model = columns_model

    hyper_transformer = Mock()
    hyper_transformer.transform.return_value = pd.DataFrame([[1]], columns=['b'])
    hyper_transformer.reverse_transform.return_value = pd.DataFrame()
    constraint._hyper_transformer = hyper_transformer

    # Column 'a' is a constraint column but is absent from the table.
    frame = pd.DataFrame([[1, 2], [3, 4]], columns=['b', 'c'])

    # Run / Assert
    with pytest.raises(ValueError):
        constraint.transform(frame)
def test_fit(self):
    """Check that ``Constraint.fit`` delegates to ``_fit``.

    With ``fit_columns_model=False`` the base ``fit`` is expected to call
    ``_fit`` exactly once with the table data.

    Input:
        - Table data (pandas.DataFrame)
    """
    # Setup
    constraint = Constraint(handling_strategy='transform', fit_columns_model=False)
    constraint._fit = Mock()
    frame = pd.DataFrame({'a': [1, 2, 3]})

    # Run
    constraint.fit(frame)

    # Assert
    constraint._fit.assert_called_once_with(frame)
def test_fit_gaussian_multivariate_correct_distribution(self, gm_mock):
    """Check how ``Constraint.fit`` builds the ``GaussianMultivariate``.

    The mocked ``GaussianMultivariate`` (``gm_mock``) is expected to be
    created once, with ``GaussianUnivariate`` as its ``distribution``.

    Input:
        - Table data (pandas.DataFrame)
    """
    # Setup
    constraint = Constraint(handling_strategy='transform', fit_columns_model=True)
    constraint.constraint_columns = ('a', 'b')
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})

    # Run
    constraint.fit(frame)

    # Assert
    gm_mock.assert_called_once_with(distribution=GaussianUnivariate)
def test__validate_data_meets_constraints_invalid_input(self):
    """Test the ``_validate_data_meets_constraint`` method with invalid rows.

    The constraint columns are present in the data and the mocked
    ``is_valid`` flags six rows as invalid, so an error listing invalid rows
    is expected.

    Input:
        - Table data contains invalid rows

    Output:
        - None

    Side Effects:
        - A ``ConstraintsNotMetError`` is thrown
    """
    # Setup
    frame = pd.DataFrame(
        {
            'a': list(range(8)),
            'b': list(range(3, 11)),
        },
        index=list(range(8)))
    validity = pd.Series([True, False, True] + [False] * 5)
    constraint = Constraint()
    constraint.constraint_columns = ['a', 'b']
    constraint.is_valid = Mock(return_value=validity)

    # Run / Assert
    # The message is escaped because ``match`` interprets it as a regex.
    expected_message = re.escape(
        "Data is not valid for the 'Constraint' constraint:\n "
        'a b\n1 1 4\n3 3 6\n4 4 7\n5 5 8\n6 6 9'
        '\n+1 more')
    with pytest.raises(ConstraintsNotMetError, match=expected_message):
        constraint._validate_data_meets_constraint(frame)
def _fit_transform_constraints(self, data):
    """Fit every constraint on the data and apply its transformation.

    Each entry of ``self._constraints`` is converted into a ``Constraint``
    through ``Constraint.from_dict`` (classes are instantiated and serialized
    with ``to_dict`` first, instances are serialized with ``to_dict``) and
    stored back in the same position. The constraints are applied in order,
    each one receiving the output of the previous ``fit_transform``.

    Args:
        data:
            Data passed to the first constraint's ``fit_transform``.

    Returns:
        The output of the last ``fit_transform`` call, or ``data`` itself if
        there are no constraints.
    """
    for position, entry in enumerate(self._constraints):
        if isinstance(entry, type):
            spec = entry().to_dict()
        elif isinstance(entry, Constraint):
            spec = entry.to_dict()
        else:
            spec = entry

        prepared = Constraint.from_dict(spec)
        # Keep the rebuilt instance so later steps use the fitted constraint.
        self._constraints[position] = prepared
        data = prepared.fit_transform(data)

    return data
def test_is_valid(self):
    """Test the ``Constraint.is_valid`` method.

    The base implementation is meant to be overridden by subclasses that can
    tell valid rows from invalid ones. Here it is expected to mark every row
    as valid.

    Input:
        - Table data (pandas.DataFrame)

    Output:
        - Series of ``True`` values (pandas.Series)
    """
    # Setup
    frame = pd.DataFrame({'a': [1, 2, 3]})
    constraint = Constraint(handling_strategy='transform')

    # Run
    result = constraint.is_valid(frame)

    # Assert
    expected = pd.Series([True, True, True])
    pd.testing.assert_series_equal(expected, result)
def test_reverse_transform(self):
    """Test the default ``Constraint.reverse_transform`` behaviour.

    The base implementation is expected to return a copy of the input: equal
    in content but a different object.

    Input:
        - an empty DataFrame

    Output:
        - An equal DataFrame that is not the input object
    """
    # Setup
    frame = pd.DataFrame()
    constraint = Constraint()

    # Run
    result = constraint.reverse_transform(frame)

    # Assert
    pd.testing.assert_frame_equal(result, pd.DataFrame())
    assert result is not frame
def test___init___not_kown(self):
    """Test ``Constraint.__init__`` with an unknown ``handling_strategy``.

    An unrecognized ``handling_strategy`` is expected to be rejected.

    Input:
        - not_known

    Side effects:
        - ValueError
    """
    # Setup
    unknown_strategy = 'not_known'

    # Run / Assert
    with pytest.raises(ValueError):
        Constraint(handling_strategy=unknown_strategy)
def test_transform_model_enabled_some_columns_missing(self):
    """Check that ``Constraint.transform`` samples missing constraint columns.

    The table lacks constraint column ``'a'``. The test expects the mocked
    ``_columns_model`` to be sampled once per row, conditioned on that row's
    ``'b'`` value, and the reverse-transformed samples to form the result.

    Input:
        - Table with some missing columns.

    Output:
        - Transformed data with all columns.
    """
    # Setup
    constraint = Constraint(handling_strategy='transform')
    constraint._transform = lambda rows: rows
    constraint.constraint_columns = ('a', 'b')
    constraint._hyper_transformer = Mock()
    constraint._columns_model = Mock()

    reversed_samples = [
        pd.DataFrame([[5, 1, 2]], columns=['a', 'b', 'c']),
        pd.DataFrame([[6, 3, 4]], columns=['a', 'b', 'c']),
    ]
    encoded_conditions = [
        pd.DataFrame([[1]], columns=['b']),
        pd.DataFrame([[3]], columns=['b']),
    ]
    constraint._columns_model.sample.return_value = pd.DataFrame(
        [[1, 2, 3]], columns=['b', 'c', 'a'])
    constraint._hyper_transformer.transform.side_effect = encoded_conditions
    constraint._hyper_transformer.reverse_transform.side_effect = reversed_samples

    # Run
    frame = pd.DataFrame([[1, 2], [3, 4]], columns=['b', 'c'])
    result = constraint.transform(frame)

    # Assert
    expected_sample = pd.DataFrame([[1, 2, 3]], columns=['b', 'c', 'a'])
    expected_result = pd.DataFrame([[5, 1, 2], [6, 3, 4]], columns=['a', 'b', 'c'])

    sample_calls = constraint._columns_model.sample.mock_calls
    assert len(sample_calls) == 2
    constraint._columns_model.sample.assert_any_call(num_rows=1, conditions={'b': 1})
    constraint._columns_model.sample.assert_any_call(num_rows=1, conditions={'b': 3})

    reverse_calls = constraint._hyper_transformer.reverse_transform.mock_calls
    # Each mock call unpacks as (name, args, kwargs); [1][0] is the first positional arg.
    for position in (0, 1):
        pd.testing.assert_frame_equal(reverse_calls[position][1][0], expected_sample)

    pd.testing.assert_frame_equal(result, expected_result)
def test_transform(self):
    """Test the default ``Constraint.transform`` behaviour.

    By default ``transform`` is expected to return a copy of the input:
    equal in content but a different object.

    Input:
        - a DataFrame

    Output:
        - An equal DataFrame that is not the input object
    """
    # Setup
    frame = pd.DataFrame({'col': ['input']})
    constraint = Constraint()

    # Run
    result = constraint.transform(frame)

    # Assert
    pd.testing.assert_frame_equal(result, pd.DataFrame({'col': ['input']}))
    assert result is not frame
def test_transform_model_enabled_reject_sampling_duplicates_valid_rows(self):
    """Test the reject sampling fall back of ``Constraint.transform``.

    The columns model yields two valid rows on the first call and nothing on
    the following ones. The test expects 101 ``sample`` calls in total and the
    two valid rows to be repeated until the five requested rows are filled.

    Setup:
        - ``_columns_model.sample`` returns two rows first and an empty
          ``DataFrame`` for each of the next 100 calls.

    Input:
        - Table with some missing columns.

    Output:
        - Transformed data with all columns.
    """
    # Setup
    constraint = Constraint(handling_strategy='transform')
    constraint._transform = lambda rows: rows
    constraint.constraint_columns = ('a', 'b')
    constraint._hyper_transformer = Mock()
    constraint._columns_model = Mock()

    first_sample = pd.DataFrame([[1, 2], [1, 3]], columns=['a', 'b'])
    constraint._columns_model.sample.side_effect = [first_sample] + [pd.DataFrame()] * 100
    constraint._hyper_transformer.transform.side_effect = [
        pd.DataFrame([[1]] * 5, columns=['b']),
    ]
    constraint._hyper_transformer.reverse_transform = lambda rows: rows

    # Run
    frame = pd.DataFrame([[1]] * 5, columns=['b'])
    result = constraint.transform(frame)

    # Assert
    expected = pd.DataFrame(
        [[1, 2], [1, 3], [1, 2], [1, 3], [1, 2]], columns=['a', 'b'])
    sample_calls = constraint._columns_model.sample.mock_calls
    assert len(sample_calls) == 101
    constraint._columns_model.sample.assert_any_call(num_rows=5, conditions={'b': 1})
    pd.testing.assert_frame_equal(result, expected)
def test___init___all(self):
    """Test ``Constraint.__init__`` when ``'all'`` is passed.

    With ``'all'`` none of ``filter_valid``, ``transform`` and
    ``reverse_transform`` is expected to be replaced by the identity.

    Input:
        - all

    Side effects:
        - filter_valid != identity
        - transform != identity
        - reverse_transform != identity
    """
    # Run
    constraint = Constraint(handling_strategy='all')
    identity = constraint._identity

    # Asserts
    assert constraint.filter_valid != identity
    assert constraint.transform != identity
    assert constraint.reverse_transform != identity
def test___init___reject_sampling(self):
    """Test ``Constraint.__init__`` when ``'reject_sampling'`` is passed.

    With ``'reject_sampling'`` the ``transform`` and ``reverse_transform``
    methods are expected to be replaced by the identity while
    ``filter_valid`` stays untouched.

    Input:
        - reject_sampling

    Side effects:
        - filter_valid != identity
        - transform == identity
        - reverse_transform == identity
    """
    # Run
    constraint = Constraint(handling_strategy='reject_sampling')
    identity = constraint._identity

    # Asserts
    assert constraint.filter_valid != identity
    assert constraint.transform == identity
    assert constraint.reverse_transform == identity
def test_transform_invalid_table_data(self):
    """Test the ``Constraint.transform`` method with invalid table data.

    The constraint declares column ``'a'`` but the table is empty, so
    ``transform`` is expected to raise.

    Input:
        - An empty ``pandas.DataFrame``.

    Side Effects:
        - ``MissingConstraintColumnError`` is raised.
    """
    # Setup
    instance = Constraint(handling_strategy='transform')
    instance._transform = lambda x: x
    # Set the public ``constraint_columns`` attribute (not a private
    # ``_constraint_columns``), and use a real one-element tuple: ``('a')``
    # without the trailing comma is just the string ``'a'``.
    instance.constraint_columns = ('a', )

    # Run / Assert
    with pytest.raises(MissingConstraintColumnError):
        instance.transform(pd.DataFrame())
def test_transform_model_enabled_reject_sampling(self):
    """Test the reject sampling of ``Constraint.transform``.

    The first ``sample`` call yields fewer rows than requested, so a second
    call is expected; the result is built from the rows of both calls.

    Setup:
        - ``_columns_model.sample`` returns two rows on the first call and
          four rows on the second one.

    Input:
        - Table with some missing columns.

    Output:
        - Transformed data with all columns.
    """
    # Setup
    constraint = Constraint(handling_strategy='transform')
    constraint._transform = lambda rows: rows
    constraint.constraint_columns = ('a', 'b')
    constraint._hyper_transformer = Mock()
    constraint._columns_model = Mock()

    constraint._columns_model.sample.side_effect = [
        pd.DataFrame([[1, 2], [1, 3]], columns=['a', 'b']),
        pd.DataFrame([[1, 4], [1, 5], [1, 6], [1, 7]], columns=['a', 'b']),
    ]
    constraint._hyper_transformer.transform.side_effect = [
        pd.DataFrame([[1]] * 5, columns=['b']),
    ]
    constraint._hyper_transformer.reverse_transform = lambda rows: rows

    # Run
    frame = pd.DataFrame([[1]] * 5, columns=['b'])
    result = constraint.transform(frame)

    # Assert
    expected = pd.DataFrame(
        [[1, 2], [1, 3], [1, 4], [1, 5], [1, 6]], columns=['a', 'b'])
    sample_calls = constraint._columns_model.sample.mock_calls
    assert len(sample_calls) == 2
    constraint._columns_model.sample.assert_any_call(num_rows=5, conditions={'b': 1})
    # Each mock call unpacks as (name, args, kwargs); index 2 holds the keyword args.
    assert sample_calls[1][2]['num_rows'] > 3
    pd.testing.assert_frame_equal(result, expected)
def test_transform_calls__transform(self):
    """Test that ``Constraint.transform`` returns the ``_transform`` output.

    Setup:
        - Mocked constraint whose ``_validate_columns`` returns ``True`` and
          whose ``_transform`` returns a fixed value.

    Input:
        - Anything

    Output:
        - Result of ``_transform(input)``
    """
    # Setup
    mocked_constraint = Mock()
    mocked_constraint._validate_columns.return_value = True
    mocked_constraint._transform.return_value = 'the_transformed_data'

    # Run
    result = Constraint.transform(mocked_constraint, 'input')

    # Assert
    assert result == 'the_transformed_data'
def test_transform_calls__transform(self):
    """Test that ``Constraint.transform`` returns the ``_transform`` output.

    Setup:
        - Mocked constraint with no ``constraint_columns`` whose
          ``_transform`` returns a fixed value.

    Input:
        - An empty DataFrame

    Output:
        - Result of ``_transform(input)``
    """
    # Setup
    mocked_constraint = Mock()
    mocked_constraint._transform.return_value = 'the_transformed_data'
    mocked_constraint.constraint_columns = []

    # Run
    result = Constraint.transform(mocked_constraint, pd.DataFrame())

    # Assert
    assert result == 'the_transformed_data'
def test_transform_all_columns_missing(self):
    """Test ``Constraint.transform`` when every constraint column is missing.

    The constraint declares column ``'a'`` and the table is empty, so
    ``transform`` is expected to raise.

    Input:
        - An empty DataFrame

    Side Effects:
        - ``MissingConstraintColumnError`` is raised.
    """
    # Setup
    constraint = Constraint()
    constraint._transform = lambda rows: rows
    constraint.constraint_columns = ('a', )
    empty_frame = pd.DataFrame()

    # Run / Assert
    with pytest.raises(MissingConstraintColumnError):
        constraint.transform(empty_frame)