def test_convert_column_name_value_to_id_multiple(self):
    """Test ``convert_column_name_value_to_id`` with several discrete columns.

    With columns [continuous 'x', discrete 'y', discrete 'z'], looking up a
    value of 'z' should report its overall column index, its index among the
    discrete columns only, and the index of the value inside the one-hot
    encoding (taken from the argmax of the mocked encoder output).
    """
    # Mocked encoder: argmax of [[0, 1, 0]] is index 1.
    ohe = Mock()
    ohe.transform.return_value = np.array(
        [[0, 1, 0]  # one hot encoding, second dimension
         ])
    transformer = DataTransformer()
    transformer._column_transform_info_list = [
        ColumnTransformInfo(
            column_name='x', column_type='continuous', transform=None,
            transform_aux=None,
            output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
            output_dimensions=1 + 3),
        ColumnTransformInfo(column_name='y', column_type='discrete',
                            transform=ohe, transform_aux=None,
                            output_info=[SpanInfo(2, 'softmax')],
                            output_dimensions=2),
        ColumnTransformInfo(column_name='z', column_type='discrete',
                            transform=ohe, transform_aux=None,
                            output_info=[SpanInfo(2, 'softmax')],
                            output_dimensions=2)
    ]
    result = transformer.convert_column_name_value_to_id('z', 'yes')
    assert result['column_id'] == 2  # this is the 3rd column
    assert result[
        'discrete_column_id'] == 1  # this is the 2nd discrete column
    assert result[
        'value_id'] == 1  # this is the 2nd dimension in the one hot encoding
def test__apply_activate_(self):
    """Test ``_apply_activate`` for tables with both continuous and categoricals.

    Check every continuous (tanh) span has all values between -1 and 1
    (since they are normalized), and every categorical (softmax) span has
    all values between 0 and 1.

    Setup:
        - Mock ``self._transformer.output_info_list``.

    Input:
        - data = tensor of shape (N, data_dims)

    Output:
        - tensor of shape (N, data_dims)
    """
    model = CTGANSynthesizer()
    model._transformer = Mock()
    # Column layout implied by the spans: dims 0-2 softmax, dim 3 tanh,
    # dims 4-5 softmax.
    model._transformer.output_info_list = [
        [SpanInfo(3, 'softmax')],
        [SpanInfo(1, 'tanh'), SpanInfo(2, 'softmax')]
    ]
    data = torch.randn(100, 6)
    result = model._apply_activate(data)
    assert result.shape == (100, 6)
    _assert_is_between(result[:, 0:3], 0.0, 1.0)
    # Fixed: the original ``result[:3]`` sliced the first three ROWS (all
    # columns), so the tanh span was never checked on its own; the tanh
    # output is column 3.
    _assert_is_between(result[:, 3:4], -1.0, 1.0)
    _assert_is_between(result[:, 4:6], 0.0, 1.0)
def test_fit(self):
    """Test 'fit' on a table with one continuous and one discrete column.

    'fit' must dispatch to the mocked '_fit_continuous' and '_fit_discrete'
    (each exactly once) and accumulate 'output_dimensions' from the
    per-column 'ColumnTransformInfo' results (4 + 2 = 6).

    Setup:
        - Create DataTransformer with '_fit_continuous' / '_fit_discrete'
          replaced by mocks returning canned 'ColumnTransformInfo' objects.

    Input:
        - A table with one continuous ('x') and one discrete ('y') column.
        - discrete_columns = list naming the discrete column.

    Side Effects:
        - Each '_fit_*' mock is called once.
        - 'output_dimensions' is the sum of the per-column dimensions.
    """
    # Canned per-column fit results.
    continuous_info = ColumnTransformInfo(
        column_name="x", column_type="continuous", transform=None,
        transform_aux=None,
        output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
        output_dimensions=4)
    discrete_info = ColumnTransformInfo(
        column_name="y", column_type="discrete", transform=None,
        transform_aux=None, output_info=[SpanInfo(2, 'softmax')],
        output_dimensions=2)

    transformer = DataTransformer()
    transformer._fit_continuous = Mock(return_value=continuous_info)
    transformer._fit_discrete = Mock(return_value=discrete_info)

    table = pd.DataFrame({
        "x": np.random.random(size=100),
        "y": np.random.choice(["yes", "no"], size=100)
    })
    transformer.fit(table, discrete_columns=["y"])

    transformer._fit_continuous.assert_called_once()
    transformer._fit_discrete.assert_called_once()
    assert transformer.output_dimensions == 6
def test_fit(self):
    """Test ``fit`` on a table with one continuous and one discrete column.

    ``fit`` must call the mocked ``_fit_continuous`` and ``_fit_discrete``
    exactly once each and sum the per-column ``output_dimensions``
    (4 + 2 = 6).

    Setup:
        - Create a ``DataTransformer`` whose per-column fitting methods are
          stubbed with mocks returning canned ``ColumnTransformInfo``s.

    Input:
        - A table with one continuous (``x``) and one discrete (``y``) column.
        - A list naming the discrete column.

    Side Effects:
        - Each ``_fit_*`` mock is called once.
        - ``output_dimensions`` is the sum of the per-column dimensions.
    """
    # Setup
    transformer = DataTransformer()
    transformer._fit_continuous = Mock(return_value=ColumnTransformInfo(
        column_name='x', column_type='continuous', transform=None,
        output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
        output_dimensions=4))
    transformer._fit_discrete = Mock(return_value=ColumnTransformInfo(
        column_name='y', column_type='discrete', transform=None,
        output_info=[SpanInfo(2, 'softmax')],
        output_dimensions=2))
    table = pd.DataFrame({
        'x': np.random.random(size=100),
        'y': np.random.choice(['yes', 'no'], size=100)
    })

    # Run
    transformer.fit(table, discrete_columns=['y'])

    # Assert
    transformer._fit_continuous.assert_called_once()
    transformer._fit_discrete.assert_called_once()
    assert transformer.output_dimensions == 6
def test__cond_loss(self):
    """Test `_cond_loss`.

    Test that the loss is purely a function of the target categorical.

    Setup:
        - mock transformer.output_info_list
        - create two categoricals, one continuous
        - compute the conditional loss, conditioned on the 1st categorical
        - compare the loss to the cross-entropy of the 1st categorical,
          manually computed

    Input:
        data - the synthetic data generated by the model
        c - a tensor with the same shape as the data but with only a
            specific one-hot vector corresponding to the target column
            filled in
        m - binary mask used to select the categorical column to condition
            on

    Output:
        loss scalar; this should only be affected by the target column

    Note:
        - even though the implementation of this is probably right, I'm not
          sure if the idea behind it is correct
    """
    model = CTGANSynthesizer()
    model._transformer = Mock()
    model._transformer.output_info_list = [
        [SpanInfo(1, 'tanh'), SpanInfo(2, 'softmax')],
        [SpanInfo(3, 'softmax')
         ],  # this is the categorical column we are conditioning on
        [SpanInfo(2, 'softmax')
         ],  # this is the categorical column we are NOT conditioning on
    ]
    data = torch.tensor([
        # first 3 dims ignored, next 3 dims are the prediction,
        # last 2 dims are ignored
        [0.0, -1.0, 0.0, 0.05, 0.05, 0.9, 0.1, 0.4],
    ])
    c = torch.tensor([
        # first 3 dims are a one-hot for the categorical,
        # next 2 are for a different categorical that we are not
        # conditioning on
        # (continuous values are not stored in this tensor)
        [0.0, 0.0, 1.0, 0.0, 0.0],
    ])
    # this indicates that we are conditioning on the first categorical
    m = torch.tensor([[1, 0]])
    result = model._cond_loss(data, c, m)
    expected = torch.nn.functional.cross_entropy(
        torch.tensor([
            [0.05, 0.05, 0.9],  # 3 categories, one hot
        ]), torch.tensor([2]))
    assert (result - expected).abs() < 1e-3
def test_convert_column_name_value_to_id(self):
    """Test ``convert_column_name_value_to_id`` on a simple transform-info list.

    With columns [continuous 'x', discrete 'y'], looking up value 'yes' of
    'y' should return its overall column index (1), its index among the
    discrete columns only (0), and the position of the indicator inside the
    one-hot encoding (1, via the argmax of the mocked encoder output).

    Setup:
        - Mock ``_column_transform_info_list`` with a continuous and a
          discrete entry, the latter holding a mocked one-hot encoder.
    """
    # Setup: encoder mock whose one-hot puts the indicator in dimension 1.
    encoder = Mock()
    encoder.transform.return_value = pd.DataFrame([[0, 1]])

    transformer = DataTransformer()
    transformer._column_transform_info_list = [
        ColumnTransformInfo(
            column_name='x', column_type='continuous', transform=None,
            output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
            output_dimensions=4),
        ColumnTransformInfo(
            column_name='y', column_type='discrete', transform=encoder,
            output_info=[SpanInfo(2, 'softmax')], output_dimensions=2),
    ]

    # Run
    result = transformer.convert_column_name_value_to_id('y', 'yes')

    # Assert
    assert result['column_id'] == 1  # 2nd column overall
    assert result['discrete_column_id'] == 0  # 1st discrete column
    assert result['value_id'] == 1  # 2nd dimension of the one-hot
def test__transform_continuous(self, MockBGM):
    """Test ``_transform_continuous``.

    The mocked ``BayesGMMTransformer`` yields a normalized column and a
    component column; the result must place the normalized values in
    column 0 and one-hot encode the component into the remaining columns.

    Setup:
        - Mock the ``BayesGMMTransformer`` transform output.

    Input:
        - A ``ColumnTransformInfo`` object and a dataframe with one
          continuous column.

    Output:
        - np.array of shape (rows, 1 + n_components).
    """
    # Setup: BGM mock returning a fixed (normalized, component) frame.
    gmm = MockBGM.return_value
    gmm.transform.return_value = pd.DataFrame({
        'x.normalized': [0.1, 0.2, 0.3],
        'x.component': [0.0, 1.0, 1.0]
    })
    info = ColumnTransformInfo(
        column_name='x', column_type='continuous', transform=gmm,
        output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
        output_dimensions=4)
    raw = pd.DataFrame({'x': np.array([0.1, 0.3, 0.5])})

    # Run
    result = DataTransformer()._transform_continuous(info, raw)

    # Assert: column 0 = normalized values, columns 1-3 = component one-hot.
    expected = np.array([
        [0.1, 1, 0, 0],
        [0.2, 0, 1, 0],
        [0.3, 0, 1, 0],
    ])
    np.testing.assert_array_equal(result, expected)
def test_transform(self):
    """Test 'transform' on a table with one continuous and one discrete column.

    'transform' should route each column to the mocked
    '_transform_continuous' / '_transform_discrete' (once each) and
    concatenate their outputs into a single (3, 6) array:
    [cdf | component one-hot | discrete one-hot].

    Setup:
        - Mock _column_transform_info_list, _transform_discrete and
          _transform_continuous.
    """
    transformer = DataTransformer()
    transformer._column_transform_info_list = [
        ColumnTransformInfo(
            column_name="x", column_type="continuous", transform=None,
            transform_aux=None,
            output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
            output_dimensions=4),
        ColumnTransformInfo(
            column_name="y", column_type="discrete", transform=None,
            transform_aux=None, output_info=[SpanInfo(2, 'softmax')],
            output_dimensions=2)
    ]

    # Canned outputs for the per-column transform mocks.
    normalized = np.array([[0.1], [0.3], [0.5]])
    component_onehot = np.array([
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
    ])
    transformer._transform_continuous = Mock(
        return_value=(normalized, component_onehot))
    transformer._transform_discrete = Mock(return_value=[
        np.array([
            [0, 1],
            [0, 1],
            [1, 0],
        ])
    ])

    table = pd.DataFrame({
        "x": np.array([0.1, 0.3, 0.5]),
        "y": np.array(["yes", "yes", "no"])
    })

    result = transformer.transform(table)

    transformer._transform_continuous.assert_called_once()
    transformer._transform_discrete.assert_called_once()
    expected = np.array([
        [0.1, 1, 0, 0, 0, 1],
        [0.3, 1, 0, 0, 0, 1],
        [0.5, 1, 0, 0, 1, 0],
    ])
    assert result.shape == (3, 6)
    assert (result[:, 0] == expected[:, 0]).all(), "continuous-cdf"
    assert (result[:, 1:4] == expected[:, 1:4]).all(), "continuous-softmax"
    assert (result[:, 4:6] == expected[:, 4:6]).all(), "discrete"
def test__inverse_transform_continuous(self, MockBGM):
    """Test ``_inverse_transform_continuous``.

    The normalized column and the argmax of the one-hot component columns
    must be packed into a dataframe and handed to the mocked
    ``reverse_transform``, whose output is returned unchanged.

    Setup:
        - Mock the ``BayesGMMTransformer`` with fixed ``get_output_types``
          and ``reverse_transform`` results.

    Input:
        - A ``ColumnTransformInfo`` object, a np.ndarray whose first column
          is the normalized value and whose remaining columns are the
          one-hot component, plus sigmas/st placeholders (``None`` here).

    Output:
        - The dataframe produced by the mocked ``reverse_transform``.

    Side Effects:
        - ``reverse_transform`` is called with floats in the first column
          and a label encoding in the second.
    """
    # Setup: BGM mock with fixed output types and reverse_transform result.
    gmm = MockBGM.return_value
    gmm.get_output_types.return_value = {
        'x.normalized': 'numerical',
        'x.component': 'numerical'
    }
    gmm.reverse_transform.return_value = pd.DataFrame({
        'x.normalized': [0.1, 0.2, 0.3],
        'x.component': [0.0, 1.0, 1.0]
    })
    info = ColumnTransformInfo(
        column_name='x', column_type='continuous', transform=gmm,
        output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
        output_dimensions=4)
    column_data = np.array([
        [0.1, 1, 0, 0],
        [0.3, 0, 1, 0],
        [0.5, 0, 1, 0],
    ])

    # Run
    result = DataTransformer()._inverse_transform_continuous(
        info, column_data, None, None)

    # Assert: the mocked reverse_transform output is returned as-is ...
    expected = pd.DataFrame({
        'x.normalized': [0.1, 0.2, 0.3],
        'x.component': [0.0, 1.0, 1.0]
    })
    np.testing.assert_array_equal(result, expected)
    # ... and reverse_transform received normalized values + label encoding.
    expected_data = pd.DataFrame({
        'x.normalized': [0.1, 0.3, 0.5],
        'x.component': [0, 1, 1]
    })
    pd.testing.assert_frame_equal(
        gmm.reverse_transform.call_args[0][0], expected_data)
def test_transform(self):
    """Test ``transform`` on a table with one continuous and one discrete column.

    ``transform`` should route each column to the mocked
    ``_transform_continuous`` / ``_transform_discrete`` (once each) and
    concatenate their outputs into a single (3, 6) array:
    [cdf | component one-hot | discrete one-hot].

    Setup:
        - Initialize a ``DataTransformer`` with a transform-info list
          describing a continuous and a discrete column; mock the two
          per-column transform methods.
    """
    # Setup
    transformer = DataTransformer()
    transformer._column_transform_info_list = [
        ColumnTransformInfo(
            column_name='x', column_type='continuous', transform=None,
            output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
            output_dimensions=4),
        ColumnTransformInfo(
            column_name='y', column_type='discrete', transform=None,
            output_info=[SpanInfo(2, 'softmax')], output_dimensions=2),
    ]

    normalized = np.array([[0.1], [0.3], [0.5]])
    component_onehot = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [0, 1, 0],
    ])
    transformer._transform_continuous = Mock(
        return_value=np.concatenate((normalized, component_onehot), axis=1))
    transformer._transform_discrete = Mock(return_value=np.array([
        [0, 1],
        [0, 1],
        [1, 0],
    ]))

    table = pd.DataFrame({
        'x': np.array([0.1, 0.3, 0.5]),
        'y': np.array(['yes', 'yes', 'no'])
    })

    # Run
    result = transformer.transform(table)

    # Assert
    transformer._transform_continuous.assert_called_once()
    transformer._transform_discrete.assert_called_once()
    expected = np.array([
        [0.1, 1, 0, 0, 0, 1],
        [0.3, 0, 1, 0, 0, 1],
        [0.5, 0, 1, 0, 1, 0],
    ])
    assert result.shape == (3, 6)
    assert (result[:, 0] == expected[:, 0]).all(), 'continuous-cdf'
    assert (result[:, 1:4] == expected[:, 1:4]).all(), 'continuous-softmax'
    assert (result[:, 4:6] == expected[:, 4:6]).all(), 'discrete'