def test_flatten_model(self):
    """flatten_model returns a pandas.Series with all the params to recreate a model."""
    # Setup: fit a Gaussian copula on the 3x3 identity matrix.
    model = GaussianMultivariate()
    model.fit(np.eye(3))

    # Flattened covariance entries plus per-column marginal parameters.
    expected = pd.Series({
        'covariance__0__0': 1.5000000000000004,
        'covariance__1__0': -0.7500000000000003,
        'covariance__1__1': 1.5000000000000004,
        'covariance__2__0': -0.7500000000000003,
        'covariance__2__1': -0.7500000000000003,
        'covariance__2__2': 1.5000000000000007,
        'distribs__0__mean': 0.33333333333333331,
        'distribs__0__std': -0.7520386983881371,
        'distribs__1__mean': 0.33333333333333331,
        'distribs__1__std': -0.7520386983881371,
        'distribs__2__mean': 0.33333333333333331,
        'distribs__2__std': -0.7520386983881371,
    })
    modeler = Modeler(MagicMock())

    # Run
    flat = modeler.flatten_model(model)

    # Check: element-wise numeric closeness (float results may vary in the last ulp).
    assert np.isclose(flat, expected).all()
def test_model_database(self):
    """Test model using RCPA"""
    # Setup: rcpa records each table it is asked to process.
    def fake_rcpa(table_name, tables):
        tables[table_name] = table_name

    modeler = Mock()
    modeler.metadata.get_tables.return_value = ['foo', 'bar', 'tar']
    modeler.metadata.get_parents.side_effect = [None, 'bar_parent', None]
    modeler.rcpa.side_effect = fake_rcpa
    modeler.models = {}

    # Run the unbound method against the mock instance.
    Modeler.model_database(modeler)

    # Asserts: get_parents is queried once per table, in metadata order.
    assert modeler.metadata.get_parents.call_count == 3
    assert modeler.metadata.get_parents.call_args_list == [
        call('foo'),
        call('bar'),
        call('tar'),
    ]
def test__get_model_dict(self):
    """_get_model_dict returns a pandas.Series with all the params to recreate a model."""
    # Setup: model the 3x3 identity matrix.
    identity = np.eye(3)

    # Flattened covariance entries plus per-column marginal parameters.
    expected = {
        'covariance__0__0': 1.5000000000000009,
        'covariance__1__0': -0.7500000000000003,
        'covariance__1__1': 1.5000000000000009,
        'covariance__2__0': -0.7500000000000003,
        'covariance__2__1': -0.7500000000000003,
        'covariance__2__2': 1.5000000000000007,
        'distribs__0__mean': 0.33333333333333331,
        'distribs__0__std': -0.7520386983881371,
        'distribs__1__mean': 0.33333333333333331,
        'distribs__1__std': -0.7520386983881371,
        'distribs__2__mean': 0.33333333333333331,
        'distribs__2__std': -0.7520386983881371,
    }
    modeler = Modeler(MagicMock())

    # Run
    params = modeler._get_model_dict(identity)

    # Check
    assert params == expected
def fit(self, metadata, tables=None, root_path=None):
    """Fit this SDV instance to the dataset data.

    Args:
        metadata (dict, str or Metadata):
            Metadata dict, path to the metadata JSON file or
            Metadata instance itself.
        tables (dict):
            Dictionary with the table names as key and ``pandas.DataFrame``
            instances as values. If ``None`` is given, the tables will be
            loaded from the paths indicated in ``metadata``. Defaults to ``None``.
        root_path (str or None):
            Path to the dataset directory. If ``None`` and metadata is
            a path, the metadata location is used. If ``None`` and
            metadata is a dict, the current working directory is used.
    """
    # Normalize the metadata argument into a Metadata instance.
    if not isinstance(metadata, Metadata):
        metadata = Metadata(metadata, root_path)

    self.metadata = metadata
    self.metadata.validate(tables)

    # Model every table, then build a sampler over the fitted models.
    self.modeler = Modeler(self.metadata, self.model, self.model_kwargs)
    self.modeler.model_database(tables)
    self.sampler = Sampler(self.metadata, self.modeler.models, self.model, self.model_kwargs)
def test_model_database_kde_distribution(self):
    """model_database works fine with kde distribution."""
    # Setup: a modeler over the class-level navigator with KDE marginals.
    kde_modeler = Modeler(data_navigator=self.dn, distribution=KDEUnivariate)

    # Run: must complete without raising.
    kde_modeler.model_database()
def test_model_database_vine_modeler_single_table(self): """model_database works fine with vine modeler.""" # Setup data_navigator = MagicMock(spec=DataNavigator) modeler = Modeler(data_navigator=data_navigator, model=VineCopula) # Setup - Mock data = pd.DataFrame({ 'column_A': list('abdc'), 'column_B': range(4) }) meta = { 'name': 'table_name', 'fields': { 'column_A': { 'name': 'A', 'type': 'categorical' }, 'column_B': { 'name': 'B', 'type': 'number', 'subtype': 'integer' } } } data_navigator.tables = { 'table_name': Table(data, meta) } data_navigator.get_parents.return_value = set() data_navigator.get_children.return_value = set() data_navigator.transformed_data = { 'table_name': pd.DataFrame({ 'column_A': [0.1, 0.2, 0.5, 1.0], 'column_B': range(4) }) } data_navigator.meta = { 'tables': [ { 'name': meta } ] } data_navigator.ht = MagicMock() data_navigator.ht.transformers = { ('table_name', 'column_A'): None, ('table_name', 'column_B'): None } # Run modeler.model_database() # Check assert 'table_name' in modeler.models sampler = Sampler(data_navigator, modeler) samples = sampler.sample_all() assert 'table_name' in samples
def fit(self):
    """Transform the data and model the database."""
    # Load the dataset described by the metadata file.
    loader = CSVDataLoader(self.meta_file_name)
    self.dn = loader.load_data()

    # Transform the data before modeling it.
    self.dn.transform_data()

    self.modeler = Modeler(self.dn)
    self.modeler.model_database()
    self.sampler = Sampler(self.dn, self.modeler)
def test_model_database(self, rcpa_mock, impute_mock, fit_mock): """model_database computes conditions between tables and models them.""" # Setup data_navigator = MagicMock(spec=DataNavigator) modeler = Modeler(data_navigator) data_navigator.tables = OrderedDict() data_navigator.tables['table_A'] = 'table_A_dataframe' data_navigator.tables['table_B'] = 'table_B_dataframe' data_navigator.tables['table_C'] = 'table_C_dataframe' parents = { 'table_A': {}, 'table_B': {'table_A'}, 'table_C': {'table_B'} } data_navigator.get_parents.side_effect = lambda x: parents[x] def rcpa_side_effect(*args): modeler.tables = data_navigator.tables rcpa_mock.side_effect = rcpa_side_effect impute_mock.side_effect = ['TABLE_A', 'TABLE_B', 'TABLE_C'] fit_mock.side_effect = lambda x: 'model_for_{}'.format(x) # Run modeler.model_database() # Check assert data_navigator.get_parents.call_args_list == [ (('table_A', ), ), (('table_B', ), ), (('table_C', ), ), ] rcpa_mock.assert_called_once_with('table_A') assert impute_mock.call_args_list == [ (('table_A_dataframe', ), ), (('table_B_dataframe', ), ), (('table_C_dataframe', ), ), ] assert fit_mock.call_args_list == [ (('TABLE_A', ), ), (('TABLE_B', ), ), (('TABLE_C', ), ), ] assert modeler.models == { 'table_A': 'model_for_TABLE_A', 'table_B': 'model_for_TABLE_B', 'table_C': 'model_for_TABLE_C' }
def test_model_database_raises(self, rcpa_mock):
    """If the models raise an exception, it prints a custom message."""
    # Setup: rcpa blows up for every table.
    navigator = MagicMock()
    modeler = Modeler(navigator)
    navigator.tables = ['table_1', 'table_2']
    navigator.get_parents.return_value = False
    rcpa_mock.side_effect = ValueError('value error!')

    # Run / Check: the error must propagate to the caller.
    with self.assertRaises(ValueError):
        modeler.model_database()
def test__create_extension(self, impute_mock, fit_mock, flatten_mock):
    """Tests that the create extension method returns correct parameters."""
    # Setup
    data_navigator = MagicMock()
    modeler = Modeler(data_navigator)
    table = pd.DataFrame({
        'foreign': [0, 1, 0, 1, 0, 1],
        'a': [0, 1, 0, 1, 0, 1],
        'b': [1, 2, 3, 4, 5, 6]
    })
    # Child rows belonging to one parent value.
    foreign = table[table.a == 0]
    table_info = ('foreign', 'child')

    impute_mock.return_value = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    fit_mock.return_value = 'fitted model'
    flatten_mock.return_value = pd.Series({
        'covariance__0__0': 0.0,
        'covariance__1__0': 0.0,
        'covariance__1__1': 1.4999999999999991,
        'distribs__a__mean': 0.0,
        'distribs__a__std': 0.001,
        'distribs__b__mean': 3.0,
        'distribs__b__std': 1.632993161855452
    })

    # Run
    result = modeler._create_extension(foreign, table, table_info)

    # Check: the extension is exactly the flattened model.
    assert result.equals(flatten_mock.return_value)

    # The child table minus the foreign key, restricted to the parent's rows.
    df = pd.DataFrame({'a': [0, 1, 0, 1, 0, 1], 'b': [1, 2, 3, 4, 5, 6]})
    df = df.loc[foreign.index]

    # impute is called exactly once, with the sliced frame as the single
    # positional argument and no keyword arguments.
    # (was `assert len(...)`, a bare truthiness check that would pass on
    # any number of extra calls)
    assert len(impute_mock.call_args_list) == 1
    call_args = impute_mock.call_args_list[0]
    assert len(call_args[0]) == 1
    assert call_args[0][0].equals(df)
    assert call_args[1] == {}

    fit_mock.assert_called_once_with(impute_mock.return_value)
    flatten_mock.assert_called_once_with('fitted model', 'child')
def test__create_extension_wrong_index_return_none(self):
    """_create_extension return None if transformed_child_table can't be indexed by df."""
    # Setup: the child table has rows 0..2, but df asks for index 5..9.
    modeler = Modeler(MagicMock())
    child_table = pd.DataFrame(np.eye(3), columns=['A', 'B', 'C'])
    disjoint_df = pd.DataFrame(index=range(5, 10))

    # Run
    result = modeler._create_extension(disjoint_df, child_table, ('', ''))

    # Check: disjoint indexes yield no extension.
    assert result is None
def fit(self):
    """Transform the data and model the database.

    Raises:
        ValueError: If the provided dataset has an unsupported structure.
    """
    # Load the dataset described by the metadata file.
    loader = CSVDataLoader(self.meta_file_name)
    self.dn = loader.load_data()

    # Reject datasets we cannot model before doing any heavy work.
    self._check_unsupported_dataset_structure()

    self.dn.transform_data()
    self.modeler = Modeler(self.dn)
    self.modeler.model_database()
    self.sampler = Sampler(self.dn, self.modeler)
def test__flatten_array_list(self):
    """_flatten_array return a dict formed from the input list"""
    # Setup: a 3x3 identity matrix as nested lists.
    matrix = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]
    # Expected keys are 'row__col' pairs mapped to the cell values.
    expected = {
        '{}__{}'.format(row, col): matrix[row][col]
        for row in range(3)
        for col in range(3)
    }

    # Run
    result = Modeler._flatten_array(matrix)

    # Check
    assert result == expected
def test_fit_model_distribution_arg(self):
    """fit_model will pass self.distribution FQN to modeler."""
    # Setup: a model mock that compares equal to anything, so the
    # Modeler's default-model comparison succeeds.
    model_mock = MagicMock()
    model_mock.__eq__.return_value = True
    model_mock.__ne__.return_value = False
    modeler = Modeler(
        data_navigator='navigator', model=model_mock, distribution=KDEUnivariate)

    data = pd.DataFrame({'column': [0, 1, 1, 1, 0]})

    # Run
    modeler.fit_model(data)

    # Check: the distribution is forwarded as its fully qualified name.
    model_mock.assert_called_once_with(
        distribution='copulas.univariate.kde.KDEUnivariate')
def setUpClass(cls):
    # Load, transform and model the demo dataset once for all tests in the class.
    loader = CSVDataLoader('tests/data/meta.json')
    cls.data_navigator = loader.load_data()
    cls.data_navigator.transform_data()
    cls.modeler = Modeler(cls.data_navigator)
    cls.modeler.model_database()
def test_cpa_with_tables_no_primary_key(self):
    """Test CPA with tables and no primary key."""
    # Setup: a Modeler mock whose metadata transforms the raw table and
    # reports that it has no primary key.
    modeler = Mock(spec=Modeler)
    modeler.metadata = Mock(spec=Metadata)
    modeler.models = {}
    modeler.metadata.transform.return_value = pd.DataFrame({'data': [1, 2, 3]})
    modeler.metadata.get_primary_key.return_value = None
    modeler._fit_model.return_value = 'fitted model'

    tables = {'test': pd.DataFrame({'data': ['a', 'b', 'c']})}

    # Run the unbound method against the mock instance.
    result = Modeler.cpa(modeler, 'test', tables)

    # Asserts
    transformed = pd.DataFrame({'data': [1, 2, 3]})
    raw = pd.DataFrame({'data': ['a', 'b', 'c']})

    # The table came from `tables`, so it is never loaded from disk.
    assert modeler.metadata.load_table.call_count == 0
    # transform receives the table name and the raw frame.
    assert modeler.metadata.transform.call_args[0][0] == 'test'
    pd.testing.assert_frame_equal(
        modeler.metadata.transform.call_args[0][1], raw)
    # The transformed frame is what gets modeled and returned.
    pd.testing.assert_frame_equal(modeler._fit_model.call_args[0][0], transformed)
    pd.testing.assert_frame_equal(result, transformed)
def test__get_extensions(self):
    """Test get list of extensions from childs"""
    # Setup: the model mock returns itself when instantiated and yields a
    # different parameter dict on each of the three fits.
    model = Mock(spec=SDVModel)
    model.return_value = model
    model.get_parameters.side_effect = [
        {'model': 'data 1'},
        {'model': 'data 2'},
        {'model': 'data 3'},
    ]

    modeler = Mock(spec=Modeler)
    modeler.model = model
    modeler.model_kwargs = {}
    modeler.metadata = Mock(spec=Metadata)

    child_table = pd.DataFrame({'foo': ['aaa', 'bbb', 'ccc']})

    # Run
    result = Modeler._get_extension(modeler, 'some_name', child_table, 'foo')

    # Asserts: one row per foreign-key value, parameters flattened under
    # the '__some_name__' prefix, plus a per-group row count.
    expected = pd.DataFrame(
        {
            '__some_name__model': ['data 1', 'data 2', 'data 3'],
            '__some_name__child_rows': [1, 1, 1]
        },
        index=['aaa', 'bbb', 'ccc'])

    pd.testing.assert_frame_equal(result, expected)
    assert model.get_parameters.call_count == 3
def test__get_extensions(self):
    """Test get list of extensions from childs"""
    # Setup: _get_model_dict yields a different dict on each of the three calls.
    modeler = Mock()
    modeler._get_model_dict.side_effect = [
        {'model': 'data 1'},
        {'model': 'data 2'},
        {'model': 'data 3'},
    ]

    child_table = pd.DataFrame({'foo': ['aaa', 'bbb', 'ccc']})

    # Run
    result = Modeler._get_extension(modeler, 'some_name', child_table, 'foo')

    # Asserts: one row per foreign-key value, parameters flattened under
    # the '__some_name__' prefix, plus a per-group row count.
    expected = pd.DataFrame(
        {
            '__some_name__model': ['data 1', 'data 2', 'data 3'],
            '__some_name__child_rows': [1, 1, 1]
        },
        index=['aaa', 'bbb', 'ccc'])

    pd.testing.assert_frame_equal(result, expected)
    assert modeler._get_model_dict.call_count == 3
def test__flatten_dict_missing_keys_gh_89(self): """flatten_dict will only ignore keys that don't have dict or list values. https://github.com/HDI-Project/SDV/issues/89 """ # Setup nested_dict = { 'covariance': [ [1.4999999999999991, 1.4999999999999991, 1.4999999999999991], [1.4999999999999991, 1.4999999999999991, 1.4999999999999991], [1.4999999999999991, 1.4999999999999991, 1.4999999999999991]], 'distribs': { 'type': { 'type': 'copulas.univariate.gaussian.GaussianUnivariate', 'fitted': True, 'mean': 4.0, 'std': 2.449489742783178 }, 'distribution': { 'type': 'copulas.univariate.gaussian.GaussianUnivariate', 'fitted': True, 'mean': 5.0, 'std': 2.449489742783178 }, 'fitted': { 'type': 'copulas.univariate.gaussian.GaussianUnivariate', 'fitted': True, 'mean': 6.0, 'std': 2.449489742783178 } }, 'type': 'copulas.multivariate.gaussian.GaussianMultivariate', 'fitted': True, 'distribution': 'copulas.univariate.gaussian.GaussianUnivariate' } expected_result = { 'covariance__0__0': 1.4999999999999991, 'covariance__0__1': 1.4999999999999991, 'covariance__0__2': 1.4999999999999991, 'covariance__1__0': 1.4999999999999991, 'covariance__1__1': 1.4999999999999991, 'covariance__1__2': 1.4999999999999991, 'covariance__2__0': 1.4999999999999991, 'covariance__2__1': 1.4999999999999991, 'covariance__2__2': 1.4999999999999991, 'distribs__type__mean': 4.0, 'distribs__type__std': 2.449489742783178, 'distribs__distribution__mean': 5.0, 'distribs__distribution__std': 2.449489742783178, 'distribs__fitted__mean': 6.0, 'distribs__fitted__std': 2.449489742783178 } # Run result = Modeler._flatten_dict(nested_dict) # Check assert result == expected_result
class SDV:
    """Class to do modeling and sampling all in one."""

    def __init__(self, meta_file_name, data_loader_type='csv'):
        """Initialize sdv class.

        Args:
            meta_file_name (str): Path to the metadata file describing the dataset.
            data_loader_type (str): Loader kind; currently only ``'csv'`` is used.
                Defaults to ``'csv'``.
        """
        self.meta_file_name = meta_file_name
        # Populated by fit(); initialized here so "not fitted" state is
        # explicit (None) instead of raising AttributeError on early access.
        self.dn = None
        self.modeler = None
        self.sampler = None

    def fit(self):
        """Transform the data and model the database."""
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()

        # transform data
        self.dn.transform_data()
        self.modeler = Modeler(self.dn)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def _check_fitted(self):
        """Raise NotFittedError if fit() has not been called yet."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

    def sample_rows(self, table_name, num_rows):
        """Wrapper for Sampler.sample_rows."""
        self._check_fitted()
        return self.sampler.sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Wrapper for Sampler.sample_table."""
        self._check_fitted()
        return self.sampler.sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Wrapper for Sampler.sample_all."""
        self._check_fitted()
        return self.sampler.sample_all(num_rows)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename (str): Path where the pickled instance will be stored.
        """
        # was documented as ``file_destination`` — the parameter is ``filename``
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
def test___init__distribution_no_default_model(self):
    """Check that __init__ with distribution and model is not the default raise exception"""
    # A distribution may only be given together with the default model.
    with self.assertRaises(ValueError):
        Modeler(data_navigator=None, model=VineCopula, distribution=KDEUnivariate)
def test__flatten_array(self):
    """Test get flatten array"""
    # Setup: a list mixing a nested list and a scalar entry.
    nested = [['foo', 'bar'], 'tar']

    # Run
    result = Modeler._flatten_array(nested, prefix='test')

    # Check: nested items get one index segment per level, scalars just one.
    expected = {
        'test__0__0': 'foo',
        'test__0__1': 'bar',
        'test__1': 'tar',
    }
    assert result == expected
def test___init__with_arguments(self):
    # Run: build a Modeler with explicit model and kwargs.
    modeler = Modeler(
        {'some': 'metadata'}, model=VineCopula, model_kwargs={'some': 'kwargs'})

    # Asserts: everything is stored untouched and no models exist yet.
    assert modeler.models == {}
    assert modeler.metadata == {'some': 'metadata'}
    assert modeler.model is VineCopula
    assert modeler.model_kwargs == {'some': 'kwargs'}
def test_CPA(self, extensions_mock, merge_mock): """CPA will append extensions to the original table.""" # Setup data_navigator = MagicMock(spec=DataNavigator) table = Table(pd.DataFrame({'table_pk': range(5)}), {'primary_key': 'table_pk'}) data_navigator.tables = {'table': table} transformed_table = pd.DataFrame({'table_pk': range(5)}) data_navigator.transformed_data = {'table': transformed_table} data_navigator.get_children.return_value = 'children of table' modeler = Modeler(data_navigator) extension = MagicMock() extensions_mock.return_value = [extension] extended_table = MagicMock() merge_mock.return_value = extended_table table_name = 'table' # Run modeler.CPA(table_name) # Check assert modeler.tables[table_name] == extended_table extensions_mock.assert_called_once_with(modeler, 'table_pk', 'children of table') merge_mock.assert_called_once_with(transformed_table, extension.reset_index.return_value, how='left', on='table_pk') data_navigator.get_children.assert_called_once_with('table') extension.reset_index.assert_called_once_with() extended_table.drop.assert_not_called() call_args_list = extended_table.__setitem__.call_args_list assert len(call_args_list) == 1 args, kwargs = call_args_list[0] assert kwargs == {} assert len(args) == 2 assert args[0] == 'table_pk' assert args[1].equals(transformed_table['table_pk'])
def test___init__default(self):
    """Test create new Modeler instance with default values"""
    # Run: only the metadata argument is provided.
    modeler = Modeler('test')

    # Asserts: Gaussian copula model, no kwargs, empty model registry.
    assert modeler.models == {}
    assert modeler.metadata == 'test'
    assert modeler.model is GaussianMultivariate
    assert modeler.model_kwargs == {}
def test__get_extensions(self, get_foreign_mock, extension_mock): """_get_extensions return the conditional modelling parameters for each children.""" # Setup data_navigator = MagicMock() first_table_data = pd.DataFrame({'foreign_key': [0, 1]}) first_table_meta = {'fields': []} data_navigator.tables = { 'first_children': Table(first_table_data, first_table_meta), 'second_children': Table(first_table_data, first_table_meta), } data_navigator.get_children.return_value = {} modeler = Modeler(data_navigator) modeler.tables = {} extension_mock.side_effect = lambda x, y, z: None get_foreign_mock.return_value = 'foreign_key' pk = 'primary_key' children = ['first_children', 'second_children'] expected_result = [ pd.DataFrame([{ '__first_children_column_1': 1, '__first_children_column_2': 2 }]), pd.DataFrame([{ '__second_children_column_1': 1, '__second_children_column_2': 2 }]) ] # Run result = modeler._get_extensions(pk, children) # Check assert all([ result[index].equals(expected_result[index]) for index in range(len(result)) ])
def test__flatten_dict_flat_dict(self):
    """_flatten_dict don't modify flat dicts."""
    # A dict with no nested containers must come back unchanged.
    flat = {'a': 1, 'b': 2}

    result = Modeler._flatten_dict(flat)

    assert result == {'a': 1, 'b': 2}
def test__impute(self):
    """Test _impute data"""
    # Setup: one numeric and one string column, each with a missing value.
    data = pd.DataFrame({'foo': [0, None, 1], 'bar': ['a', None, 'b']})

    # Run
    result = Modeler._impute(data)

    # Asserts: the numeric NaN becomes 0.5 (mean of 0 and 1); the string
    # NaN becomes 'a'.
    expected = pd.DataFrame({'foo': [0, 0.5, 1], 'bar': ['a', 'a', 'b']})
    pd.testing.assert_frame_equal(result, expected)
def test_impute_table_with_mean_default(self):
    """If a column only has NaN, impute_table fills it with 0.(+EPSILON).

    If a column has no mean (all values are null), then the NaN values are
    replaced with 0. Then, it will transform like a constant column, adding
    copulas.EPSILON at the first element.
    """
    # Setup: column 'A' is entirely null; 'B' and 'C' are complete.
    table = pd.DataFrame([
        {
            'A': np.nan,
            'B': 2.,
            'C': 2.
        },
        {
            'A': np.nan,
            'B': 3.,
            'C': 3.
        },
        {
            'A': np.nan,
            'B': 4.,
            'C': 4.
        },
    ])
    expected_result = pd.DataFrame([
        {
            'A': EPSILON,
            'B': 2.,
            'C': 2.
        },
        {
            'A': 0.,
            'B': 3.,
            'C': 3.
        },
        {
            'A': 0.,
            'B': 4.,
            'C': 4.
        },
    ])

    # Run
    result = Modeler.impute_table(table)

    # Check
    assert result.equals(expected_result)

    # No null values are left anywhere in the table.
    # (was `.all().all()`, which only verified that not EVERY cell was null
    # and would pass even if some NaN remained)
    assert not result.isnull().any().any()
def test_impute_table_with_mean(self):
    """impute_table fills all NaN values the mean of values when possible."""
    # Setup: each column has exactly one NaN, and the mean of the two
    # remaining values is 3 in every column.
    table = pd.DataFrame([
        {
            'A': np.nan,
            'B': 2.,
            'C': 4.
        },
        {
            'A': 4.,
            'B': np.nan,
            'C': 2.
        },
        {
            'A': 2.,
            'B': 4.,
            'C': np.nan
        },
    ])
    expected_result = pd.DataFrame([
        {
            'A': 3.,
            'B': 2.,
            'C': 4.
        },
        {
            'A': 4.,
            'B': 3.,
            'C': 2.
        },
        {
            'A': 2.,
            'B': 4.,
            'C': 3.
        },
    ])

    # Run
    result = Modeler.impute_table(table)

    # Check
    assert result.equals(expected_result)

    # No null values are left anywhere in the table.
    # (was `.all().all()`, which only verified that not EVERY cell was null
    # and would pass even if some NaN remained)
    assert not result.isnull().any().any()

    # Averages, not zeros, were used to fill every column.
    for column in result:
        assert 0 not in result[column].values