Example #1
    def test_model_database_kde_distribution(self):
        """model_database works fine with kde distribution."""
        # Setup
        modeler = Modeler(data_navigator=self.dn, distribution=KDEUnivariate)

        # Run
        modeler.model_database()
Example #2
    def test_model_database(self):
        """Test model using RCPA"""

        # Setup
        def rcpa_side_effect(table_name, tables):
            tables[table_name] = table_name

        metadata_table_names = ['foo', 'bar', 'tar']
        metadata_parents = [None, 'bar_parent', None]

        modeler = Mock()
        modeler.metadata.get_tables.return_value = metadata_table_names
        modeler.metadata.get_parents.side_effect = metadata_parents
        modeler.rcpa.side_effect = rcpa_side_effect
        modeler.models = dict()

        # Run
        Modeler.model_database(modeler)

        # Asserts
        expected_metadata_parents_call_count = 3
        expected_metadata_parents_call = [
            call('foo'), call('bar'), call('tar')
        ]
        assert modeler.metadata.get_parents.call_count == expected_metadata_parents_call_count
        assert modeler.metadata.get_parents.call_args_list == expected_metadata_parents_call
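
The test above drives Modeler.model_database through the class, passing a plain Mock in place of self, so only the method body under test runs while metadata, rcpa and models are all mocks. Below is a minimal, self-contained sketch of that same pattern; the Greeter class is hypothetical and exists only to illustrate it:

from unittest.mock import Mock, call


class Greeter:
    def greet_all(self, names):
        return [self.greet(name) for name in names]


def test_greet_all_with_mock_self():
    # Stand-in for ``self``: every attribute access returns a child mock.
    instance = Mock()
    instance.greet.side_effect = lambda name: 'hi ' + name

    # Call the method through the class so only ``greet_all`` itself executes.
    result = Greeter.greet_all(instance, ['ana', 'bob'])

    assert result == ['hi ana', 'hi bob']
    assert instance.greet.call_args_list == [call('ana'), call('bob')]
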
Example #3
    def test_model_database_vine_modeler_single_table(self):
        """model_database works fine with vine modeler."""
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator=data_navigator, model=VineCopula)

        # Setup - Mock
        data = pd.DataFrame({
            'column_A': list('abdc'),
            'column_B': range(4)
        })
        meta = {
            'name': 'table_name',
            'fields': {
                'column_A': {
                    'name': 'A',
                    'type': 'categorical'
                },
                'column_B': {
                    'name': 'B',
                    'type': 'number',
                    'subtype': 'integer'
                }
            }
        }

        data_navigator.tables = {
            'table_name': Table(data, meta)
        }
        data_navigator.get_parents.return_value = set()
        data_navigator.get_children.return_value = set()
        data_navigator.transformed_data = {
            'table_name': pd.DataFrame({
                'column_A': [0.1, 0.2, 0.5, 1.0],
                'column_B': range(4)
            })
        }
        data_navigator.meta = {
            'tables': [
                {
                    'name': meta
                }
            ]
        }
        data_navigator.ht = MagicMock()
        data_navigator.ht.transformers = {
            ('table_name', 'column_A'): None,
            ('table_name', 'column_B'): None
        }

        # Run
        modeler.model_database()

        # Check
        assert 'table_name' in modeler.models

        sampler = Sampler(data_navigator, modeler)
        samples = sampler.sample_all()
        assert 'table_name' in samples
Example #4
    def test_model_database(self, rcpa_mock, impute_mock, fit_mock):
        """model_database computes conditions between tables and models them."""
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator)

        data_navigator.tables = OrderedDict()
        data_navigator.tables['table_A'] = 'table_A_dataframe'
        data_navigator.tables['table_B'] = 'table_B_dataframe'
        data_navigator.tables['table_C'] = 'table_C_dataframe'

        parents = {
            'table_A': {},
            'table_B': {'table_A'},
            'table_C': {'table_B'}
        }
        data_navigator.get_parents.side_effect = lambda x: parents[x]

        def rcpa_side_effect(*args):
            modeler.tables = data_navigator.tables

        rcpa_mock.side_effect = rcpa_side_effect
        impute_mock.side_effect = ['TABLE_A', 'TABLE_B', 'TABLE_C']
        fit_mock.side_effect = lambda x: 'model_for_{}'.format(x)

        # Run
        modeler.model_database()

        # Check
        assert data_navigator.get_parents.call_args_list == [
            (('table_A', ), ),
            (('table_B', ), ),
            (('table_C', ), ),
        ]

        rcpa_mock.assert_called_once_with('table_A')
        assert impute_mock.call_args_list == [
            (('table_A_dataframe', ), ),
            (('table_B_dataframe', ), ),
            (('table_C_dataframe', ), ),
        ]

        assert fit_mock.call_args_list == [
            (('TABLE_A', ), ),
            (('TABLE_B', ), ),
            (('TABLE_C', ), ),
        ]

        assert modeler.models == {
            'table_A': 'model_for_TABLE_A',
            'table_B': 'model_for_TABLE_B',
            'table_C': 'model_for_TABLE_C'
        }
Example #5
    @patch('sdv.modeler.Modeler.RCPA')
    def test_model_database_raises(self, rcpa_mock):
        """If the models raise an exception, it prints a custom message."""
        # Setup
        data_navigator = MagicMock()
        modeler = Modeler(data_navigator)

        data_navigator.tables = ['table_1', 'table_2']
        data_navigator.get_parents.return_value = False
        rcpa_mock.side_effect = ValueError('value error!')

        # Run / Check
        with self.assertRaises(ValueError):
            modeler.model_database()
Example #6
File: sdv.py  Project: robertsievert/SDV
class SDV:
    """Class to do modeling and sampling all in one."""
    def __init__(self, meta_file_name, data_loader_type='csv'):
        """Initialize sdv class."""
        self.meta_file_name = meta_file_name
        self.sampler = None

    def fit(self):
        """Transform the data and model the database."""
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()
        # transform data
        self.dn.transform_data()
        self.modeler = Modeler(self.dn)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows):
        """Wrapper for Sampler.sample_rows."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Wrapper for Sampler.sample_table."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Wrapper for Sampler.sample_all."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')
        return self.sampler.sample_all(num_rows)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename (str): Path to store the file.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
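
A minimal usage sketch for the class above, assuming SDV can be imported as shown below and that a metadata file such as tests/data/meta.json (the path used by the test fixtures on this page) is available:

from sdv import SDV

sdv = SDV('tests/data/meta.json')
sdv.fit()                                    # load, transform and model the database

rows = sdv.sample_rows('DEMO_CUSTOMERS', 5)  # five synthetic rows from one table
tables = sdv.sample_all(num_rows=10)         # dict of table_name -> sampled DataFrame

sdv.save('sdv.pkl')                          # pickle the fitted instance
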
Example #7
    def test_model_database_gaussian_copula_single_table(self):
        """model_database can model a single table using the gausian copula model."""
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator=data_navigator,
                          model=GaussianMultivariate)

        # Setup - Mocks - DataNavigator
        table_data = pd.DataFrame({
            'column_A': list('abdc'),
            'column_B': range(4)
        })
        table_metadata = {
            'name': 'table_name',
            'fields': {
                'column_A': {
                    'name': 'column_A',
                    'type': 'categorical'
                },
                'column_B': {
                    'name': 'column_B',
                    'type': 'number',
                    'subtype': 'integer'
                }
            }
        }

        data_navigator.tables = {
            'table_name': Table(table_data, table_metadata)
        }
        data_navigator.get_parents.return_value = set()
        data_navigator.get_children.return_value = set()
        data_navigator.transformed_data = {
            'table_name':
            pd.DataFrame({
                'column_A': [0.1, 0.2, 0.5, 1.0],
                'column_B': range(4)
            })
        }
        metadata = {
            'name':
            'table_name',
            'fields': [{
                'name': 'column_A',
                'type': 'categorical'
            }, {
                'name': 'column_B',
                'type': 'number',
                'subtype': 'integer'
            }]
        }

        data_navigator.meta = {'tables': [metadata]}
        data_navigator.ht = MagicMock()
        data_navigator.ht.transformers = {
            ('table_name', 'column_A'): None,
            ('table_name', 'column_B'): None
        }

        # Run
        modeler.model_database()

        # Check
        assert 'table_name' in modeler.models

        sampler = Sampler(data_navigator, modeler)
        samples = sampler.sample_all()
        assert 'table_name' in samples
Example #8
class TestModeler(TestCase):
    def setUp(self):
        """Set up test fixtures, if any."""
        dl = CSVDataLoader('tests/data/meta.json')
        self.dn = dl.load_data()
        self.dn.transform_data()
        self.modeler = Modeler(self.dn)

    @patch('sdv.modeler.Modeler.flatten_model')
    @patch('sdv.modeler.Modeler.fit_model')
    @patch('sdv.modeler.Modeler.impute_table')
    def test__create_extension(self, impute_mock, fit_mock, flatten_mock):
        """Tests that the create extension method returns correct parameters."""
        # Setup
        data_navigator = MagicMock()
        modeler = Modeler(data_navigator)
        table = pd.DataFrame({
            'foreign': [0, 1, 0, 1, 0, 1],
            'a': [0, 1, 0, 1, 0, 1],
            'b': [1, 2, 3, 4, 5, 6]
        })
        foreign = table[table.a == 0]
        table_info = ('foreign', 'child')

        impute_mock.return_value = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})

        fit_mock.return_value = 'fitted model'

        flatten_mock.return_value = pd.Series({
            'covariance__0__0':
            0.0,
            'covariance__1__0':
            0.0,
            'covariance__1__1':
            1.4999999999999991,
            'distribs__a__mean':
            0.0,
            'distribs__a__std':
            0.001,
            'distribs__b__mean':
            3.0,
            'distribs__b__std':
            1.632993161855452
        })

        # Run
        result = modeler._create_extension(foreign, table, table_info)

        # Check
        assert result.equals(flatten_mock.return_value)

        df = pd.DataFrame({'a': [0, 1, 0, 1, 0, 1], 'b': [1, 2, 3, 4, 5, 6]})
        df = df.loc[foreign.index]

        assert len(impute_mock.call_args_list)
        call_args = impute_mock.call_args_list[0]
        assert len(call_args[0]) == 1
        assert call_args[0][0].equals(df)
        assert call_args[1] == {}

        fit_mock.assert_called_once_with(impute_mock.return_value)
        flatten_mock.assert_called_once_with('fitted model', 'child')

    def test__create_extension_wrong_index_return_none(self):
        """_create_extension return None if transformed_child_table can't be indexed by df."""
        # Setup
        data_navigator = MagicMock()
        modeler = Modeler(data_navigator)
        transformed_child_table = pd.DataFrame(np.eye(3),
                                               columns=['A', 'B', 'C'])
        table_info = ('', '')
        df = pd.DataFrame(index=range(5, 10))

        # Run
        result = modeler._create_extension(df, transformed_child_table,
                                           table_info)

        # Check
        assert result is None

    @patch('sdv.modeler.Modeler._create_extension')
    @patch('sdv.modeler.Modeler.get_foreign_key')
    def test__get_extensions(self, get_foreign_mock, extension_mock):
        """_get_extensions return the conditional modelling parameters for each children."""
        # Setup
        data_navigator = MagicMock()

        first_table_data = pd.DataFrame({'foreign_key': [0, 1]})
        first_table_meta = {'fields': []}

        data_navigator.tables = {
            'first_children': Table(first_table_data, first_table_meta),
            'second_children': Table(first_table_data, first_table_meta),
        }
        data_navigator.get_children.return_value = {}
        modeler = Modeler(data_navigator)
        modeler.tables = {}

        extension_mock.side_effect = lambda x, y, z: None

        get_foreign_mock.return_value = 'foreign_key'

        pk = 'primary_key'
        children = ['first_children', 'second_children']

        expected_result = [
            pd.DataFrame([{
                '__first_children_column_1': 1,
                '__first_children_column_2': 2
            }]),
            pd.DataFrame([{
                '__second_children_column_1': 1,
                '__second_children_column_2': 2
            }])
        ]

        # Run
        result = modeler._get_extensions(pk, children)

        # Check
        assert all([
            result[index].equals(expected_result[index])
            for index in range(len(result))
        ])

    def test_get_extensions_no_children(self):
        """_get_extensions return an empty list if children is empty."""
        # Setup
        pk = 'primary_key'
        children = {}

        expected_result = []

        # Run
        result = self.modeler._get_extensions(pk, children)

        # Check
        assert result == expected_result

    def test_CPA(self):
        """CPA will append extensions to the original table."""
        # Setup
        self.modeler.model_database()
        table_name = 'DEMO_CUSTOMERS'

        # Run
        self.modeler.CPA(table_name)

        # Check
        for name, table in self.modeler.tables.items():
            with self.subTest(table=name):
                raw_table = self.modeler.dn.tables[name].data

                # When we run Conditional Parameter Aggregation we add a key on Modeler.tables
                # for each table. It contains a not null pandas DataFrame with the computed
                # extension.
                assert isinstance(table, pd.DataFrame)

                assert raw_table.shape[0] == table.shape[0]
                assert (raw_table.index == table.index).all()
                assert all(
                    [column in table.columns for column in raw_table.columns])

    @patch('sdv.modeler.Modeler._get_extensions')
    def test_CPA_transformed_index(self, extension_mock):
        """CPA is able to merge extensions in tables with transformed index. """
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator)

        # Setup - Mock
        parent_data = pd.DataFrame([
            {
                'parent_id': 'A',
                'values': 1
            },
            {
                'parent_id': 'B',
                'values': 2
            },
            {
                'parent_id': 'C',
                'values': 3
            },
        ])
        parent_meta = {
            'name': 'parent',
            'primary_key': 'parent_id',
            'fields': {
                'parent_id': {
                    'name': 'parent_id',
                    'type': 'categorical',
                    'regex': '^[A-Z]$'
                },
                'values': {
                    'name': 'values',
                    'type': 'number',
                    'subtype': 'integer'
                }
            }
        }

        child_data = pd.DataFrame([
            {
                'child_id': 1,
                'parent_id': 'A',
                'value': 0.1
            },
            {
                'child_id': 2,
                'parent_id': 'A',
                'value': 0.2
            },
            {
                'child_id': 3,
                'parent_id': 'A',
                'value': 0.3
            },
            {
                'child_id': 4,
                'parent_id': 'B',
                'value': 0.4
            },
            {
                'child_id': 5,
                'parent_id': 'B',
                'value': 0.5
            },
            {
                'child_id': 6,
                'parent_id': 'B',
                'value': 0.6
            },
            {
                'child_id': 7,
                'parent_id': 'C',
                'value': 0.7
            },
            {
                'child_id': 8,
                'parent_id': 'C',
                'value': 0.8
            },
            {
                'child_id': 9,
                'parent_id': 'C',
                'value': 0.9
            },
        ])
        child_meta = {
            'name': 'child',
            'primary_key': 'child_id',
            'fields': {
                'child_id': {
                    'name': 'child_id',
                    'type': 'number'
                },
                'parent_id': {
                    'name': 'parent_id',
                    'type': 'category',
                    'ref': {
                        'table': 'parent',
                        'field': 'parent_id'
                    }
                },
                'value': {
                    'name': 'value',
                    'type': 'number'
                }
            }
        }

        data_navigator.tables = {
            'parent': Table(parent_data, parent_meta),
            'child': Table(child_data, child_meta)
        }

        children_map = {'parent': {'child'}}
        parent_map = {'child': {'parent'}}

        data_navigator.get_children.side_effect = lambda x: children_map.get(
            x, set())
        data_navigator.get_parents.side_effect = lambda x: parent_map.get(
            x, set())

        transformed_parent = pd.DataFrame([
            {
                'parent_id': 0.1,
                'values': 1
            },
            {
                'parent_id': 0.4,
                'values': 2
            },
            {
                'parent_id': 0.8,
                'values': 3
            },
        ])
        transformed_child = pd.DataFrame([
            {
                'child_id': 1,
                'parent_id': 0.15,
                'value': 0.1
            },
            {
                'child_id': 2,
                'parent_id': 0.10,
                'value': 0.2
            },
            {
                'child_id': 3,
                'parent_id': 0.20,
                'value': 0.3
            },
            {
                'child_id': 4,
                'parent_id': 0.35,
                'value': 0.4
            },
            {
                'child_id': 5,
                'parent_id': 0.50,
                'value': 0.5
            },
            {
                'child_id': 6,
                'parent_id': 0.55,
                'value': 0.6
            },
            {
                'child_id': 7,
                'parent_id': 0.70,
                'value': 0.7
            },
            {
                'child_id': 8,
                'parent_id': 0.80,
                'value': 0.8
            },
            {
                'child_id': 9,
                'parent_id': 0.85,
                'value': 0.9
            },
        ])

        data_navigator.transformed_data = {
            'parent': transformed_parent,
            'child': transformed_child
        }
        extension = pd.DataFrame(
            **{
                'data': [
                    {
                        'param_1': 0.5,
                        'param_2': 0.4
                    },
                    {
                        'param_1': 0.7,
                        'param_2': 0.2
                    },
                    {
                        'param_1': 0.2,
                        'param_2': 0.1
                    },
                ],
                'index':
                list('ABC')
            })
        extension.index.name = 'parent_id'
        extension_mock.return_value = [extension]

        expected_extended_parent = pd.DataFrame(
            [
                {
                    'parent_id': 0.1,
                    'values': 1,
                    'param_1': 0.5,
                    'param_2': 0.4
                },
                {
                    'parent_id': 0.4,
                    'values': 2,
                    'param_1': 0.7,
                    'param_2': 0.2
                },
                {
                    'parent_id': 0.8,
                    'values': 3,
                    'param_1': 0.2,
                    'param_2': 0.1
                },
            ],
            columns=['parent_id', 'values', 'param_1', 'param_2'])

        # Run
        modeler.CPA('parent')

        # Check
        assert 'parent' in modeler.tables
        assert modeler.tables['parent'].equals(expected_extended_parent)

        data_navigator.get_children.assert_called_once_with('parent')
        extension_mock.assert_called_once_with('parent_id', {'child'})

    def test_flatten_model(self):
        """flatten_model returns a pandas.Series with all the params to recreate a model."""
        # Setup
        model = GaussianMultivariate()
        X = np.eye(3)
        model.fit(X)

        expected_result = pd.Series({
            'covariance__0__0': 1.5000000000000004,
            'covariance__1__0': -0.7500000000000003,
            'covariance__1__1': 1.5000000000000004,
            'covariance__2__0': -0.7500000000000003,
            'covariance__2__1': -0.7500000000000003,
            'covariance__2__2': 1.5000000000000007,
            'distribs__0__mean': 0.33333333333333331,
            'distribs__0__std': -0.7520386983881371,
            'distribs__1__mean': 0.33333333333333331,
            'distribs__1__std': -0.7520386983881371,
            'distribs__2__mean': 0.33333333333333331,
            'distribs__2__std': -0.7520386983881371,
        })
        data_navigator = MagicMock()
        modeler = Modeler(data_navigator)

        # Run
        result = modeler.flatten_model(model)

        # Check
        assert np.isclose(result, expected_result).all()

    def test_impute_table_with_mean(self):
        """impute_table fills all NaN values the mean of values when possible."""
        # Setup
        table = pd.DataFrame([
            {
                'A': np.nan,
                'B': 2.,
                'C': 4.
            },
            {
                'A': 4.,
                'B': np.nan,
                'C': 2.
            },
            {
                'A': 2.,
                'B': 4.,
                'C': np.nan
            },
        ])
        expected_result = pd.DataFrame([
            {
                'A': 3.,
                'B': 2.,
                'C': 4.
            },
            {
                'A': 4.,
                'B': 3.,
                'C': 2.
            },
            {
                'A': 2.,
                'B': 4.,
                'C': 3.
            },
        ])

        # Run
        result = self.modeler.impute_table(table)

        # Check
        assert result.equals(expected_result)

        # No null values are left
        assert not result.isnull().any().any()

        # NaN values were replaced with the column mean, not with zero
        for column in result:
            assert 0 not in result[column].values

    def test_impute_table_with_mean_default(self):
        """If a column only has NaN, impute_table fills it with 0.(+EPSILON).

        If a column has no mean (all values are null), then the NaN values are replaced with 0.
        Then, it will transform like a constant column, adding copulas.EPSILON at the
        first element.
        """
        # Setup
        table = pd.DataFrame([
            {
                'A': np.nan,
                'B': 2.,
                'C': 2.
            },
            {
                'A': np.nan,
                'B': 3.,
                'C': 3.
            },
            {
                'A': np.nan,
                'B': 4.,
                'C': 4.
            },
        ])
        expected_result = pd.DataFrame([
            {
                'A': EPSILON,
                'B': 2.,
                'C': 2.
            },
            {
                'A': 0.,
                'B': 3.,
                'C': 3.
            },
            {
                'A': 0.,
                'B': 4.,
                'C': 4.
            },
        ])

        # Run
        result = self.modeler.impute_table(table)

        # Check
        assert result.equals(expected_result)

        # No null values are left
        assert not result.isnull().any().any()

    def test_impute_table_constant_column(self):
        """impute_table adds EPSILON at the first element of a constant column."""
        # Setup
        table = pd.DataFrame([
            {
                'A': np.nan,
                'B': 10.,
                'C': 20.
            },
            {
                'A': 5.,
                'B': np.nan,
                'C': 20.
            },
            {
                'A': 5.,
                'B': 10.,
                'C': np.nan
            },
        ])
        expected_result = pd.DataFrame([
            {
                'A': 5. + EPSILON,
                'B': 10. + EPSILON,
                'C': 20. + EPSILON
            },
            {
                'A': 5.,
                'B': 10.,
                'C': 20.
            },
            {
                'A': 5.,
                'B': 10.,
                'C': 20.
            },
        ])

        # Run
        result = self.modeler.impute_table(table)

        # Check
        assert result.equals(expected_result)

        # No null values are left
        assert not result.isnull().any().any()

    def test_get_foreign_key(self):
        """get_foreign_key returns the foreign key from a metadata and a primary key."""
        # Setup
        fields = self.modeler.dn.get_meta_data('DEMO_ORDERS')['fields']
        primary = 'CUSTOMER_ID'
        expected_result = 'CUSTOMER_ID'

        # Run
        result = self.modeler.get_foreign_key(fields, primary)

        # Check
        assert result == expected_result

    def test_fit_model_distribution_arg(self):
        """fit_model will pass self.distribution FQN to modeler."""
        # Setup
        model_mock = MagicMock()
        model_mock.__eq__.return_value = True
        model_mock.__ne__.return_value = False
        modeler = Modeler(data_navigator='navigator',
                          model=model_mock,
                          distribution=KDEUnivariate)
        data = pd.DataFrame({
            'column': [0, 1, 1, 1, 0],
        })

        # Run
        modeler.fit_model(data)

        # Check
        model_mock.assert_called_once_with(
            distribution='copulas.univariate.kde.KDEUnivariate')

    def test_model_database(self):
        """model_database computes conditions between tables and models them."""
        # Run
        self.modeler.model_database()

        # Check
        assert self.modeler.tables.keys() == self.modeler.models.keys()

    def test_model_database_gaussian_copula_single_table(self):
        """model_database can model a single table using the gausian copula model."""
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator=data_navigator,
                          model=GaussianMultivariate)

        # Setup - Mocks - DataNavigator
        table_data = pd.DataFrame({
            'column_A': list('abdc'),
            'column_B': range(4)
        })
        table_metadata = {
            'name': 'table_name',
            'fields': {
                'column_A': {
                    'name': 'column_A',
                    'type': 'categorical'
                },
                'column_B': {
                    'name': 'column_B',
                    'type': 'number',
                    'subtype': 'integer'
                }
            }
        }

        data_navigator.tables = {
            'table_name': Table(table_data, table_metadata)
        }
        data_navigator.get_parents.return_value = set()
        data_navigator.get_children.return_value = set()
        data_navigator.transformed_data = {
            'table_name':
            pd.DataFrame({
                'column_A': [0.1, 0.2, 0.5, 1.0],
                'column_B': range(4)
            })
        }
        metadata = {
            'name':
            'table_name',
            'fields': [{
                'name': 'column_A',
                'type': 'categorical'
            }, {
                'name': 'column_B',
                'type': 'number',
                'subtype': 'integer'
            }]
        }

        data_navigator.meta = {'tables': [metadata]}
        data_navigator.ht = MagicMock()
        data_navigator.ht.transformers = {
            ('table_name', 'column_A'): None,
            ('table_name', 'column_B'): None
        }

        # Run
        modeler.model_database()

        # Check
        assert 'table_name' in modeler.models

        sampler = Sampler(data_navigator, modeler)
        samples = sampler.sample_all()
        assert 'table_name' in samples

    @patch('sdv.modeler.Modeler.RCPA')
    def test_model_database_raises(self, rcpa_mock):
        """If the models raise an exception, it prints a custom message."""
        # Setup
        data_navigator = MagicMock()
        modeler = Modeler(data_navigator)

        data_navigator.tables = ['table_1', 'table_2']
        data_navigator.get_parents.return_value = False
        rcpa_mock.side_effect = ValueError('value error!')

        # Run / Check
        with self.assertRaises(ValueError):
            modeler.model_database()

    def test_model_database_kde_distribution(self):
        """model_database works fine with kde distribution."""
        # Setup
        modeler = Modeler(data_navigator=self.dn, distribution=KDEUnivariate)

        # Run
        modeler.model_database()

    @skip('s')
    def test_model_database_vine_modeler_single_table(self):
        """model_database works fine with vine modeler."""
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator=data_navigator, model=VineCopula)

        # Setup - Mock
        data = pd.DataFrame({'column_A': list('abdc'), 'column_B': range(4)})
        meta = {
            'name': 'table_name',
            'fields': {
                'column_A': {
                    'name': 'A',
                    'type': 'categorical'
                },
                'column_B': {
                    'name': 'B',
                    'type': 'number',
                    'subtype': 'integer'
                }
            }
        }

        data_navigator.tables = {'table_name': Table(data, meta)}
        data_navigator.get_parents.return_value = set()
        data_navigator.get_children.return_value = set()
        data_navigator.transformed_data = {
            'table_name':
            pd.DataFrame({
                'column_A': [0.1, 0.2, 0.5, 1.0],
                'column_B': range(4)
            })
        }
        data_navigator.meta = {'tables': [{'name': meta}]}
        data_navigator.ht = MagicMock()
        data_navigator.ht.transformers = {
            ('table_name', 'column_A'): None,
            ('table_name', 'column_B'): None
        }

        # Run
        modeler.model_database()

        # Check
        assert 'table_name' in modeler.models

        sampler = Sampler(data_navigator, modeler)
        samples = sampler.sample_all()
        assert 'table_name' in samples

    def test__flatten_dict_flat_dict(self):
        """_flatten_dict don't modify flat dicts."""
        # Setup
        nested_dict = {'a': 1, 'b': 2}
        expected_result = {'a': 1, 'b': 2}

        # Run
        result = Modeler._flatten_dict(nested_dict)

        # Check
        assert result == expected_result

    def test__flatten_dict_nested_dict(self):
        """_flatten_dict flatten nested dicts respecting the prefixes."""
        # Setup
        nested_dict = {'first_key': {'a': 1, 'b': 2}, 'second_key': {'x': 0}}

        expected_result = {
            'first_key__a': 1,
            'first_key__b': 2,
            'second_key__x': 0
        }

        # Run
        result = Modeler._flatten_dict(nested_dict)

        # Check
        assert result == expected_result

    def test__flatten_dict_missing_keys_gh_89(self):
        """flatten_dict will only ignore keys that don't have dict or list values.

        https://github.com/HDI-Project/SDV/issues/89
        """
        # Setup
        nested_dict = {
            'covariance':
            [[1.4999999999999991, 1.4999999999999991, 1.4999999999999991],
             [1.4999999999999991, 1.4999999999999991, 1.4999999999999991],
             [1.4999999999999991, 1.4999999999999991, 1.4999999999999991]],
            'distribs': {
                'type': {
                    'type': 'copulas.univariate.gaussian.GaussianUnivariate',
                    'fitted': True,
                    'mean': 4.0,
                    'std': 2.449489742783178
                },
                'distribution': {
                    'type': 'copulas.univariate.gaussian.GaussianUnivariate',
                    'fitted': True,
                    'mean': 5.0,
                    'std': 2.449489742783178
                },
                'fitted': {
                    'type': 'copulas.univariate.gaussian.GaussianUnivariate',
                    'fitted': True,
                    'mean': 6.0,
                    'std': 2.449489742783178
                }
            },
            'type':
            'copulas.multivariate.gaussian.GaussianMultivariate',
            'fitted':
            True,
            'distribution':
            'copulas.univariate.gaussian.GaussianUnivariate'
        }
        expected_result = {
            'covariance__0__0': 1.4999999999999991,
            'covariance__0__1': 1.4999999999999991,
            'covariance__0__2': 1.4999999999999991,
            'covariance__1__0': 1.4999999999999991,
            'covariance__1__1': 1.4999999999999991,
            'covariance__1__2': 1.4999999999999991,
            'covariance__2__0': 1.4999999999999991,
            'covariance__2__1': 1.4999999999999991,
            'covariance__2__2': 1.4999999999999991,
            'distribs__type__mean': 4.0,
            'distribs__type__std': 2.449489742783178,
            'distribs__distribution__mean': 5.0,
            'distribs__distribution__std': 2.449489742783178,
            'distribs__fitted__mean': 6.0,
            'distribs__fitted__std': 2.449489742783178
        }

        # Run
        result = Modeler._flatten_dict(nested_dict)

        # Check
        assert result == expected_result

    def test__flatten_array_ndarray(self):
        """_flatten_array return a dict formed from the input np.array"""
        # Setup
        nested = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        expected_result = {
            '0__0': 1,
            '0__1': 0,
            '0__2': 0,
            '1__0': 0,
            '1__1': 1,
            '1__2': 0,
            '2__0': 0,
            '2__1': 0,
            '2__2': 1
        }

        # Run
        result = Modeler._flatten_array(nested)

        # Check
        assert result == expected_result

    def test__flatten_array_list(self):
        """_flatten_array return a dict formed from the input list"""
        # Setup
        nested = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
        expected_result = {
            '0__0': 1,
            '0__1': 0,
            '0__2': 0,
            '1__0': 0,
            '1__1': 1,
            '1__2': 0,
            '2__0': 0,
            '2__1': 0,
            '2__2': 1
        }

        # Run
        result = Modeler._flatten_array(nested)

        # Check
        assert result == expected_result
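
The _flatten_dict and _flatten_array tests above pin down the flattening behaviour precisely, so equivalent helpers are easy to sketch. The functions below are a standalone reconstruction that is consistent with the expected outputs in those tests; they are not the actual sdv.modeler implementation:

import numpy as np

# Bookkeeping keys that are skipped when they hold scalar values; when their
# value is a dict or a list they are flattened like any other key (see gh-89).
IGNORED_KEYS = ('fitted', 'distribution', 'type')


def flatten_array(nested, prefix=''):
    """Flatten a list or np.ndarray into a {'i__j': value} dict."""
    result = {}
    for index, value in enumerate(nested):
        key = '{}__{}'.format(prefix, index) if prefix else str(index)
        if isinstance(value, (list, np.ndarray)):
            result.update(flatten_array(value, prefix=key))
        else:
            result[key] = value
    return result


def flatten_dict(nested, prefix=''):
    """Flatten nested dicts and arrays, joining keys with '__'."""
    result = {}
    for key, value in nested.items():
        full_key = '{}__{}'.format(prefix, key) if prefix else key
        if key in IGNORED_KEYS and not isinstance(value, (dict, list)):
            continue
        if isinstance(value, dict):
            result.update(flatten_dict(value, prefix=full_key))
        elif isinstance(value, (list, np.ndarray)):
            result.update(flatten_array(value, prefix=full_key))
        else:
            result[full_key] = value
    return result

Run against the nested_dict from the gh-89 test, flatten_dict produces exactly the expected_result dictionary shown there.
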
Example #9
File: sdv.py  Project: ush19/SDV
class SDV:
    """Class to do modeling and sampling all in one.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str):
        model (type): Class of model to use.
        distribution (type): Class of distribution to use. Will be deprecated shortly.
        model_kwargs (dict): Keyword arguments to pass to model.

    """
    def __init__(self,
                 meta_file_name,
                 data_loader_type='csv',
                 model=DEFAULT_MODEL,
                 distribution=None,
                 model_kwargs=None):
        self.meta_file_name = meta_file_name
        self.sampler = None
        self.model = model
        self.distribution = distribution
        self.model_kwargs = model_kwargs

    def _check_unsupported_dataset_structure(self):
        """Checks that no table has two parents."""
        tables = self.dn.tables.keys()
        amount_parents = [
            len(self.dn.get_parents(table)) <= 1 for table in tables
        ]
        if not all(amount_parents):
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Transform the data and model the database.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()

        self._check_unsupported_dataset_structure()

        self.dn.transform_data()
        self.modeler = Modeler(data_navigator=self.dn,
                               model=self.model,
                               distribution=self.distribution,
                               model_kwargs=self.model_kwargs)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows, reset_primary_keys=False):
        """Sample `num_rows` rows from the given table.

        Args:
            table_name(str): Name of the table to sample from.
            num_rows(int): Amount of rows to sample.
            reset_primary_keys(bool): Whether or not to reset the primary key generators.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_rows(table_name,
                                        num_rows,
                                        reset_primary_keys=reset_primary_keys)

    def sample_table(self, table_name, reset_primary_keys=False):
        """Samples the given table to its original size.

        Args:
            table_name (str): Table to sample.
            reset_primary_keys(bool): Whether or not to reset the primary key generators.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_table(table_name,
                                         reset_primary_keys=reset_primary_keys)

    def sample_all(self, num_rows=5, reset_primary_keys=False):
        """Sample the whole dataset.

        Args:
            num_rows(int): Amount of rows to sample.
            reset_primary_keys(bool): Whether or not to reset the primary key generators.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_all(num_rows,
                                       reset_primary_keys=reset_primary_keys)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename(str): Path to store the file.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load(cls, filename):
        """Load a SDV instance from the given path.

        Args:
            filename(str): Path to load model.

        """
        with open(filename, 'rb') as f:
            instance = pickle.load(f)

        return instance
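
A usage sketch for this version of the class, passing explicit model and distribution classes; the import paths below come from the FQN strings used elsewhere on this page, and the metadata path is the same test fixture path used above:

from copulas.multivariate.gaussian import GaussianMultivariate
from copulas.univariate.kde import KDEUnivariate

from sdv import SDV

sdv = SDV('tests/data/meta.json',
          model=GaussianMultivariate,
          distribution=KDEUnivariate)
sdv.fit()

# reset_primary_keys=True restarts the primary key generators before sampling.
samples = sdv.sample_all(num_rows=10, reset_primary_keys=True)

sdv.save('sdv.pkl')
sdv = SDV.load('sdv.pkl')
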
Example #10
class SDV:
    """Automated generative modeling and sampling tool.

    Allows the users to generate synthetic data after creating generative models for their data.

    Args:
        model (type):
            Class of the ``copula`` to use. Defaults to
            ``sdv.models.copulas.GaussianCopula``.
        model_kwargs (dict):
            Keyword arguments to pass to the model. Defaults to ``None``.
    """

    sampler = None

    def __init__(self, model=DEFAULT_MODEL, model_kwargs=None):
        self.model = model
        if model_kwargs is None:
            self.model_kwargs = DEFAULT_MODEL_KWARGS.copy()
        else:
            self.model_kwargs = model_kwargs

    def fit(self, metadata, tables=None, root_path=None):
        """Fit this SDV instance to the dataset data.

        Args:
            metadata (dict, str or Metadata):
                Metadata dict, path to the metadata JSON file or Metadata instance itself.
            tables (dict):
                Dictionary with the table names as key and ``pandas.DataFrame`` instances as
                values.  If ``None`` is given, the tables will be loaded from the paths
                indicated in ``metadata``. Defaults to ``None``.
            root_path (str or None):
                Path to the dataset directory. If ``None`` and metadata is
                a path, the metadata location is used. If ``None`` and
                metadata is a dict, the current working directory is used.
        """

        if isinstance(metadata, Metadata):
            self.metadata = metadata
        else:
            self.metadata = Metadata(metadata, root_path)

        self.metadata.validate(tables)

        self.modeler = Modeler(self.metadata, self.model, self.model_kwargs)
        self.modeler.model_database(tables)
        self.sampler = Sampler(self.metadata, self.modeler.models, self.model,
                               self.model_kwargs)

    def sample(self,
               table_name,
               num_rows,
               sample_children=True,
               reset_primary_keys=False):
        """Sample ``num_rows`` rows from the indicated table.

        Args:
            table_name (str):
                Name of the table to sample from.
            num_rows (int):
                Amount of rows to sample.
            sample_children (bool):
                Whether or not to sample children tables. Defaults to ``True``.
            reset_primary_keys (bool):
                Whether or not to reset the primary key generators. Defaults to ``False``.

        Returns:
            pandas.DataFrame:
                Sampled data with the number of rows specified in ``num_rows``.

        Raises:
            NotFittedError:
                A ``NotFittedError`` is raised when the ``SDV`` instance has not been fitted yet.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample(table_name,
                                   num_rows,
                                   sample_children=sample_children,
                                   reset_primary_keys=reset_primary_keys)

    def sample_all(self, num_rows=5, reset_primary_keys=False):
        """Sample the entire dataset.

        Args:
            num_rows (int):
                Amount of rows to sample. Defaults to ``5``.
            reset_primary_keys (bool):
                Whether or not to reset the primary key generators. Defaults to ``False``.

        Returns:
            dict:
                Tables sampled.

        Raises:
            NotFittedError:
                A ``NotFittedError`` is raised when the ``SDV`` instance has not been fitted yet.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_all(num_rows,
                                       reset_primary_keys=reset_primary_keys)

    def save(self, path):
        """Save this SDV instance to the given path using pickle.

        Args:
            path (str):
                Path where the SDV instance will be serialized.
        """
        with open(path, 'wb') as output:
            pickle.dump(self, output)

    @classmethod
    def load(cls, path):
        """Load a SDV instance from a given path.

        Args:
            path (str):
                Path from which to load the SDV instance.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)
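
A usage sketch for the metadata-based API above; metadata.json and the users table are placeholders rather than files from this page:

import pandas as pd

from sdv import SDV

tables = {
    'users': pd.DataFrame({
        'user_id': [0, 1, 2],
        'age': [25, 32, 41],
    })
}

sdv = SDV()
sdv.fit('metadata.json', tables=tables)  # metadata may be a dict, a path or a Metadata instance

sampled_users = sdv.sample('users', num_rows=10)
all_tables = sdv.sample_all(num_rows=10)

sdv.save('sdv.pkl')
sdv = SDV.load('sdv.pkl')
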
Example #11
class SDV:
    """Class to do modeling and sampling all in one.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str)
    """
    def __init__(self, meta_file_name, data_loader_type='csv'):
        self.meta_file_name = meta_file_name
        self.sampler = None

    def _check_unsupported_dataset_structure(self):
        """Checks that no table has two parents."""
        tables = self.dn.tables.keys()
        amount_parents = [
            len(self.dn.get_parents(table)) <= 1 for table in tables
        ]
        if not all(amount_parents):
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Transform the data and model the database.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()

        self._check_unsupported_dataset_structure()

        self.dn.transform_data()
        self.modeler = Modeler(self.dn)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows):
        """Sample `num_rows` rows from the given table.

        Args:
            table_name(str): Name of the table to sample from.
            num_rows(int): Amount of rows to sample.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Samples the given table to its original size.

        Args:
            table_name (str): Table to sample.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Sample the whole dataset.

        Args:
            num_rows (int): Amount of rows to sample.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_all(num_rows)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename (str): Path to store the file.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
Example #12
    def test_model_database_vine_modeler_single_table(self):
        """model_database works fine with vine modeler."""
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator=data_navigator, model=VineCopula)

        # Setup - Mock
        data = pd.DataFrame({'column_A': list('abdc'), 'column_B': range(4)})
        meta = {
            'name': 'table_name',
            'fields': {
                'column_A': {
                    'name': 'column_A',
                    'type': 'categorical'
                },
                'column_B': {
                    'name': 'column_B',
                    'type': 'number',
                    'subtype': 'integer'
                }
            }
        }

        data_navigator.tables = {'table_name': Table(data, meta)}
        data_navigator.get_parents.return_value = set()
        data_navigator.get_children.return_value = set()
        data_navigator.transformed_data = {
            'table_name':
            pd.DataFrame({
                'column_A': [0.1, 0.2, 0.5, 1.0],
                'column_B': range(4)
            })
        }
        metadata = {
            'name':
            'table_name',
            'fields': [{
                'name': 'column_A',
                'type': 'categorical'
            }, {
                'name': 'column_B',
                'type': 'number',
                'subtype': 'integer'
            }]
        }

        data_navigator.meta = {'tables': [metadata]}

        ht = MagicMock(spec=HyperTransformer)
        ht.transformers = {
            ('table_name', 'column_A'): None,
            ('table_name', 'column_B'): None
        }

        reverse_transform_dataframe = pd.DataFrame(
            {
                'column_A': list('bcda'),
                'column_B': [1.0, 2.0, 3.0, 4.0]
            },
            columns=['column_A', 'column_B'])
        ht.reverse_transform_table.return_value = reverse_transform_dataframe

        data_navigator.ht = ht

        # Run
        modeler.model_database()

        # Check
        assert len(modeler.models) == 1
        model = modeler.models['table_name']
        assert isinstance(model, VineCopula)
        assert model.fitted is True

        assert data_navigator.get_parents.call_args_list == [
            (('table_name', ), )
        ]
        assert data_navigator.get_children.call_args_list == [
            (('table_name', ), ), (('table_name', ), )
        ]
        assert modeler.tables['table_name'].equals(
            modeler.dn.transformed_data['table_name'])
Example #13
    def test_model_database_kde_distribution(self):
        """model_database works fine with kde distribution."""
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator=data_navigator,
                          distribution=KDEUnivariate)

        # Setup - Mocks - DataNavigator
        table_data = pd.DataFrame({
            'column_A': list('abdc'),
            'column_B': range(4)
        })
        table_metadata = {
            'name': 'table_name',
            'fields': {
                'column_A': {
                    'name': 'column_A',
                    'type': 'categorical'
                },
                'column_B': {
                    'name': 'column_B',
                    'type': 'number',
                    'subtype': 'integer'
                }
            }
        }

        data_navigator.tables = {
            'table_name': Table(table_data, table_metadata)
        }
        data_navigator.get_parents.return_value = set()
        data_navigator.get_children.return_value = set()
        data_navigator.transformed_data = {
            'table_name':
            pd.DataFrame({
                'column_A': [0.1, 0.2, 0.5, 1.0],
                'column_B': range(4)
            })
        }
        metadata = {
            'name':
            'table_name',
            'fields': [{
                'name': 'column_A',
                'type': 'categorical'
            }, {
                'name': 'column_B',
                'type': 'number',
                'subtype': 'integer'
            }]
        }

        data_navigator.meta = {'tables': [metadata]}
        ht = MagicMock(spec=HyperTransformer)
        ht.transformers = {
            ('table_name', 'column_A'): None,
            ('table_name', 'column_B'): None
        }

        reverse_transform_dataframe = pd.DataFrame(
            {
                'column_A': list('bcda'),
                'column_B': [1.0, 2.0, 3.0, 4.0]
            },
            columns=['column_A', 'column_B'])
        ht.reverse_transform_table.return_value = reverse_transform_dataframe

        data_navigator.ht = ht

        # Run
        modeler.model_database()

        # Check
        assert len(modeler.models) == 1
        assert 'table_name' in modeler.models
        model = modeler.models['table_name']

        assert isinstance(model, GaussianMultivariate)
        assert model.distribution == 'copulas.univariate.kde.KDEUnivariate'
        assert model.fitted is True

        assert data_navigator.get_parents.call_args_list == [
            (('table_name', ), )
        ]
        assert data_navigator.get_children.call_args_list == [
            (('table_name', ), ), (('table_name', ), )
        ]
Example #14
class ModelerTest(TestCase):
    def setUp(self):
        """Set up test fixtures, if any."""
        dl = CSVDataLoader('tests/data/meta.json')
        self.dn = dl.load_data()
        self.dn.transform_data()
        self.modeler = Modeler(self.dn)

    def test__create_extension(self):
        """Tests that the create extension method returns correct parameters."""
        # Setup
        child_table = self.dn.get_data('DEMO_ORDERS')
        user = child_table[child_table['CUSTOMER_ID'] == 50]
        expected = pd.Series([
            1.500000e+00, 0.000000e+00, -1.269991e+00, 0.000000e+00,
            0.000000e+00, 0.000000e+00, -1.269991e+00, 0.000000e+00,
            1.500000e+00, 0.000000e+00, 0.000000e+00, -7.401487e-17,
            1.000000e+00, 7.000000e+00, 2.449490e+00, 4.000000e+00,
            5.000000e+01, 5.000000e+01, 1.000000e-03, 5.000000e+01,
            7.300000e+02, 2.380000e+03, 7.618545e+02, 1.806667e+03
        ])

        # Run
        parameters = self.modeler._create_extension(user, child_table)

        # Check
        assert (expected.subtract(parameters).abs() < 10E-3).all()

    def test__get_extensions(self):
        """_get_extensions returns a works for table with child"""
        # Setup
        pk = 'ORDER_ID'
        table = 'DEMO_ORDERS'
        children = self.dn.get_children(table)

        # Run
        result = self.modeler._get_extensions(pk, children, table)

        # Check
        assert len(result) == 1
        assert result[0].shape == (10, 35)

    def test_get_extensions_no_children(self):
        """Tests that get extensions works for table with no children."""
        # Setup
        pk = 'ORDER_ITEM_ID'
        table = 'DEMO_ORDER_ITEMS'
        children = self.dn.get_children(table)
        expected_result = []

        # Run
        result = self.modeler._get_extensions(pk, children, table)

        # Check
        assert result == expected_result

    def test_CPA(self):
        """ """
        # Setup
        self.modeler.model_database()
        table_name = 'DEMO_CUSTOMERS'

        # Run
        self.modeler.CPA(table_name)

        # Check
        for name, table in self.modeler.tables.items():
            with self.subTest(table=name):
                raw_table = self.modeler.dn.tables[name].data

                # When we run Conditional Parameter Aggregation we add a key on Modeler.tables
                # for each table. It contains a not null pandas DataFrame with the computed
                # extension.
                assert isinstance(table, pd.DataFrame)

                assert raw_table.shape[0] == table.shape[0]
                assert (raw_table.index == table.index).all()
                assert all(
                    [column in table.columns for column in raw_table.columns])

    def test_flatten_model(self):
        """flatten_model returns a pandas.Series with all the params to recreate a model."""
        # Setup
        for data in self.dn.transformed_data.values():
            num_columns = data.shape[1]
            model = self.modeler.model()
            model.fit(data)

            # We generate it this way because RDT behavior is not fully deterministic
            # and transformed data can change between test runs.
            distribs_values = np.array(
                [[col_model.std, col_model.mean]
                 for col_model in model.distribs.values()]).flatten()

            expected_result = pd.Series(
                list(model.covariance.flatten()) + list(distribs_values))

            # Run
            result = self.modeler.flatten_model(model)

            # Check
            assert (result == expected_result).all()
            assert len(result) == num_columns**2 + (2 * num_columns)

    def test_impute_table(self):
        """impute_table fills all NaN values with 0 or the mean of values."""
        # Setup
        table = pd.DataFrame([
            {
                'A': np.nan,
                'B': 10.,
                'C': 20.
            },
            {
                'A': 5.,
                'B': np.nan,
                'C': 20.
            },
            {
                'A': 5.,
                'B': 10.,
                'C': np.nan
            },
        ])
        expected_result = pd.DataFrame([
            {
                'A': 5.,
                'B': 10.,
                'C': 20.
            },
            {
                'A': 5.,
                'B': 10.,
                'C': 20.
            },
            {
                'A': 5.,
                'B': 10.,
                'C': 20.
            },
        ])

        # Run
        result = self.modeler.impute_table(table)

        # Check
        assert result.equals(expected_result)

        # No null values are left
        assert not result.isnull().any().any()

        # NaN values were replaced with the column mean, not with zero
        for column in result:
            assert 0 not in result[column].values

    def test_model_database(self):
        """model_database computes conditions between tables and models them."""

        # Run
        self.modeler.model_database()

        # Check
        assert self.modeler.tables.keys() == self.modeler.models.keys()

    def test_get_foreign_key(self):
        """get_foreign_key returns the foreign key from a metadata and a primary key."""
        # Setup
        fields = self.modeler.dn.get_meta_data('DEMO_ORDERS')['fields']
        primary = 'CUSTOMER_ID'
        expected_result = 'CUSTOMER_ID'

        # Run
        result = self.modeler.get_foreign_key(fields, primary)

        # Check
        assert result == expected_result
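
Based on the test above, get_foreign_key scans the table's field metadata for the field whose 'ref' entry points at the given primary key and returns that field's name. Below is a standalone sketch of that behaviour (not the actual SDV implementation), using the metadata layout shown elsewhere on this page:

def get_foreign_key(fields, primary):
    """Return the name of the field whose 'ref' points at ``primary``."""
    for field in fields.values():
        ref = field.get('ref')
        if ref and ref['field'] == primary:
            return field['name']


fields = {
    'ORDER_ID': {'name': 'ORDER_ID', 'type': 'number'},
    'CUSTOMER_ID': {
        'name': 'CUSTOMER_ID',
        'type': 'number',
        'ref': {'table': 'DEMO_CUSTOMERS', 'field': 'CUSTOMER_ID'},
    },
}
assert get_foreign_key(fields, 'CUSTOMER_ID') == 'CUSTOMER_ID'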