示例#1
0
文件: test_sampler.py 项目: ush19/SDV
    def test__get_missing_valid_rows_excess_rows(self):
        """Surplus rows are trimmed so the result holds exactly ``num_rows`` rows.

        Four synthesized rows are combined with two previously valid rows while
        only five are requested, so no rows are missing and the returned frame
        is indexed from 0 to 4.
        """
        # Setup
        navigator = MagicMock(spec=DataNavigator)
        model_builder = MagicMock(spec=Modeler)
        sampler = Sampler(navigator, model_builder)

        column_names = ['A', 'B']
        synthesized_index = range(3, 7)
        synthesized = pd.DataFrame(columns=column_names, index=synthesized_index)
        drop_indices = pd.Series(False, index=synthesized_index)
        previous_valid = pd.DataFrame(columns=column_names, index=range(2))

        # Run
        missing_rows, valid_rows = sampler._get_missing_valid_rows(
            synthesized, drop_indices, previous_valid, 5)

        # Check
        expected_rows = pd.DataFrame(columns=column_names, index=range(5))
        assert missing_rows == 0
        assert valid_rows.equals(expected_rows)

        # Neither collaborator should have been touched by the helper.
        for collaborator in (navigator, model_builder):
            collaborator.assert_not_called()
            assert collaborator.method_calls == []
示例#2
0
    def fit(self, metadata, tables=None, root_path=None):
        """Fit this SDV instance to the given dataset.

        Stores the metadata, validates it against ``tables``, models the
        database and creates the sampler from the resulting models.

        Args:
            metadata (dict, str or Metadata):
                Metadata dict, path to the metadata JSON file or a ``Metadata``
                instance. Anything that is not already a ``Metadata`` instance
                is wrapped in one, together with ``root_path``.
            tables (dict):
                Dictionary with the table names as keys and ``pandas.DataFrame``
                instances as values. If ``None`` is given, the tables will be
                loaded from the paths indicated in ``metadata``.
                Defaults to ``None``.
            root_path (str or None):
                Path to the dataset directory. If ``None`` and metadata is
                a path, the metadata location is used. If ``None`` and
                metadata is a dict, the current working directory is used.
        """
        if not isinstance(metadata, Metadata):
            metadata = Metadata(metadata, root_path)

        self.metadata = metadata
        self.metadata.validate(tables)

        self.modeler = Modeler(self.metadata, self.model, self.model_kwargs)
        self.modeler.model_database(tables)

        fitted_models = self.modeler.models
        self.sampler = Sampler(self.metadata, fitted_models, self.model, self.model_kwargs)
示例#3
0
文件: test_sampler.py 项目: Aylr/SDV
    def test__unflatten_dict_child_name(self):
        """Keys that embed a child table name are kept whole when unflattening.

        The data navigator reports ``CHILD_TABLE`` as a child of ``TABLE_NAME``
        and must be asked for the children exactly once.
        """
        # Setup
        navigator = MagicMock()
        navigator.get_children.return_value = ['CHILD_TABLE']
        modeler = MagicMock()
        sampler = Sampler(navigator, modeler)

        flat = {
            'first_key__a': 1,
            'first_key____CHILD_TABLE__model_param': 0,
            'distribs____CHILD_TABLE__distribs__UNIT_PRICE__std__mean': 0
        }

        expected_first_key = {'a': 1, '__CHILD_TABLE': {'model_param': 0}}
        expected_distribs = {'__CHILD_TABLE__distribs__UNIT_PRICE__std': {'mean': 0}}
        expected = {'first_key': expected_first_key, 'distribs': expected_distribs}

        # Run
        unflattened = sampler._unflatten_dict(flat, 'TABLE_NAME')

        # Check
        assert unflattened == expected
        modeler.assert_not_called()
        navigator.get_children.assert_called_once_with('TABLE_NAME')
示例#4
0
文件: test_sampler.py 项目: Aylr/SDV
    def test_sample_all(self, rows_mock, child_mock, reset_mock, concat_mock):
        """sample_all samples only the table without parents and returns the reset result.

        ``TABLE_A`` is the only table for which ``get_parents`` is falsy, so the
        rows mock is expected to be called five times with one row each, and the
        concatenated frame is what gets passed to the reset mock.
        """
        # Setup
        def has_parents(table_name):
            return table_name != 'TABLE_A'

        def fake_dataframe(name, number):
            return pd.DataFrame([{name: 0}] * number, index=[0] * number)

        navigator = MagicMock()
        navigator.tables = ['TABLE_A', 'TABLE_B']
        navigator.get_parents.side_effect = has_parents
        sampler = Sampler(navigator, MagicMock())

        rows_mock.side_effect = fake_dataframe
        concat_mock.return_value = 'concatenated_dataframe'

        expected_parent_lookups = [(('TABLE_A',), {}), (('TABLE_B',), {})]
        expected_row_requests = [(('TABLE_A', 1), {})] * 5

        # Run
        result = sampler.sample_all(num_rows=5)

        # Check
        assert navigator.get_parents.call_args_list == expected_parent_lookups
        assert result == reset_mock.return_value

        assert rows_mock.call_args_list == expected_row_requests
        assert child_mock.call_count == 5
        reset_mock.assert_called_once_with({'TABLE_A': 'concatenated_dataframe'})
示例#5
0
文件: test_sampler.py 项目: ush19/SDV
    def test__sample_model(self, qualified_mock):
        """_sample_model wraps the model's samples in a DataFrame with the given columns.

        The model is asked once for ``num_rows`` samples and its qualified name
        is looked up once.
        """
        # Setup
        sampler = Sampler(MagicMock(spec=DataNavigator), MagicMock(spec=Modeler))

        qualified_mock.return_value = 'package.module.full_qualified_name'

        sampled_values = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
        model = MagicMock()
        model.sample.return_value = sampled_values

        column_names = ['A', 'B', 'C']
        expected = pd.DataFrame(sampled_values, columns=column_names)

        # Run
        result = sampler._sample_model(model, 3, column_names)

        # Check
        assert result.equals(expected)

        qualified_mock.assert_called_once_with(model)
        model.sample.assert_called_once_with(3)
示例#6
0
文件: test_sampler.py 项目: ush19/SDV
    def test__unflatten_dict_respect_covariance_matrix(self):
        """A flattened 40x40 covariance matrix is rebuilt as a square list of lists."""
        # Setup
        sampler = Sampler(MagicMock(), MagicMock())
        size = 40

        def cell(row, column):
            return '{}, {}'.format(row, column)

        # Flat keys are generated row by row, one entry per matrix cell.
        flat = {}
        for row in range(size):
            for column in range(size):
                flat['covariance__{}__{}'.format(row, column)] = cell(row, column)

        matrix = np.array([
            [cell(row, column) for column in range(size)]
            for row in range(size)
        ])
        expected = {'covariance': matrix.tolist()}

        # Run
        unflattened = sampler._unflatten_dict(flat)

        # Check
        assert unflattened == expected
示例#7
0
文件: test_sampler.py 项目: ush19/SDV
    def test_sample_table(self, rows_mock):
        """sample_table samples as many rows as the table data has and returns them.

        The rows mock must be called once with the first element of the table's
        ``data.shape`` as the number of rows and ``sample_children=False``; the
        entry for the table in its return value is the result.
        """
        # Setup
        table_mock = MagicMock()
        table_mock.data.shape = ('rows', 'columns')

        navigator = MagicMock(spec=DataNavigator)
        navigator.tables = {'table': table_mock}
        modeler = MagicMock(spec=Modeler)
        sampler = Sampler(data_navigator=navigator, modeler=modeler)

        rows_mock.return_value = {'table': 'samples'}

        # Run
        result = sampler.sample_table('table', reset_primary_keys=False)

        # Check
        assert result == 'samples'

        rows_mock.assert_called_once_with(
            sampler, 'table', 'rows', sample_children=False, reset_primary_keys=False)
示例#8
0
文件: test_sampler.py 项目: ush19/SDV
    def test__unflatten_dict(self):
        """_unflatten_dict turns double-underscore separated keys into nested dicts."""
        # Setup
        navigator = MagicMock()
        modeler = MagicMock()
        sampler = Sampler(navigator, modeler)

        flat = {
            'a__first_key__a': 1,
            'a__first_key__b': 2,
            'b__second_key__x': 0
        }

        expected = {
            'a': {'first_key': {'a': 1, 'b': 2}},
            'b': {'second_key': {'x': 0}},
        }

        # Run
        unflattened = sampler._unflatten_dict(flat)

        # Check
        assert unflattened == expected
        navigator.assert_not_called()
        modeler.assert_not_called()
示例#9
0
文件: test_sampler.py 项目: ush19/SDV
    def test__unflatten_dict_child_name(self):
        """Keys that embed a child table name are kept whole when unflattening."""
        # Setup
        navigator = MagicMock()
        modeler = MagicMock()
        sampler = Sampler(navigator, modeler)

        flat = {
            'first_key__a__b': 1,
            'first_key____CHILD_TABLE__model_param': 0,
            'distribs____CHILD_TABLE__distribs__UNIT_PRICE__std__mean': 0
        }

        expected_first_key = {
            'a': {'b': 1},
            '__CHILD_TABLE': {'model_param': 0},
        }
        expected_distribs = {
            '__CHILD_TABLE__distribs__UNIT_PRICE__std': {'mean': 0},
        }
        expected = {'first_key': expected_first_key, 'distribs': expected_distribs}

        # Run
        unflattened = sampler._unflatten_dict(flat)

        # Check
        assert unflattened == expected
        modeler.assert_not_called()
        navigator.assert_not_called()
示例#10
0
文件: test_sampler.py 项目: ush19/SDV
    def test__sample_model_vine(self, qualified_mock):
        """_sample_model calls ``sample`` once per row when the model is a VineCopula.

        The qualified name mock reports ``copulas.multivariate.vine.VineCopula``;
        three rows are requested and ``model.sample`` is expected to be called
        three times, each call contributing one row of the resulting DataFrame.
        """
        # Setup
        sampler = Sampler(MagicMock(spec=DataNavigator), MagicMock(spec=Modeler))

        qualified_mock.return_value = 'copulas.multivariate.vine.VineCopula'

        # One array per expected call to ``model.sample``.
        rows = [
            np.array([1, 1, 1]),
            np.array([2, 2, 2]),
            np.array([3, 3, 3])
        ]
        model = MagicMock()
        model.sample.side_effect = rows

        column_names = ['A', 'B', 'C']
        expected = pd.DataFrame(rows, columns=column_names)

        # Run
        result = sampler._sample_model(model, 3, column_names)

        # Check
        assert result.equals(expected)

        qualified_mock.assert_called_once_with(model)
        assert model.sample.call_args_list == [((3, ), )] * 3
示例#11
0
文件: test_sampler.py 项目: ush19/SDV
    def test_sample_all(self, rows_mock):
        """sample_all samples only the table without parents and returns the sampled data.

        ``TABLE_A`` is the only table for which ``get_parents`` is falsy, so the
        rows mock must be called exactly once, for that table, with the
        requested number of rows.
        """
        # Setup
        def has_parents(table_name):
            return table_name != 'TABLE_A'

        def store_sample(*positional, **keyword):
            # Mimic the patched method by writing into the shared dict it is given.
            keyword['sampled_data'][positional[1]] = 'sampled_data'

        navigator = MagicMock()
        navigator.tables = ['TABLE_A', 'TABLE_B']
        navigator.get_parents.side_effect = has_parents
        sampler = Sampler(navigator, MagicMock())

        rows_mock.side_effect = store_sample

        expected_parent_lookups = [
            (('TABLE_A', ), {}),
            (('TABLE_B', ), {}),
        ]

        # Run
        result = sampler.sample_all(num_rows=5)

        # Check
        assert result == {'TABLE_A': 'sampled_data'}

        assert navigator.get_parents.call_args_list == expected_parent_lookups
        rows_mock.assert_called_once_with(
            sampler, 'TABLE_A', 5, sampled_data={'TABLE_A': 'sampled_data'})
示例#12
0
文件: test_sampler.py 项目: ush19/SDV
    def test__get_missing_valid_rows(self):
        """_get_missing_valid_rows returns an integer and a dataframe.

        The dataframe contains ``valid_rows`` concatenated with ``synthesized``,
        with its index reset. The integer is the difference between ``num_rows``
        and the number of rows in the returned dataframe.
        """
        # Setup
        navigator = MagicMock(spec=DataNavigator)
        model_builder = MagicMock(spec=Modeler)
        sampler = Sampler(navigator, model_builder)

        column_names = ['A', 'B']
        synthesized_index = range(3, 5)
        synthesized = pd.DataFrame(columns=column_names, index=synthesized_index)
        drop_indices = pd.Series(False, index=synthesized_index)
        previous_valid = pd.DataFrame(columns=column_names, index=range(2))

        # Run
        missing_rows, valid_rows = sampler._get_missing_valid_rows(
            synthesized, drop_indices, previous_valid, 5)

        # Check
        expected_rows = pd.DataFrame(columns=column_names, index=[0, 1, 2, 3])
        assert missing_rows == 1
        assert valid_rows.equals(expected_rows)

        # Neither collaborator should have been touched by the helper.
        for collaborator in (navigator, model_builder):
            collaborator.assert_not_called()
            assert collaborator.method_calls == []
示例#13
0
文件: test_sampler.py 项目: ush19/SDV
    def test_sample_rows_parent_table(self, primary_mock, parent_mock,
                                      sample_mock, update_mock, trans_mock):
        """sample_rows uses the model stored in ``modeler.models`` for a table without parents.

        The parent lookup mock returns ``None``, so the model registered for
        ``parent_table`` is the one passed on to the sampling mock.
        """
        # Setup
        navigator = MagicMock(spec=DataNavigator)
        modeler = MagicMock(spec=Modeler)
        modeler.models = {'parent_table': 'model for parent table'}
        sampler = Sampler(data_navigator=navigator, modeler=modeler)

        sampled_frame = pd.DataFrame()
        primary_mock.return_value = ('primary_key', pd.Series(range(5)))
        parent_mock.return_value = None
        sample_mock.return_value = sampled_frame
        update_mock.return_value = {'table_name': 'samples'}
        trans_mock.return_value = 'transformed rows'

        # Run
        result = sampler.sample_rows('parent_table', 5)

        # Check
        assert result == {'parent_table': 'transformed rows'}
        assert sampler.sampled == {'table_name': 'samples'}

        primary_mock.assert_called_once_with(sampler, 'parent_table', 5)
        parent_mock.assert_called_once_with(sampler, 'parent_table')
        sample_mock.assert_called_once_with(
            sampler, 'model for parent table', 5, 'parent_table')

        update_mock.assert_called_once_with(
            {}, 'parent_table', ('primary_key', sampled_frame))
        trans_mock.assert_called_once_with(sampler, sampled_frame, 'parent_table')
示例#14
0
    def test__get_model(self):
        """_get_model unflattens the extension and rebuilds the model from the result.

        The unflattened extension, extended with ``fitted`` and the table model's
        ``distribution``, is passed to ``_unflatten_gaussian_copula``, and that
        output is what ``from_dict`` receives.
        """
        # Setup
        distribution = 'copulas.multivariate.gaussian.GaussianMultivariate'

        sampler = Mock(spec=Sampler)
        sampler._unflatten_dict.return_value = {'unflatten': 'dict'}
        sampler._unflatten_gaussian_copula.return_value = {'unflatten': 'gaussian'}

        table_model = Mock()
        table_model.to_dict.return_value = {'distribution': distribution}

        # Run
        Sampler._get_model(sampler, {'extension': 'dict'}, table_model)

        # Asserts
        sampler._unflatten_dict.assert_called_once_with({'extension': 'dict'})

        sampler._unflatten_gaussian_copula.assert_called_once_with({
            'unflatten': 'dict',
            'fitted': True,
            'distribution': distribution,
        })

        table_model.from_dict.assert_called_once_with({'unflatten': 'gaussian'})
示例#15
0
    def test_model_database_vine_modeler_single_table(self):
        """A single table can be modeled with VineCopula and then sampled.

        Only the presence of ``table_name`` in the fitted models and in the
        sampled output is checked.
        """
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator=data_navigator, model=VineCopula)

        # Setup - Mock
        field_a = {'name': 'A', 'type': 'categorical'}
        field_b = {'name': 'B', 'type': 'number', 'subtype': 'integer'}
        table_meta = {
            'name': 'table_name',
            'fields': {'column_A': field_a, 'column_B': field_b},
        }
        raw_data = pd.DataFrame({'column_A': list('abdc'), 'column_B': range(4)})
        transformed = pd.DataFrame({
            'column_A': [0.1, 0.2, 0.5, 1.0],
            'column_B': range(4),
        })

        data_navigator.tables = {'table_name': Table(raw_data, table_meta)}
        data_navigator.get_parents.return_value = set()
        data_navigator.get_children.return_value = set()
        data_navigator.transformed_data = {'table_name': transformed}
        data_navigator.meta = {'tables': [{'name': table_meta}]}
        data_navigator.ht = MagicMock()
        data_navigator.ht.transformers = {
            ('table_name', column): None
            for column in ('column_A', 'column_B')
        }

        # Run
        modeler.model_database()

        # Check
        assert 'table_name' in modeler.models

        samples = Sampler(data_navigator, modeler).sample_all()
        assert 'table_name' in samples
示例#16
0
    def test__sample_children(self):
        """_sample_children calls _sample_table once per child table and parent row.

        With three child tables and three parent rows, nine calls are expected,
        grouped by child table and, within each child, in parent-row order.
        """
        # Setup
        children = ['child A', 'child B', 'child C']
        field_values = [11, 22, 33]

        sampler = Mock(spec=Sampler)
        sampler.metadata.get_children.return_value = list(children)

        # Run
        sampled = {'test': pd.DataFrame({'field': field_values})}
        Sampler._sample_children(sampler, 'test', sampled)

        # Asserts
        sampler.metadata.get_children.assert_called_once_with('test')

        expected_calls = [
            [child, 'test', pd.Series([value], index=['field'], name=position), sampled]
            for child in children
            for position, value in enumerate(field_values)
        ]
        actual_calls = sampler._sample_table.call_args_list

        # zip() stops at the shorter sequence, so without this check the loop
        # below would pass vacuously if _sample_table were called too few times
        # (or not at all), and would ignore any extra calls.
        assert len(actual_calls) == len(expected_calls)

        for result_call, expected_call in zip(actual_calls, expected_calls):
            positional = result_call[0]
            assert positional[0] == expected_call[0]
            assert positional[1] == expected_call[1]
            assert positional[3] == expected_call[3]
            # Series cannot be compared with ``==`` inside an assert.
            pd.testing.assert_series_equal(positional[2], expected_call[2])
示例#17
0
文件: test_sampler.py 项目: Aylr/SDV
    def test__unflatten_gaussian_copula_negative_std(self):
        """_unflatten_gaussian_copula turns a zero or negative std into a positive one.

        A std of 0 is expected to become 1 and a std of -1 to become
        ``np.exp(-1)``. The triangular covariance is expected back as a square
        matrix, and each distribution gains ``type`` and ``fitted`` entries.
        """
        # Setup
        navigator = MagicMock()
        modeler = MagicMock()
        modeler.model_kwargs = {'distribution': 'distribution_name'}
        sampler = Sampler(navigator, modeler)

        model_parameters = {
            'some': 'key',
            'covariance': [[1], [0, 1]],
            'distribs': {
                0: {'first': 'distribution', 'std': 0},
                1: {'second': 'distribution', 'std': -1},
            },
        }

        expected_first = {
            'type': 'distribution_name',
            'fitted': True,
            'first': 'distribution',
            'std': 1,
        }
        expected_second = {
            'type': 'distribution_name',
            'fitted': True,
            'second': 'distribution',
            'std': np.exp(-1),
        }
        expected = {
            'some': 'key',
            'distribution': 'distribution_name',
            'covariance': [[1, 0], [0, 1]],
            'distribs': {0: expected_first, 1: expected_second},
        }

        # Run
        unflattened = sampler._unflatten_gaussian_copula(model_parameters)

        # Check
        assert unflattened == expected

        navigator.assert_not_called()
        modeler.assert_not_called()
示例#18
0
文件: test_sampler.py 项目: Aylr/SDV
    def test__unflatten_gaussian_copula(self):
        """_unflatten_gaussian_copula adds the distribution, type and fitted entries.

        The triangular covariance is expected back as a square matrix and both
        zero stds are expected to come back as 1.
        """
        # Setup
        navigator = MagicMock()
        modeler = MagicMock()
        modeler.model_kwargs = {'distribution': 'distribution_name'}
        sampler = Sampler(navigator, modeler)

        model_parameters = {
            'some': 'key',
            'covariance': [[1], [0, 1]],
            'distribs': {
                0: {'first': 'distribution', 'std': 0},
                1: {'second': 'distribution', 'std': 0},
            },
        }

        expected_first = {
            'type': 'distribution_name',
            'fitted': True,
            'first': 'distribution',
            'std': 1,
        }
        expected_second = {
            'type': 'distribution_name',
            'fitted': True,
            'second': 'distribution',
            'std': 1,
        }
        expected = {
            'some': 'key',
            'distribution': 'distribution_name',
            'covariance': [[1, 0], [0, 1]],
            'distribs': {0: expected_first, 1: expected_second},
        }

        # Run
        unflattened = sampler._unflatten_gaussian_copula(model_parameters)

        # Check
        assert unflattened == expected

        navigator.assert_not_called()
        modeler.assert_not_called()
示例#19
0
文件: sdv.py 项目: robertsievert/SDV
 def fit(self):
     """Load the dataset, transform it, model the database and create the sampler."""
     self.dn = CSVDataLoader(self.meta_file_name).load_data()

     # The data is transformed before the modeler is built on top of it.
     self.dn.transform_data()

     self.modeler = Modeler(self.dn)
     self.modeler.model_database()

     self.sampler = Sampler(self.dn, self.modeler)
示例#20
0
    def test_sample_no_sample_children(self):
        """Sampler.sample runs with ``sample_children=False`` on a table without parents."""
        # Setup
        sampler_mock = Mock(spec=Sampler)
        sampler_mock.metadata.get_parents.return_value = None
        sampler_mock.models = {'test': 'model'}

        # Run
        # NOTE(review): nothing is asserted afterwards; this only checks that
        # the call does not raise.
        Sampler.sample(sampler_mock, 'test', 5, sample_children=False)
示例#21
0
    def test__unflatten_dict_raises_error_column_index(self):
        """_unflatten_dict raises ValueError for the flat key ``foo__1__0``."""
        # Setup
        sampler_mock = Mock(spec=Sampler)
        flat_parameters = {'foo__1__0': 'some value'}

        # Run
        with pytest.raises(ValueError):
            Sampler._unflatten_dict(sampler_mock, flat_parameters)
示例#22
0
    def test_sample_no_sample_children(self):
        """With ``sample_children=False`` the sampled rows are passed on to be transformed.

        ``_transform_synthesized_rows`` must be called once with whatever
        ``_sample_rows`` returned and the table name.
        """
        # Setup
        sampler_mock = Mock(spec=Sampler)
        sampler_mock.metadata.get_parents.return_value = None
        sampler_mock.models = {'test': 'model'}

        # Run
        Sampler.sample(sampler_mock, 'test', 5, sample_children=False)

        # Asserts
        sampled_rows = sampler_mock._sample_rows.return_value
        sampler_mock._transform_synthesized_rows.assert_called_once_with(sampled_rows, 'test')
示例#23
0
    def test__sample_with_previous(self):
        """_sample appends newly sampled rows to the rows already sampled for the table.

        ``sampled`` already holds a ``DEMO`` frame. After the call, the frame
        handed to ``_sample_children`` is expected to contain the previous rows
        followed by the new ones, with the foreign key column filled from the
        parent row for the new rows only.
        """
        # Setup
        get_extension_mock = Mock(return_value={'child_rows': 0.999})
        get_model_mock = Mock(return_value=None)
        sample_valid_rows_mock = Mock(return_value=pd.DataFrame({'foo': [0, 1]}))
        sample_children_mock = Mock()

        dn_mock = Mock()
        dn_mock.foreign_keys = {
            ('DEMO', 'p_name'): ('parent_id', 'foreign_key')
        }

        sampler_mock = Mock()
        sampler_mock._get_extension = get_extension_mock
        sampler_mock._get_model = get_model_mock
        sampler_mock._sample_valid_rows = sample_valid_rows_mock
        sampler_mock._sample_children = sample_children_mock
        sampler_mock.dn = dn_mock

        table_name = 'DEMO'
        parent_name = 'p_name'
        parent_row = {'parent_id': 'foo'}
        sampled = {'DEMO': pd.DataFrame({'bar': [1, 2]})}

        # Run
        Sampler._sample(sampler_mock, table_name, parent_name, parent_row, sampled)

        # Asserts
        # ``np.nan`` is used rather than the ``np.NaN`` alias, which was removed
        # in NumPy 2.0.
        expected_sampled_frame = pd.DataFrame({
            'bar': [1, 2, np.nan, np.nan],
            'foo': [np.nan, np.nan, 0, 1],
            'foreign_key': [np.nan, np.nan, 'foo', 'foo']
        })
        children_args, _ = sample_children_mock.call_args
        actual_table_name, actual_sampled = children_args

        get_extension_mock.assert_called_once_with({'parent_id': 'foo'}, 'DEMO', 'p_name')
        get_model_mock.assert_called_once_with({'child_rows': 0.999})
        sample_valid_rows_mock.assert_called_once_with(None, 1, 'DEMO')

        assert actual_table_name == 'DEMO'

        pd.testing.assert_frame_equal(actual_sampled['DEMO'], expected_sampled_frame)
示例#24
0
    def test__reset_primary_keys_generators(self):
        """_reset_primary_keys_generators replaces both primary key stores with empty dicts."""
        # Setup
        sampler_mock = Mock()
        sampler_mock.primary_key = 'something'
        sampler_mock.remaining_primary_key = 'else'

        # Run
        Sampler._reset_primary_keys_generators(sampler_mock)

        # Asserts
        assert sampler_mock.primary_key == {}
        assert sampler_mock.remaining_primary_key == {}
示例#25
0
文件: test_sampler.py 项目: ush19/SDV
    def test_sample_rows_children_table(self, primary_mock, parent_mock,
                                        model_mock, extension_mock,
                                        sample_mock, update_mock, trans_mock):
        """sample_rows builds the model from the parent extension when the table has parents.

        The parent lookup mock returns a parent frame, so the extension mock is
        expected to be called once with a one-row frame of that parent, and the
        model obtained from the extension is the one used for sampling.
        """
        # Setup
        navigator = MagicMock(spec=DataNavigator)
        navigator.foreign_keys = {
            ('child_table', 'parent_name'): ('parent_pk', 'child_fk')
        }
        modeler = MagicMock(spec=Modeler)
        sampler = Sampler(data_navigator=navigator, modeler=modeler)

        parent_frame = pd.DataFrame({'foreign_key': [0, 1, 2]})
        sampled_frame = pd.DataFrame()

        primary_mock.return_value = ('primary_key', pd.Series(range(5)))
        parent_mock.return_value = ('parent_name', 'foreign_key', parent_frame)
        extension_mock.return_value = 'extension'
        model_mock.return_value = 'model from extension'
        sample_mock.return_value = sampled_frame
        update_mock.return_value = {'table_name': 'samples'}
        trans_mock.return_value = 'transformed_rows'

        # Run
        result = sampler.sample_rows('child_table', 5)

        # Check
        assert result == {'child_table': 'transformed_rows'}
        assert sampler.sampled == {'table_name': 'samples'}

        primary_mock.assert_called_once_with(sampler, 'child_table', 5)
        parent_mock.assert_called_once_with(sampler, 'child_table')
        sample_mock.assert_called_once_with(
            sampler, 'model from extension', 5, 'child_table')

        update_mock.assert_called_once_with(
            {}, 'child_table', ('primary_key', sampled_frame))
        trans_mock.assert_called_once_with(sampler, sampled_frame, 'child_table')

        # The extension call carries a DataFrame, so its arguments are checked
        # one by one instead of through assert_called_once_with.
        extension_calls = extension_mock.call_args_list
        assert len(extension_calls) == 1
        args, kwargs = extension_calls[0]
        assert kwargs == {}
        assert len(args) == 4
        assert args[0] == sampler
        assert args[1].equals(pd.DataFrame({'foreign_key': [0]}))
        assert args[2] == 'child_table'
        assert args[3] == 'parent_name'

        model_mock.assert_called_once_with(sampler, 'extension')
示例#26
0
    def test__fill_text_columns(self):
        """_fill_text_columns samples one row from the table an id field refers to.

        Only the call to ``sample_rows`` for the referenced table is checked.
        """
        # Setup
        referencing_field = {
            'name': 'a_field',
            'type': 'id',
            'ref': {
                'table': 'table_ref',
                'field': 'table_ref_id'
            }
        }
        regex_id_field = {
            'name': 'b_field',
            'type': 'id',
            'regex': '^[0-9]{10}$'
        }
        regex_text_field = {
            'name': 'c_field',
            'type': 'text',
            'regex': '^[a-z]{10}$'
        }
        table_meta = {
            'fields': {
                'a_field': referencing_field,
                'b_field': regex_id_field,
                'c_field': regex_text_field,
            }
        }

        data_navigator_mock = Mock()
        data_navigator_mock.tables = {'DEMO': Table(pd.DataFrame(), table_meta)}

        sample_rows_mock = Mock(return_value={
            'table_ref_id': {
                'name': 'table_ref_id'
            }
        })

        sampler_mock = Mock()
        sampler_mock.dn = data_navigator_mock
        sampler_mock.sample_rows = sample_rows_mock

        row = pd.DataFrame({'c_field': ['foo', 'bar', 'tar']})
        labels = ['a_field', 'b_field', 'c_field']

        # Run
        Sampler._fill_text_columns(sampler_mock, row, labels, 'DEMO')

        # Asserts
        sample_rows_mock.assert_called_once_with('table_ref', 1)
示例#27
0
    def test__find_parent_id_all_singlar_matrix(self, choice_mock):
        """If every likelihood is missing (singular matrix), num_rows gives the weights."""
        # Setup
        likelihoods = pd.Series([None, None, None, None])
        num_rows = pd.Series([1, 2, 3, 4])
        expected_weights = np.array([1 / 10, 2 / 10, 3 / 10, 4 / 10])

        # Run
        Sampler._find_parent_id(likelihoods, num_rows)

        # Check
        assert choice_mock.call_count == 1
        positional, keyword = choice_mock.call_args
        assert list(positional[0]) == list(likelihoods.index)
        np.testing.assert_array_equal(keyword['p'], expected_weights)
示例#28
0
    def test__find_parent_id_all_0_or_singlar_matrix(self, choice_mock):
        """If likelihoods are either 0 or missing, the gaps are filled from num_rows."""
        # Setup
        likelihoods = pd.Series([0, None, 0, None])
        num_rows = pd.Series([1, 2, 3, 4])
        expected_weights = np.array([0, 2 / 6, 0, 4 / 6])

        # Run
        Sampler._find_parent_id(likelihoods, num_rows)

        # Check
        assert choice_mock.call_count == 1
        positional, keyword = choice_mock.call_args
        assert list(positional[0]) == list(likelihoods.index)
        np.testing.assert_array_equal(keyword['p'], expected_weights)
示例#29
0
    def test__find_parent_id_some_good(self, choice_mock):
        """If only some likelihoods are usable, the missing ones are filled in.

        The expected weights correspond to the two gaps being filled with 1
        before everything is normalized by the total of 4.
        """
        # Setup
        likelihoods = pd.Series([0.5, None, 1.5, None])
        num_rows = pd.Series([1, 2, 3, 4])
        expected_weights = np.array([0.5 / 4, 1 / 4, 1.5 / 4, 1 / 4])

        # Run
        Sampler._find_parent_id(likelihoods, num_rows)

        # Check
        assert choice_mock.call_count == 1
        positional, keyword = choice_mock.call_args
        assert list(positional[0]) == list(likelihoods.index)
        np.testing.assert_array_equal(keyword['p'], expected_weights)
示例#30
0
    def test__find_parent_id_all_good(self, choice_mock):
        """If every likelihood is usable, the weights are the normalized likelihoods."""
        # Setup
        likelihoods = pd.Series([0.5, 1, 1.5, 2])
        num_rows = pd.Series([1, 2, 3, 4])
        expected_weights = np.array([0.5 / 5, 1 / 5, 1.5 / 5, 2 / 5])

        # Run
        Sampler._find_parent_id(likelihoods, num_rows)

        # Check
        assert choice_mock.call_count == 1
        positional, keyword = choice_mock.call_args
        assert list(positional[0]) == list(likelihoods.index)
        np.testing.assert_array_equal(keyword['p'], expected_weights)