Пример #1
0
    def fit(self, metadata, tables=None, root_path=None):
        """Fit this SDV instance to the given dataset.

        The metadata is wrapped in a ``Metadata`` instance when needed and
        validated against ``tables``; afterwards a ``Modeler`` models the
        database and a ``Sampler`` is created from the resulting models.

        Args:
            metadata (dict, str or Metadata):
                Metadata dict, path to the metadata JSON file or Metadata instance itself.
            tables (dict):
                Dictionary with the table names as key and ``pandas.DataFrame`` instances as
                values.  If ``None`` is given, the tables will be loaded from the paths
                indicated in ``metadata``. Defaults to ``None``.
            root_path (str or None):
                Path to the dataset directory. If ``None`` and metadata is
                a path, the metadata location is used. If ``None`` and
                metadata is a dict, the current working directory is used.
        """
        # Only build a new Metadata object when one was not handed in directly.
        if not isinstance(metadata, Metadata):
            metadata = Metadata(metadata, root_path)

        self.metadata = metadata
        self.metadata.validate(tables)

        self.modeler = Modeler(self.metadata, self.model, self.model_kwargs)
        self.modeler.model_database(tables)

        # The sampler reuses the models produced by the modeler above.
        self.sampler = Sampler(
            self.metadata,
            self.modeler.models,
            self.model,
            self.model_kwargs
        )
Пример #2
0
    def test__visualize_add_edges(self):
        """``_visualize_add_edges`` draws the labelled ``other`` -> ``demo`` edge on the plot."""
        # Setup: 'demo' reports 'other' as its only parent, 'other' reports none.
        metadata = MagicMock(spec_set=Metadata)
        metadata.get_tables.return_value = ['demo', 'other']
        metadata.get_parents.side_effect = [{'other'}, set()]
        metadata.get_primary_key.return_value = 'pk'
        metadata.get_foreign_key.return_value = 'fk'
        plot = Mock()

        # Run
        Metadata._visualize_add_edges(metadata, plot)

        # Asserts
        metadata.get_tables.assert_called_once_with()
        metadata.get_foreign_key.assert_called_once_with('other', 'demo')
        metadata.get_primary_key.assert_called_once_with('other')
        assert metadata.get_parents.call_args_list == [call('demo'), call('other')]

        edge_label = '   {}.{} -> {}.{}'.format('demo', 'fk', 'other', 'pk')
        plot.edge.assert_called_once_with('other', 'demo', label=edge_label, arrowhead='crow')
Пример #3
0
    def test_add_table_with_no_fields_data(self):
        """Add a table passing only ``data``, with no explicit fields.

        The field details returned by the mocked ``_get_field_details`` must
        be stored as the new table's ``fields``, and neither
        ``set_primary_key`` nor ``add_relationship`` may be called.
        """
        # Setup
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata._metadata = {'tables': dict()}
        metadata._get_field_details.return_value = {
            'a_field': {'type': 'numerical', 'subtype': 'integer'},
            'b_field': {'type': 'boolean'},
            'c_field': {'type': 'categorical'}
        }

        # Run
        data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']})

        Metadata.add_table(metadata, 'x_table', data=data)

        # Asserts
        expected_table_meta = {
            'fields': {
                'a_field': {'type': 'numerical', 'subtype': 'integer'},
                'b_field': {'type': 'boolean'},
                'c_field': {'type': 'categorical'}
            }
        }

        assert metadata._metadata['tables']['x_table'] == expected_table_meta

        # These used to be bare ``call_count == 0`` comparisons whose result
        # was discarded, so they never verified anything.
        metadata.set_primary_key.assert_not_called()
        metadata.add_relationship.assert_not_called()
Пример #4
0
    def test_add_table_with_data_str(self, mock_read_csv):
        """Add a table whose ``data`` is given as a path string.

        ``mock_read_csv`` supplies the frame returned for the path. The new
        table's metadata must hold the mocked field details plus the original
        ``path``, and neither ``set_primary_key`` nor ``add_relationship``
        may be called.
        """
        # Setup
        metadata = Mock(spec_set=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata._metadata = {'tables': dict()}
        mock_read_csv.return_value = pd.DataFrame({
            'a_field': [0, 1],
            'b_field': [True, False],
            'c_field': ['a', 'b']
        })
        metadata._get_field_details.return_value = {
            'a_field': {'type': 'numerical', 'subtype': 'integer'},
            'b_field': {'type': 'boolean'},
            'c_field': {'type': 'categorical'}
        }

        # Run
        Metadata.add_table(metadata, 'x_table', data='/path/to/file.csv')

        # Asserts
        expected_table_meta = {
            'fields': {
                'a_field': {'type': 'numerical', 'subtype': 'integer'},
                'b_field': {'type': 'boolean'},
                'c_field': {'type': 'categorical'}
            },
            'path': '/path/to/file.csv'
        }

        assert metadata._metadata['tables']['x_table'] == expected_table_meta

        # These used to be bare ``call_count == 0`` comparisons whose result
        # was discarded, so they never verified anything.
        metadata.set_primary_key.assert_not_called()
        metadata.add_relationship.assert_not_called()
Пример #5
0
    def test_add_table_with_fields_metadata(self):
        """Add a table passing only ``fields_metadata``.

        The given fields metadata must be stored as the new table's
        ``fields``, and neither ``set_primary_key`` nor ``add_relationship``
        may be called.
        """
        # Setup
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata._metadata = {'tables': dict()}

        # Run
        fields_metadata = {
            'a_field': {'type': 'numerical', 'subtype': 'integer'}
        }

        Metadata.add_table(metadata, 'x_table', fields_metadata=fields_metadata)

        # Asserts
        expected_table_meta = {
            'fields': {
                'a_field': {'type': 'numerical', 'subtype': 'integer'}
            }
        }

        assert metadata._metadata['tables']['x_table'] == expected_table_meta

        # These used to be bare ``call_count == 0`` comparisons whose result
        # was discarded, so they never verified anything.
        metadata.set_primary_key.assert_not_called()
        metadata.add_relationship.assert_not_called()
Пример #6
0
    def test_add_field(self):
        """Add an ``id`` field with a ``string`` subtype to a table with no fields yet."""
        # Setup
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = []
        metadata._metadata = {'tables': {'a_table': {'fields': {}}}}

        # Run
        Metadata.add_field(metadata, 'a_table', 'a_field', 'id', 'string', None)

        # Asserts
        expected_fields = {'a_field': {'type': 'id', 'subtype': 'string'}}
        assert metadata._metadata == {'tables': {'a_table': {'fields': expected_fields}}}

        # The field must have been checked as not existing before being added.
        metadata._check_field.assert_called_once_with('a_table', 'a_field', exists=False)
Пример #7
0
    def test_reverse_transform(self):
        """``reverse_transform`` forwards the given data to the table's hyper transformer.

        The hyper transformer registered under ``'test'`` is mocked, and the
        test checks that its ``reverse_transform`` received a frame equal to
        the input data.
        """
        # Setup
        ht_mock = Mock()
        ht_mock.reverse_transform.return_value = {
            'item 1': pd.Series([1.0, 2.0, None, 4.0, 5.0]),
            'item 2': pd.Series([1.1, None, 3.3, None, 5.5]),
            'item 3': pd.Series([None, 'bbb', 'ccc', 'ddd', None]),
            'item 4': pd.Series([True, False, None, False, True])
        }

        metadata = Mock(spec=Metadata)
        metadata._hyper_transformers = {
            'test': ht_mock
        }
        # ``np.object`` was only an alias of the builtin ``object`` and has been
        # removed from recent NumPy releases, so the builtin is used directly.
        metadata._get_dtypes.return_value = {
            'item 1': int,
            'item 2': float,
            'item 3': object,
            'item 4': bool,
        }

        # Run
        data = pd.DataFrame({'foo': [0, 1]})
        Metadata.reverse_transform(metadata, 'test', data)

        # Asserts
        expected_call = pd.DataFrame({'foo': [0, 1]})
        pd.testing.assert_frame_equal(
            ht_mock.reverse_transform.call_args[0][0],
            expected_call
        )
Пример #8
0
 def test__get_transformers_raise_valueerror(self):
     """``_get_transformers`` raises ``ValueError`` when given a ``str`` dtype."""
     # Setup
     unsupported_dtypes = {'string': str}

     # Run and asserts
     with pytest.raises(ValueError):
         Metadata._get_transformers(unsupported_dtypes, None)
Пример #9
0
def _load_demo_dataset(dataset_name, data_path):
    """Load the metadata and tables of a demo dataset.

    The dataset directory is resolved through ``_get_dataset_path`` and the
    ``metadata.json`` file inside it is used to build the ``Metadata``. Every
    table loaded from that metadata is passed through ``_dtypes64``.

    Args:
        dataset_name:
            Dataset name, forwarded to ``_get_dataset_path``.
        data_path:
            Data path, forwarded to ``_get_dataset_path``.

    Returns:
        tuple:
            The ``Metadata`` instance and a dict of tables keyed by table name.
    """
    metadata_path = os.path.join(
        _get_dataset_path(dataset_name, data_path),
        'metadata.json'
    )
    meta = Metadata(metadata=metadata_path)

    tables = {}
    for name, table in meta.load_tables().items():
        tables[name] = _dtypes64(table)

    return meta, tables
Пример #10
0
    def __init__(self, metadata, root_path=None):
        """Store the metadata and start with empty primary key bookkeeping.

        Args:
            metadata:
                A ``Metadata`` instance, used as is, or any other value, which
                is passed to the ``Metadata`` constructor with ``root_path``.
            root_path:
                Only used when a new ``Metadata`` has to be built.
                Defaults to ``None``.
        """
        if not isinstance(metadata, Metadata):
            metadata = Metadata(metadata, root_path)

        self.metadata = metadata
        self._primary_key_generators = {}
        self._remaining_primary_keys = {}
Пример #11
0
    def test_add_table_already_exist(self):
        """Adding a table whose name is already in ``get_tables`` raises ``ValueError``."""
        # Setup
        existing_tables = ['a_table', 'b_table']
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = existing_tables

        # Run and asserts
        with pytest.raises(ValueError):
            Metadata.add_table(metadata, 'a_table')
Пример #12
0
    def test_add_relationship_parent_no_exist(self):
        """Adding a relationship that involves an unknown table raises ``ValueError``."""
        # Setup: only 'a_table' is known, 'b_table' is not.
        known_tables = ['a_table']
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = known_tables

        # Run and asserts
        with pytest.raises(ValueError):
            Metadata.add_relationship(metadata, 'a_table', 'b_table')
Пример #13
0
    def test_add_relationship_already_exist(self):
        """Adding a relationship between already related tables raises ``ValueError``."""
        # Setup: both tables exist and 'b_table' is already reported as a parent.
        metadata = Mock(spec=Metadata)
        metadata.get_parents.return_value = {'b_table'}
        metadata.get_tables.return_value = ['a_table', 'b_table']

        # Run and asserts
        with pytest.raises(ValueError):
            Metadata.add_relationship(metadata, 'a_table', 'b_table')
Пример #14
0
    def test_get_dtypes_error_id(self):
        """``get_dtypes`` raises ``ValueError`` for an id that is no primary or foreign key."""
        # Setup: the only field is an id with no subtype, key or reference.
        metadata = Mock(spec=Metadata)
        metadata._DTYPES = Metadata._DTYPES
        metadata.get_table_meta.return_value = {
            'fields': {'item': {'type': 'id'}}
        }

        # Run and asserts
        with pytest.raises(ValueError):
            Metadata.get_dtypes(metadata, 'test', ids=True)
Пример #15
0
    def test_get_dtypes_error_subtype_id(self):
        """``get_dtypes`` raises ``ValueError`` for an id with the ``boolean`` subtype."""
        # Setup
        metadata = Mock(spec=Metadata)
        metadata._DTYPES = Metadata._DTYPES
        metadata.get_table_meta.return_value = {
            'fields': {'item': {'type': 'id', 'subtype': 'boolean'}}
        }

        # Run and asserts
        with pytest.raises(ValueError):
            Metadata.get_dtypes(metadata, 'test', ids=True)
Пример #16
0
    def test_get_dtypes_error_invalid_type(self):
        """``get_dtypes`` raises ``ValueError`` for a field of type ``unknown``."""
        # Setup
        metadata = Mock(spec=Metadata)
        metadata._DTYPES = Metadata._DTYPES
        metadata.get_table_meta.return_value = {
            'fields': {'item': {'type': 'unknown'}}
        }

        # Run and asserts
        with pytest.raises(ValueError):
            Metadata.get_dtypes(metadata, 'test')
Пример #17
0
    def test_add_relationship_parent_no_primary_key(self):
        """Adding a relationship raises ``ValueError`` when no primary key is reported."""
        # Setup: both tables exist and are unrelated, but there is no primary key.
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata.get_primary_key.return_value = None
        metadata.get_children.return_value = set()
        metadata.get_parents.return_value = set()

        # Run and asserts
        with pytest.raises(ValueError):
            Metadata.add_relationship(metadata, 'a_table', 'b_table')
Пример #18
0
def _tabular_metric(sdmetric, synthetic, real, metadata=None, details=False):
    """Run ``sdmetric.metrics`` and return either the metrics or their mean value.

    Args:
        sdmetric:
            Object exposing ``metrics(metadata, real, synthetic)``.
        synthetic:
            Synthetic data. Wrapped as ``{None: synthetic}`` when no
            ``metadata`` is given.
        real:
            Real data. Wrapped as ``{None: real}`` when no ``metadata``
            is given.
        metadata:
            Metadata passed to ``sdmetric.metrics``. If ``None``, a new
            ``Metadata`` is created and ``real`` is added to it as a table
            named ``None``. Defaults to ``None``.
        details (bool):
            Whether to return the list of metrics instead of their mean.
            Defaults to ``False``.

    Returns:
        The list of metrics if ``details`` is true, otherwise the
        ``np.mean`` of the ``value`` attribute of every metric.
    """
    if metadata is None:
        # No metadata given: register the real data under the ``None`` table
        # name and key both datasets the same way.
        metadata = Metadata()
        metadata.add_table(None, real)
        real, synthetic = {None: real}, {None: synthetic}

    metrics = sdmetric.metrics(metadata, real, synthetic)
    if not details:
        values = [metric.value for metric in metrics]
        return np.mean(values)

    return list(metrics)
Пример #19
0
    def test_get_dtypes_error_subtype_numerical(self):
        """``get_dtypes`` raises ``MetadataError`` for a numerical field with ``boolean`` subtype."""
        # Setup
        invalid_field = {'type': 'numerical', 'subtype': 'boolean'}
        metadata = Mock(spec_set=Metadata)
        metadata._DTYPES = Metadata._DTYPES
        metadata.get_table_meta.return_value = {'fields': {'item': invalid_field}}

        # Run and asserts
        with pytest.raises(MetadataError):
            Metadata.get_dtypes(metadata, 'test')
Пример #20
0
    def test__get_dtypes_error_subtype_numerical(self):
        """``_get_dtypes`` raises ``ValueError`` for a numerical field with ``boolean`` subtype."""
        # Setup
        invalid_field = {'type': 'numerical', 'subtype': 'boolean'}
        metadata = Mock(spec=Metadata)
        metadata.get_table_meta.return_value = {'fields': {'item': invalid_field}}

        # Run and asserts
        with pytest.raises(ValueError):
            Metadata._get_dtypes(metadata, 'test')
Пример #21
0
def _load_relational_dummy():
    """Build the in-memory relational dummy dataset.

    Three ten-row tables are created: ``users``, ``sessions`` (which has a
    ``user_id`` column) and ``transactions`` (which has a ``session_id``
    column). The ``timestamp`` column of ``transactions`` is converted to
    datetimes.

    Returns:
        tuple:
            A ``Metadata`` built from ``DEMO_METADATA`` and a dict with the
            three tables keyed by name.
    """
    # Every table uses the ids 0..9 as its own key column.
    row_ids = list(range(10))

    users = pd.DataFrame({
        'user_id': list(row_ids),
        'country': ['US', 'UK', 'ES', 'UK', 'US', 'DE', 'BG', 'ES', 'FR', 'UK'],
        'gender': ['M', 'F', None, 'M', 'F', 'M', 'F', None, 'F', None],
        'age': [34, 23, 44, 22, 54, 57, 45, 41, 23, 30]
    })

    sessions = pd.DataFrame({
        'session_id': list(row_ids),
        'user_id': [0, 1, 1, 2, 4, 5, 6, 6, 6, 8],
        'device': [
            'mobile', 'tablet', 'tablet', 'mobile', 'mobile',
            'mobile', 'mobile', 'tablet', 'mobile', 'tablet'
        ],
        'os': [
            'android', 'ios', 'android', 'android', 'ios',
            'android', 'ios', 'ios', 'ios', 'ios'
        ]
    })

    transactions = pd.DataFrame({
        'transaction_id': list(row_ids),
        'session_id': [0, 0, 1, 3, 5, 5, 7, 8, 9, 9],
        'timestamp': [
            '2019-01-01T12:34:32',
            '2019-01-01T12:42:21',
            '2019-01-07T17:23:11',
            '2019-01-10T11:08:57',
            '2019-01-10T21:54:08',
            '2019-01-11T11:21:20',
            '2019-01-22T14:44:10',
            '2019-01-23T10:14:09',
            '2019-01-27T16:09:17',
            '2019-01-29T12:10:48'
        ],
        'amount': [100.0, 55.3, 79.5, 112.1, 110.0, 76.3, 89.5, 132.1, 68.0, 99.9],
        'approved': [True, True, True, False, False, True, True, False, True, True],
    })
    # The timestamps are given as ISO strings; store them as datetimes.
    transactions['timestamp'] = pd.to_datetime(transactions['timestamp'])

    metadata = Metadata(DEMO_METADATA)
    return metadata, {
        'users': users,
        'sessions': sessions,
        'transactions': transactions
    }
Пример #22
0
    def test__get_graphviz_extension_none(self):
        """A ``None`` path yields ``(None, None)``."""
        # Run
        extension_info = Metadata._get_graphviz_extension(None)

        # Asserts
        expected = (None, None)
        assert extension_info == expected
Пример #23
0
    def test_get_dtypes_with_ids(self):
        """``get_dtypes`` with ``ids=True`` also reports the dtype of the id field."""
        # Setup
        fields = {
            'item 0': {'type': 'id', 'subtype': 'integer'},
            'item 1': {'type': 'numerical', 'subtype': 'integer'},
            'item 2': {'type': 'numerical', 'subtype': 'float'},
            'item 3': {'type': 'categorical'},
            'item 4': {'type': 'boolean'},
            'item 5': {'type': 'datetime'}
        }
        metadata = Mock(spec_set=Metadata)
        metadata._DTYPES = Metadata._DTYPES
        metadata.get_table_meta.return_value = {
            'fields': fields,
            'primary_key': 'item 0'
        }

        # Run
        dtypes = Metadata.get_dtypes(metadata, 'test', ids=True)

        # Asserts
        assert dtypes == {
            'item 0': 'int',
            'item 1': 'int',
            'item 2': 'float',
            'item 3': 'object',
            'item 4': 'bool',
            'item 5': 'datetime64',
        }
Пример #24
0
    def test_add_table_with_fields_no_data(self):
        """Adding a table with a list of field names but no data stores empty ``fields``."""
        # Setup
        metadata = Mock(spec=Metadata)
        metadata._metadata = {'tables': {}}
        metadata.get_tables.return_value = ['a_table', 'b_table']

        # Run
        Metadata.add_table(metadata, 'x_table', fields=['a_field', 'b_field'])

        # Asserts
        stored_table_meta = metadata._metadata['tables']['x_table']
        assert stored_table_meta == {'fields': {}}
Пример #25
0
    def test__get_graphviz_extension_valid(self):
        """A ``.png`` path is split into the path without extension and ``'png'``."""
        # Run
        extension_info = Metadata._get_graphviz_extension('/some/path.png')

        # Asserts
        expected = ('/some/path', 'png')
        assert extension_info == expected
Пример #26
0
    def test__dict_metadata(self):
        """``_dict_metadata`` turns the table and field lists into dicts keyed by name."""
        # Setup: list-based metadata with a single table holding a single field.
        list_metadata = {
            'tables': [
                {
                    'name': 'test',
                    'use': True,
                    'fields': [
                        {
                            'ref': {'table': 'table_ref', 'field': 'field_ref'},
                            'name': 'test_field'
                        }
                    ]
                }
            ]
        }

        # Run
        dict_metadata = Metadata._dict_metadata(list_metadata)

        # Asserts
        expected_field = {
            'ref': {'table': 'table_ref', 'field': 'field_ref'},
            'name': 'test_field'
        }
        expected_table = {
            'use': True,
            'name': 'test',
            'fields': {'test_field': expected_field}
        }
        assert dict_metadata == {'tables': {'test': expected_table}}
Пример #27
0
    def test__get_pii_fields(self):
        """Only the field flagged with ``pii: True`` is returned, mapped to its category."""
        # Setup: 'foo' is flagged as pii, 'bar' has a category but no pii flag.
        fields = {
            'foo': {'type': 'categorical', 'pii': True, 'pii_category': 'email'},
            'bar': {'type': 'categorical', 'pii_category': 'email'}
        }
        metadata = Mock(spec=Metadata)
        metadata.get_table_meta.return_value = {'fields': fields}

        # Run
        pii_fields = Metadata._get_pii_fields(metadata, 'test')

        # Asserts
        assert pii_fields == {'foo': 'email'}
Пример #28
0
    def test__get_dtypes_no_ids(self):
        """Test get data types excluding ids.

        The ``id`` field (``item 0``) must be absent from the result, while
        every other field is mapped to the dtype expected for its type and
        subtype.
        """
        # Setup
        table_meta = {
            'fields': {
                'item 0': {'type': 'id', 'subtype': 'integer'},
                'item 1': {'type': 'numerical', 'subtype': 'integer'},
                'item 2': {'type': 'numerical', 'subtype': 'float'},
                'item 3': {'type': 'categorical'},
                'item 4': {'type': 'boolean'},
                'item 5': {'type': 'datetime'},
            }
        }
        metadata = Mock(spec=Metadata)
        metadata.get_table_meta.return_value = table_meta
        metadata._DTYPES = Metadata._DTYPES

        # Run
        result = Metadata._get_dtypes(metadata, 'test')

        # Asserts
        # ``np.object`` was only an alias of the builtin ``object`` and has been
        # removed from recent NumPy releases, so the builtin is used directly.
        expected = {
            'item 1': int,
            'item 2': float,
            'item 3': object,
            'item 4': bool,
            'item 5': np.datetime64,
        }
        assert result == expected
Пример #29
0
    def test_load_tables(self):
        """``load_tables`` returns a dict with one loaded table per requested name."""
        # Setup: ``load_table`` hands out one frame per call, in this order.
        metadata = Mock(spec=Metadata)
        metadata.get_tables.side_effect = ['foo', 'bar', 'tar']
        metadata.load_table.side_effect = [
            pd.DataFrame({'foo': [1, 2]}),
            pd.DataFrame({'bar': [3, 4]}),
            pd.DataFrame({'tar': [5, 6]})
        ]

        # Run
        requested = ['table 1', 'table 2', 'table 3']
        loaded = Metadata.load_tables(metadata, tables=requested)

        # Asserts
        expected = {
            'table 1': pd.DataFrame({'foo': [1, 2]}),
            'table 2': pd.DataFrame({'bar': [3, 4]}),
            'table 3': pd.DataFrame({'tar': [5, 6]})
        }
        assert loaded.keys() == expected.keys()

        for table_name, table in loaded.items():
            pd.testing.assert_frame_equal(table, expected[table_name])
Пример #30
0
    def test_get_foreign_key(self):
        """``get_foreign_key`` returns the child field whose ref points at the parent key."""
        # Setup: only 'a_field' references the primary key reported for the parent.
        child_fields = {
            'a_field': {
                'ref': {'field': 'a_primary_key'},
                'name': 'a_field'
            },
            'p_field': {
                'ref': {'field': 'another_key_field'},
                'name': 'p_field'
            }
        }
        metadata = Mock(spec=Metadata)
        metadata.get_fields.return_value = child_fields
        metadata.get_primary_key.return_value = 'a_primary_key'

        # Run
        foreign_key = Metadata.get_foreign_key(metadata, 'parent', 'child')

        # Asserts
        assert foreign_key == 'a_field'
        metadata.get_fields.assert_called_once_with('child')
        metadata.get_primary_key.assert_called_once_with('parent')