def __init__(self, field_names=None, field_types=None, field_transformers=None,
             anonymize_fields=None, primary_key=None, constraints=None,
             table_metadata=None):
    if table_metadata is None:
        self._metadata = Table(
            field_names=field_names,
            primary_key=primary_key,
            field_types=field_types,
            anonymize_fields=anonymize_fields,
            constraints=constraints,
            dtype_transformers=self._DTYPE_TRANSFORMERS,
        )
        self._metadata_fitted = False
    else:
        # These arguments are mutually exclusive with table_metadata. The names
        # are listed explicitly because plain values have no ``__name__``.
        explicit_args = {
            'field_names': field_names,
            'primary_key': primary_key,
            'field_types': field_types,
            'anonymize_fields': anonymize_fields,
            'constraints': constraints,
        }
        for name, value in explicit_args.items():
            if value:
                raise ValueError(
                    'If table_metadata is given {} must be None'.format(name))

        if isinstance(table_metadata, dict):
            table_metadata = Table.from_dict(table_metadata)

        self._metadata = table_metadata
        self._metadata_fitted = table_metadata.fitted
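def _example_table_metadata_dict():
    """Hypothetical usage sketch (not from the source) for the constructor above.

    ``table_metadata`` may be passed as a ``Table`` instance or as a plain
    dict, which is promoted with ``Table.from_dict``. Assumes
    ``GaussianCopula`` from ``sdv.tabular`` as a concrete subclass.
    """
    from sdv.tabular import GaussianCopula

    # All other schema arguments must stay None when table_metadata is given.
    model = GaussianCopula(table_metadata={
        'fields': {
            'id': {'type': 'id', 'subtype': 'integer'},
            'flag': {'type': 'boolean'},
        },
        'primary_key': 'id',
    })
    return model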
def __init__(self, field_names=None, field_types=None, anonymize_fields=None,
             primary_key=None, entity_columns=None, context_columns=None,
             sequence_index=None, segment_size=None, context_model=None,
             table_metadata=None):
    if table_metadata is None:
        self._metadata = Table(
            field_names=field_names,
            primary_key=primary_key,
            field_types=field_types,
            anonymize_fields=anonymize_fields,
            dtype_transformers=self._DTYPE_TRANSFORMERS,
            sequence_index=sequence_index,
            entity_columns=entity_columns,
            context_columns=context_columns,
        )
        self._metadata_fitted = False
    else:
        # These arguments are mutually exclusive with table_metadata. The names
        # are listed explicitly because plain values have no ``__name__``.
        null_args = {
            'field_names': field_names,
            'primary_key': primary_key,
            'field_types': field_types,
            'anonymize_fields': anonymize_fields,
            'sequence_index': sequence_index,
            'entity_columns': entity_columns,
            'context_columns': context_columns,
        }
        for name, value in null_args.items():
            if value:
                raise ValueError(
                    'If table_metadata is given {} must be None'.format(name))

        if isinstance(table_metadata, dict):
            table_metadata = Table.from_dict(
                table_metadata,
                dtype_transformers=self._DTYPE_TRANSFORMERS,
            )

        self._metadata = table_metadata
        self._metadata_fitted = table_metadata.fitted

    # Validate segment_size: a non-int is only allowed when a sequence_index
    # exists to interpret it as a time delta.
    if segment_size is not None and not isinstance(segment_size, int):
        if sequence_index is None:
            raise TypeError(
                '`segment_size` must be of type `int` if '
                'no `sequence_index` is given.'
            )

        segment_size = pd.to_timedelta(segment_size)

    self._context_columns = self._metadata._context_columns
    self._entity_columns = self._metadata._entity_columns
    self._sequence_index = self._metadata._sequence_index
    self._segment_size = segment_size

    context_model = context_model or 'gaussian_copula'
    if isinstance(context_model, str):
        context_model = self._CONTEXT_MODELS[context_model]

    self._context_model_template = context_model
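def _example_timeseries_init():
    """Hypothetical usage sketch (not from the source) for the constructor above.

    Assumes ``PAR`` from ``sdv.timeseries`` as a concrete subclass that
    defines ``_DTYPE_TRANSFORMERS`` and ``_CONTEXT_MODELS``.
    """
    from sdv.timeseries import PAR

    # Because sequence_index is given, a non-int segment_size is coerced
    # with pd.to_timedelta; without it, only an int row count is accepted.
    model = PAR(
        entity_columns=['store_id'],
        context_columns=['region'],
        sequence_index='date',
        segment_size='7d',
    )
    return model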
def test_from_dict_min_max(self):
    """Test the ``Table.from_dict`` method.

    Expect that when ``min_value`` and ``max_value`` are not provided,
    they are set to ``'auto'``.

    Input:
    - A dictionary representing a table's metadata.
    Output:
    - A ``Table`` object.
    """
    # Setup
    metadata_dict = {
        'fields': {
            'item 0': {'type': 'id', 'subtype': 'integer'},
            'item 1': {'type': 'boolean'}
        },
        'primary_key': 'item 0'
    }

    # Run
    metadata = Table.from_dict(metadata_dict)

    # Assert
    assert metadata._transformer_templates['integer'].max_value == 'auto'
    assert metadata._transformer_templates['integer'].min_value == 'auto'
    assert metadata._transformer_templates['integer'].rounding == 'auto'
    assert metadata._transformer_templates['float'].max_value == 'auto'
    assert metadata._transformer_templates['float'].min_value == 'auto'
    assert metadata._transformer_templates['float'].rounding == 'auto'
def __init__(self, field_names=None, field_types=None, field_transformers=None,
             anonymize_fields=None, primary_key=None, constraints=None,
             table_metadata=None, field_distributions=None,
             default_distribution=None, categorical_transformer=None):
    if isinstance(table_metadata, dict):
        table_metadata = Table.from_dict(table_metadata)

    if table_metadata:
        model_kwargs = table_metadata.get_model_kwargs(self.__class__.__name__)
        if model_kwargs:
            if field_distributions is None:
                field_distributions = model_kwargs['field_distributions']

            if default_distribution is None:
                default_distribution = model_kwargs['default_distribution']

            if categorical_transformer is None:
                categorical_transformer = model_kwargs['categorical_transformer']

    if field_distributions and not isinstance(field_distributions, dict):
        raise TypeError('field_distributions can only be None or a dict instance')

    self._field_distributions = {
        field: self._validate_distribution(distribution)
        for field, distribution in (field_distributions or {}).items()
    }
    self._default_distribution = (
        self._validate_distribution(default_distribution) or self._DEFAULT_DISTRIBUTION)

    self._categorical_transformer = categorical_transformer or self._DEFAULT_TRANSFORMER
    self._DTYPE_TRANSFORMERS = {'O': self._categorical_transformer}

    super().__init__(
        field_names=field_names,
        field_types=field_types,
        field_transformers=field_transformers,
        anonymize_fields=anonymize_fields,
        primary_key=primary_key,
        constraints=constraints,
        table_metadata=table_metadata,
    )

    self._metadata.set_model_kwargs(self.__class__.__name__, {
        'field_distributions': field_distributions,
        'default_distribution': default_distribution,
        'categorical_transformer': categorical_transformer,
    })
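def _example_field_distributions():
    """Hypothetical usage sketch (not from the source) for the constructor above.

    Shows a per-field distribution plus a default for the remaining fields,
    as consumed by ``field_distributions`` and ``default_distribution``.
    Assumes ``GaussianCopula`` from ``sdv.tabular``.
    """
    from sdv.tabular import GaussianCopula

    model = GaussianCopula(
        field_distributions={'age': 'beta'},  # validated by _validate_distribution
        default_distribution='gaussian',      # applied to every other field
    )
    return model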
def test__get_faker_specified_locales_list(self):
    """Test that ``_get_faker`` with a locales parameter sets localization correctly.

    The ``_get_faker`` method should return a ``Faker`` object localized to
    the specified locales.

    Input:
    - Field metadata from a metadata dict.
    Output:
    - ``Faker`` object with the specified list of localizations.
    """
    # Setup
    metadata_dict = {
        'fields': {
            'foo': {
                'type': 'categorical',
                'pii': True,
                'pii_category': 'company',
                'pii_locales': ['en_US', 'sv_SE']
            }
        }
    }

    # Run
    faker = Table.from_dict(metadata_dict)._get_faker(metadata_dict['fields']['foo'])

    # Assert
    assert isinstance(faker, Faker)
    assert faker.locales == ['en_US', 'sv_SE']
def test__get_faker_default_locale(self):
    """Test that ``_get_faker`` without a locales parameter uses the default locale.

    The ``_get_faker`` method should return a ``Faker`` object localized to
    the default locale when no locales are specified explicitly.

    Input:
    - Field metadata from a metadata dict.
    Output:
    - ``Faker`` object with the default localization.
    """
    # Setup
    metadata_dict = {
        'fields': {
            'foo': {
                'type': 'categorical',
                'pii': True,
                'pii_category': 'company'
            }
        }
    }

    # Run
    faker = Table.from_dict(metadata_dict)._get_faker(metadata_dict['fields']['foo'])

    # Assert
    assert isinstance(faker, Faker)
    assert faker.locales == [DEFAULT_LOCALE]
def test__get_faker_method_pass_args(self):
    """Test that ``_get_faker_method`` uses the parameters passed in the category argument.

    The ``_get_faker_method`` method should create fake values using the
    method and parameters given in the category tuple.

    Input:
    - Faker object to create faked values with.
    - Category tuple of the category name and the parameters passed to the
      method creating fake values.
    Output:
    - Fake values created with the specified method from the ``Faker``
      object, utilizing the arguments given to it.
    """
    # Setup
    metadata_dict = {
        'fields': {
            'foo': {
                'type': 'categorical',
                'pii': True,
                'pii_category': 'ean'
            }
        }
    }
    metadata = Table.from_dict(metadata_dict)

    # Run
    fake_8_ean = metadata._get_faker_method(Faker(), ('ean', 8))
    ean_8 = fake_8_ean()

    fake_13_ean = metadata._get_faker_method(Faker(), ('ean', 13))
    ean_13 = fake_13_ean()

    # Assert
    assert len(ean_8) == 8
    assert len(ean_13) == 13
def test__make_ids_unique_field_index_out_of_order(self):
    """Test that updated id column is unique even if index is out of order."""
    metadata_dict = {
        'fields': {
            'item 0': {
                'type': 'id',
                'subtype': 'integer'
            },
            'item 1': {
                'type': 'boolean'
            }
        },
        'primary_key': 'item 0'
    }
    metadata = Table.from_dict(metadata_dict)
    data = pd.DataFrame(
        {
            'item 0': [0, 1, 1, 2, 3, 5, 5, 6],
            'item 1': [True, True, False, False, True, False, False, True]
        },
        index=[0, 1, 1, 2, 3, 5, 5, 6])

    new_data = metadata.make_ids_unique(data)

    assert new_data['item 1'].equals(data['item 1'])
    assert new_data['item 0'].is_unique
def __init__(self, field_names=None, field_types=None, field_transformers=None,
             anonymize_fields=None, primary_key=None, constraints=None,
             table_metadata=None, distribution=None, default_distribution=None,
             categorical_transformer=None):
    if isinstance(table_metadata, dict):
        table_metadata = Table.from_dict(table_metadata)

    if table_metadata:
        model_kwargs = table_metadata.get_model_kwargs(self.__class__.__name__)
        if model_kwargs:
            if distribution is None:
                distribution = model_kwargs['distribution']

            if categorical_transformer is None:
                categorical_transformer = model_kwargs['categorical_transformer']

    self._distribution = distribution
    self._default_distribution = default_distribution or 'parametric'

    categorical_transformer = categorical_transformer or self._DEFAULT_TRANSFORMER
    self._categorical_transformer = categorical_transformer
    self._DTYPE_TRANSFORMERS = {'O': categorical_transformer}

    super().__init__(
        field_names=field_names,
        primary_key=primary_key,
        field_types=field_types,
        anonymize_fields=anonymize_fields,
        constraints=constraints,
        table_metadata=table_metadata
    )
def test_make_ids_unique_field_already_unique(self):
    """Test that id column is kept if already unique."""
    metadata_dict = {
        'fields': {
            'item 0': {'type': 'id', 'subtype': 'integer'},
            'item 1': {'type': 'boolean'}
        },
        'primary_key': 'item 0'
    }
    metadata = Table.from_dict(metadata_dict)
    data = pd.DataFrame({
        'item 0': [9, 1, 8, 2, 3, 7, 5, 6],
        'item 1': [True, True, False, False, True, False, False, True]
    })

    new_data = metadata.make_ids_unique(data)

    assert new_data['item 1'].equals(data['item 1'])
    assert new_data['item 0'].equals(data['item 0'])
def test_make_ids_unique_field_not_unique(self):
    """Test that id column is replaced with all unique values if not already unique."""
    metadata_dict = {
        'fields': {
            'item 0': {'type': 'id', 'subtype': 'integer'},
            'item 1': {'type': 'boolean'}
        },
        'primary_key': 'item 0'
    }
    metadata = Table.from_dict(metadata_dict)
    data = pd.DataFrame({
        'item 0': [0, 1, 1, 2, 3, 5, 5, 6],
        'item 1': [True, True, False, False, True, False, False, True]
    })

    new_data = metadata.make_ids_unique(data)

    assert new_data['item 1'].equals(data['item 1'])
    assert new_data['item 0'].is_unique
def __init__(self, field_names=None, field_types=None, field_transformers=None,
             anonymize_fields=None, primary_key=None, constraints=None,
             table_metadata=None, rounding='auto', min_value='auto',
             max_value='auto'):
    if table_metadata is None:
        self._metadata = Table(
            field_names=field_names,
            primary_key=primary_key,
            field_types=field_types,
            field_transformers=field_transformers,
            anonymize_fields=anonymize_fields,
            constraints=constraints,
            dtype_transformers=self._DTYPE_TRANSFORMERS,
            rounding=rounding,
            min_value=min_value,
            max_value=max_value
        )
        self._metadata_fitted = False
    else:
        table_metadata = deepcopy(table_metadata)
        # These arguments are mutually exclusive with table_metadata. The names
        # are listed explicitly because plain values have no ``__name__``.
        explicit_args = {
            'field_names': field_names,
            'primary_key': primary_key,
            'field_types': field_types,
            'anonymize_fields': anonymize_fields,
            'constraints': constraints,
        }
        for name, value in explicit_args.items():
            if value:
                raise ValueError(
                    'If table_metadata is given {} must be None'.format(name))

        if isinstance(table_metadata, dict):
            table_metadata = Table.from_dict(table_metadata)

        table_metadata._dtype_transformers.update(self._DTYPE_TRANSFORMERS)

        self._metadata = table_metadata
        self._metadata_fitted = table_metadata.fitted
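def _example_numerical_bounds():
    """Hypothetical usage sketch (not from the source) for the constructor above.

    The ``rounding``, ``min_value`` and ``max_value`` arguments control how
    sampled numerical values are rounded and clipped; ``'auto'`` (the default)
    learns them from the fitted data. Assumes ``GaussianCopula`` from
    ``sdv.tabular`` as a concrete subclass.
    """
    from sdv.tabular import GaussianCopula

    # Learn rounding and bounds from the real data (the default) ...
    auto_model = GaussianCopula()

    # ... or disable rounding and clipping entirely.
    unbounded_model = GaussianCopula(rounding=None, min_value=None, max_value=None)

    return auto_model, unbounded_model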
def load_tabular_demo(dataset_name=None, table_name=None, data_path=DATA_PATH, metadata=False):
    """Load a tabular demo.

    If a dataset name is given, it is downloaded from the sdv-datasets S3
    bucket. Otherwise, a toy dataset with a single table containing data from
    a short fake collection of employees is used.

    If ``metadata`` is ``True``, the output will be a tuple with a
    ``Metadata`` instance for the dataset and a ``pandas.DataFrame`` with the
    data from the table. If ``metadata`` is ``False``, only the
    ``pandas.DataFrame`` is returned.

    Args:
        dataset_name (str):
            Dataset name to be downloaded. If ``None``, use the default demo
            data. Defaults to ``None``.
        table_name (str):
            If a table name is given, return this table from the indicated
            dataset. Otherwise, return the first one.
        data_path (str):
            Data path to save the dataset files. Only used if
            ``dataset_name`` is provided. Defaults to ``DATA_PATH``.
        metadata (bool):
            If ``True``, also return a ``Table`` object. Defaults to
            ``False``.

    Returns:
        pandas.DataFrame or tuple:
            If ``metadata`` is ``False``, return a ``pandas.DataFrame`` with
            the table data. If ``metadata`` is ``True``, return a ``tuple``
            with a ``Table`` and the data.
    """
    if dataset_name:
        meta, tables = _load_demo_dataset(dataset_name, data_path)

        if table_name is None:
            table_name = meta.get_tables()[0]

        table = _dtypes64(tables[table_name])

        if metadata:
            return Table.from_dict(meta.get_table_meta(table_name)), table

        return table

    table = _dtypes64(_load_tabular_dummy())
    if metadata:
        table_meta = Table.from_dict({
            'fields': {
                'company': {'type': 'categorical'},
                'department': {'type': 'categorical'},
                'name': {'type': 'categorical'},
                'address': {'type': 'categorical'},
                'age': {'type': 'numerical', 'subtype': 'integer'},
                'age_when_joined': {'type': 'numerical', 'subtype': 'integer'},
                'years_in_the_company': {'type': 'numerical', 'subtype': 'integer'}
            },
            'constraints': [
                {
                    'constraint': 'UniqueCombinations',
                    'columns': ['company', 'department'],
                },
                {
                    'constraint': 'GreaterThan',
                    'low': 'age_when_joined',
                    'high': 'age'
                }
            ],
            'model_kwargs': {}
        })
        return table_meta, table

    return table
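def _example_load_tabular_demo():
    """Hypothetical usage sketch (not from the source) for ``load_tabular_demo``.

    Loads the toy employees table together with its ``Table`` metadata, then
    fits a model to it. Assumes ``GaussianCopula`` from ``sdv.tabular``.
    """
    from sdv.tabular import GaussianCopula

    table_meta, table = load_tabular_demo(metadata=True)

    model = GaussianCopula(table_metadata=table_meta)
    model.fit(table)
    return model.sample(10)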
def test_table():
    """Test Table with pii and non-pii columns."""
    num_values = 100
    metadata_dict = {
        'fields': {
            'years_employed': {
                'type': 'numerical',
                'subtype': 'integer',
            },
            'ssn': {
                'type': 'categorical',
                'pii': True,
                'pii_category': 'ssn',
            },
            'company_US': {
                'type': 'categorical',
                'pii': True,
                'pii_category': 'company',
                'pii_locales': ['en_US'],
            },
            'company_US_CN': {
                'type': 'categorical',
                'pii': True,
                'pii_category': 'company',
                'pii_locales': ['en_US', 'zh_CN'],
            },
        },
    }
    data = pd.DataFrame({
        'years_employed': np.random.choice(20, num_values),
        'ssn': [str(i) for i in range(num_values)],
        'company_US': [str(i) for i in range(num_values)],
        'company_US_CN': [str(i) for i in range(num_values)],
    })

    metadata = Table.from_dict(metadata_dict)
    metadata.fit(data)

    transformed = metadata.transform(data)
    assert transformed.dtypes.isin([np.dtype('int32'), np.dtype('int64')]).all()

    reverse_transformed = metadata.reverse_transform(transformed)

    sampled_years_employed = reverse_transformed['years_employed']
    assert sampled_years_employed.dtype == 'int'
    assert ((sampled_years_employed >= 0) & (sampled_years_employed < 20)).all()

    sampled_ssn = reverse_transformed['ssn']
    ssn_regex = re.compile(r'^\d\d\d-\d\d-\d\d\d\d$')
    assert sampled_ssn.dtype == 'object'
    assert sampled_ssn.str.match(ssn_regex).all()

    sampled_company_US = reverse_transformed['company_US']
    assert sampled_company_US.dtype == 'object'
    # Check that all companies are sampled from the `en_US` locale
    assert ((sampled_company_US > u'\u0000') & (sampled_company_US < u'\u007F')).all()

    sampled_company_US_CN = reverse_transformed['company_US_CN']
    assert sampled_company_US_CN.dtype == 'object'
    # Check that we have sampled companies from the `en_US` locale
    assert ((sampled_company_US_CN > u'\u0000') & (sampled_company_US_CN < u'\u007F')).any()
    # Check that we have sampled companies from the `zh_CN` locale
    assert ((sampled_company_US_CN > u'\u4e00') & (sampled_company_US_CN < u'\u9fff')).any()