예제 #1
0
    def __init__(self,
                 field_names=None,
                 field_types=None,
                 field_transformers=None,
                 anonymize_fields=None,
                 primary_key=None,
                 constraints=None,
                 table_metadata=None):
        """Set up the table metadata that backs this model.

        When ``table_metadata`` is ``None`` a new ``Table`` is built from the
        individual field arguments and the model is flagged as not fitted.
        Otherwise the given metadata is adopted (a ``dict`` is first converted
        with ``Table.from_dict``) and the fitted flag is copied from its
        ``fitted`` attribute.

        Args:
            field_names:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            field_types:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            field_transformers:
                Accepted but not used by this method.
                NOTE(review): confirm whether it should be forwarded to ``Table``.
            anonymize_fields:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            primary_key:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            constraints:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            table_metadata (dict or Table):
                Existing metadata to use instead of building a new ``Table``.

        Raises:
            ValueError:
                If ``table_metadata`` is given together with a truthy value for
                ``field_names``, ``primary_key``, ``field_types``,
                ``anonymize_fields`` or ``constraints``.
        """
        if table_metadata is None:
            self._metadata = Table(
                field_names=field_names,
                primary_key=primary_key,
                field_types=field_types,
                anonymize_fields=anonymize_fields,
                constraints=constraints,
                dtype_transformers=self._DTYPE_TRANSFORMERS,
            )
            self._metadata_fitted = False
        else:
            # Keep each value paired with its argument name: the values are
            # plain lists/dicts/strings and have no ``__name__`` attribute,
            # so the name cannot be recovered from the value itself.
            exclusive_args = (
                ('field_names', field_names),
                ('primary_key', primary_key),
                ('field_types', field_types),
                ('anonymize_fields', anonymize_fields),
                ('constraints', constraints),
            )
            for arg_name, arg_value in exclusive_args:
                if arg_value:
                    raise ValueError(
                        'If table_metadata is given {} must be None'.format(arg_name))

            if isinstance(table_metadata, dict):
                table_metadata = Table.from_dict(table_metadata)

            self._metadata = table_metadata
            self._metadata_fitted = table_metadata.fitted
예제 #2
0
파일: base.py 프로젝트: sdv-dev/SDV
    def __init__(self, field_names=None, field_types=None, anonymize_fields=None,
                 primary_key=None, entity_columns=None, context_columns=None,
                 sequence_index=None, segment_size=None, context_model=None,
                 table_metadata=None):
        """Set up the table metadata and sequence settings of this model.

        When ``table_metadata`` is ``None`` a new ``Table`` is built from the
        individual field arguments and the model is flagged as not fitted.
        Otherwise the given metadata is adopted (a ``dict`` is first converted
        with ``Table.from_dict``) and the fitted flag is copied from its
        ``fitted`` attribute.

        Args:
            field_names:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            field_types:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            anonymize_fields:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            primary_key:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            entity_columns:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            context_columns:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            sequence_index:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            segment_size:
                Either ``None``, an ``int``, or any other value, which is then
                converted with ``pandas.to_timedelta``. The non-``int`` form is
                only allowed when ``sequence_index`` is passed.
            context_model:
                Context model, or a string key into ``self._CONTEXT_MODELS``.
                Falls back to ``'gaussian_copula'`` when falsy.
            table_metadata (dict or Table):
                Existing metadata to use instead of building a new ``Table``.

        Raises:
            ValueError:
                If ``table_metadata`` is given together with a truthy value for
                any of the ``Table`` arguments listed above.
            TypeError:
                If ``segment_size`` is neither ``None`` nor an ``int`` and no
                ``sequence_index`` argument was passed.
        """
        if table_metadata is None:
            self._metadata = Table(
                field_names=field_names,
                primary_key=primary_key,
                field_types=field_types,
                anonymize_fields=anonymize_fields,
                dtype_transformers=self._DTYPE_TRANSFORMERS,
                sequence_index=sequence_index,
                entity_columns=entity_columns,
                context_columns=context_columns,
            )
            self._metadata_fitted = False
        else:
            # Keep each value paired with its argument name: the values are
            # plain lists/dicts/strings and have no ``__name__`` attribute,
            # so the name cannot be recovered from the value itself.
            null_args = (
                ('field_names', field_names),
                ('primary_key', primary_key),
                ('field_types', field_types),
                ('anonymize_fields', anonymize_fields),
                ('sequence_index', sequence_index),
                ('entity_columns', entity_columns),
                ('context_columns', context_columns),
            )
            for arg_name, arg_value in null_args:
                if arg_value:
                    raise ValueError(
                        'If table_metadata is given {} must be None'.format(arg_name))

            if isinstance(table_metadata, dict):
                table_metadata = Table.from_dict(
                    table_metadata,
                    dtype_transformers=self._DTYPE_TRANSFORMERS,
                )

            self._metadata = table_metadata
            self._metadata_fitted = table_metadata.fitted

        # Validate arguments
        # NOTE(review): this looks at the ``sequence_index`` argument, not at
        # the sequence index stored in ``table_metadata`` -- confirm intended.
        if segment_size is not None and not isinstance(segment_size, int):
            if sequence_index is None:
                raise TypeError(
                    '`segment_size` must be of type `int` if '
                    'no `sequence_index` is given.'
                )

            segment_size = pd.to_timedelta(segment_size)

        # The column settings are always read back from the metadata, so they
        # are also available when ``table_metadata`` was passed in.
        self._context_columns = self._metadata._context_columns
        self._entity_columns = self._metadata._entity_columns
        self._sequence_index = self._metadata._sequence_index
        self._segment_size = segment_size

        context_model = context_model or 'gaussian_copula'
        if isinstance(context_model, str):
            context_model = self._CONTEXT_MODELS[context_model]

        self._context_model_template = context_model
예제 #3
0
    def test_from_dict_min_max(self):
        """Check the numerical defaults produced by ``Table.from_dict``.

        The metadata dict carries no ``min_value``, ``max_value`` or
        ``rounding`` settings, so the ``integer`` and ``float`` transformer
        templates are expected to hold ``'auto'`` for all three.

        Input:
        - A dictionary representing a table's metadata
        Output:
        - A Table object
        """
        # Setup
        fields = {
            'item 0': {'type': 'id', 'subtype': 'integer'},
            'item 1': {'type': 'boolean'}
        }
        table_dict = {'fields': fields, 'primary_key': 'item 0'}

        # Run
        table = Table.from_dict(table_dict)

        # Assert
        for subtype in ('integer', 'float'):
            for setting in ('max_value', 'min_value', 'rounding'):
                template = table._transformer_templates[subtype]
                assert getattr(template, setting) == 'auto'
예제 #4
0
파일: copulas.py 프로젝트: sammykol83/SDV
    def __init__(self,
                 field_names=None,
                 field_types=None,
                 field_transformers=None,
                 anonymize_fields=None,
                 primary_key=None,
                 constraints=None,
                 table_metadata=None,
                 field_distributions=None,
                 default_distribution=None,
                 categorical_transformer=None):
        """Configure the distributions and categorical transformer of the model.

        A ``dict`` given as ``table_metadata`` is converted with
        ``Table.from_dict``. If the metadata holds model kwargs stored under
        this class name, they fill in whichever of ``field_distributions``,
        ``default_distribution`` and ``categorical_transformer`` were left as
        ``None``. The remaining arguments are passed on to the parent
        ``__init__``, and the values in use are stored back on the metadata
        with ``set_model_kwargs``.

        Raises:
            TypeError:
                If ``field_distributions`` is truthy but not a ``dict``.
        """
        if isinstance(table_metadata, dict):
            table_metadata = Table.from_dict(table_metadata)

        model_name = self.__class__.__name__
        stored_kwargs = table_metadata.get_model_kwargs(model_name) if table_metadata else None
        if stored_kwargs:
            # Explicit arguments win; stored values only fill the gaps.
            if field_distributions is None:
                field_distributions = stored_kwargs['field_distributions']

            if default_distribution is None:
                default_distribution = stored_kwargs['default_distribution']

            if categorical_transformer is None:
                categorical_transformer = stored_kwargs['categorical_transformer']

        is_dict = isinstance(field_distributions, dict)
        if field_distributions and not is_dict:
            raise TypeError('field_distributions can only be None or a dict instance')

        validated = {}
        for field, distribution in (field_distributions or {}).items():
            validated[field] = self._validate_distribution(distribution)

        self._field_distributions = validated

        chosen_default = self._validate_distribution(default_distribution)
        if not chosen_default:
            chosen_default = self._DEFAULT_DISTRIBUTION

        self._default_distribution = chosen_default

        if categorical_transformer:
            self._categorical_transformer = categorical_transformer
        else:
            self._categorical_transformer = self._DEFAULT_TRANSFORMER

        # Set on the instance before the parent __init__ runs.
        self._DTYPE_TRANSFORMERS = {'O': self._categorical_transformer}

        super().__init__(
            field_names=field_names,
            field_types=field_types,
            field_transformers=field_transformers,
            anonymize_fields=anonymize_fields,
            primary_key=primary_key,
            constraints=constraints,
            table_metadata=table_metadata,
        )

        # The values stored are the ones resolved above, before validation
        # and before the class-level defaults are applied.
        resolved_kwargs = {
            'field_distributions': field_distributions,
            'default_distribution': default_distribution,
            'categorical_transformer': categorical_transformer,
        }
        self._metadata.set_model_kwargs(model_name, resolved_kwargs)
예제 #5
0
    def test__get_faker_specified_locales_list(self):
        """Check that ``_get_faker`` honours an explicit list of locales.

        The field metadata declares ``pii_locales``, so the returned Faker
        object is expected to report exactly those locales.

        Input:
        - Field metadata from metadata dict.
        Output:
        - Faker object with specified list of localizations.
        """
        # Setup
        foo_meta = {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'company',
            'pii_locales': ['en_US', 'sv_SE']
        }
        table_dict = {'fields': {'foo': foo_meta}}

        # Run
        table = Table.from_dict(table_dict)
        localized = table._get_faker(table_dict['fields']['foo'])

        # Assert
        assert isinstance(localized, Faker)
        assert localized.locales == ['en_US', 'sv_SE']
예제 #6
0
    def test__get_faker_default_locale(self):
        """Check that ``_get_faker`` falls back to the default locale.

        The field metadata declares no ``pii_locales``, so the returned Faker
        object is expected to report only ``DEFAULT_LOCALE``.

        Input:
        - Field metadata from metadata dict.
        Output:
        - Faker object with default localization.
        """
        # Setup
        foo_meta = {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'company'
        }
        table_dict = {'fields': {'foo': foo_meta}}

        # Run
        table = Table.from_dict(table_dict)
        default_faker = table._get_faker(table_dict['fields']['foo'])

        # Assert
        assert isinstance(default_faker, Faker)
        assert default_faker.locales == [DEFAULT_LOCALE]
예제 #7
0
    def test__get_faker_method_pass_args(self):
        """Check that ``_get_faker_method`` applies the category arguments.

        The category is given as a tuple of a method name and an argument,
        and the generated values are expected to reflect that argument.

        Input:
        - Faker object to create faked values with.
        - Category tuple of category name and parameters passed to the method creating fake values.
        Output:
        - Fake values created with the specified method from the Faker object.
        Utilizing the arguments given to it.
        """
        # Setup
        foo_meta = {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'ean'
        }
        table = Table.from_dict({'fields': {'foo': foo_meta}})

        # Run
        generated = {}
        for length in (8, 13):
            make_ean = table._get_faker_method(Faker(), ('ean', length))
            generated[length] = make_ean()

        # Assert
        assert len(generated[8]) == 8
        assert len(generated[13]) == 13
예제 #8
0
파일: test_table.py 프로젝트: csala/SDV
    def test__make_ids_unique_field_index_out_of_order(self):
        """Check ``make_ids_unique`` on data with a custom, repeated index.

        The id column holds duplicates and the frame index repeats labels;
        the id column is expected to come back unique while the other column
        is left untouched.
        """
        fields = {
            'item 0': {'type': 'id', 'subtype': 'integer'},
            'item 1': {'type': 'boolean'},
        }
        table = Table.from_dict({'fields': fields, 'primary_key': 'item 0'})

        columns = {
            'item 0': [0, 1, 1, 2, 3, 5, 5, 6],
            'item 1': [True, True, False, False, True, False, False, True]
        }
        row_labels = [0, 1, 1, 2, 3, 5, 5, 6]
        frame = pd.DataFrame(columns, index=row_labels)

        result = table.make_ids_unique(frame)

        assert result['item 1'].equals(frame['item 1'])
        assert result['item 0'].is_unique
예제 #9
0
    def __init__(self, field_names=None, field_types=None, field_transformers=None,
                 anonymize_fields=None, primary_key=None, constraints=None, table_metadata=None,
                 distribution=None, default_distribution=None, categorical_transformer=None):
        """Configure the distribution and categorical transformer of the model.

        A ``dict`` given as ``table_metadata`` is converted with
        ``Table.from_dict``. If the metadata holds model kwargs stored under
        this class name, they fill in ``distribution`` and
        ``categorical_transformer`` when those were left as ``None``.
        ``default_distribution`` falls back to ``'parametric'`` and
        ``categorical_transformer`` to ``self._DEFAULT_TRANSFORMER``.

        NOTE(review): ``field_transformers`` is accepted but not passed to the
        parent ``__init__``, and ``default_distribution`` is never read from
        the stored model kwargs -- confirm whether that is intended.
        """
        if isinstance(table_metadata, dict):
            table_metadata = Table.from_dict(table_metadata)

        stored_kwargs = None
        if table_metadata:
            stored_kwargs = table_metadata.get_model_kwargs(self.__class__.__name__)

        if stored_kwargs:
            # Explicit arguments win; stored values only fill the gaps.
            if distribution is None:
                distribution = stored_kwargs['distribution']

            if categorical_transformer is None:
                categorical_transformer = stored_kwargs['categorical_transformer']

        self._distribution = distribution
        if default_distribution:
            self._default_distribution = default_distribution
        else:
            self._default_distribution = 'parametric'

        if not categorical_transformer:
            categorical_transformer = self._DEFAULT_TRANSFORMER

        self._categorical_transformer = categorical_transformer
        # Set on the instance before the parent __init__ runs.
        self._DTYPE_TRANSFORMERS = {'O': categorical_transformer}

        parent_kwargs = {
            'field_names': field_names,
            'primary_key': primary_key,
            'field_types': field_types,
            'anonymize_fields': anonymize_fields,
            'constraints': constraints,
            'table_metadata': table_metadata,
        }
        super().__init__(**parent_kwargs)
예제 #10
0
    def test_make_ids_unique_field_already_unique(self):
        """Check that ``make_ids_unique`` leaves an already unique id column as is."""
        fields = {
            'item 0': {'type': 'id', 'subtype': 'integer'},
            'item 1': {'type': 'boolean'}
        }
        table = Table.from_dict({'fields': fields, 'primary_key': 'item 0'})

        columns = {
            'item 0': [9, 1, 8, 2, 3, 7, 5, 6],
            'item 1': [True, True, False, False, True, False, False, True]
        }
        frame = pd.DataFrame(columns)

        result = table.make_ids_unique(frame)

        # Both columns are expected to come back unchanged.
        for column in ('item 1', 'item 0'):
            assert result[column].equals(frame[column])
예제 #11
0
    def test_make_ids_unique_field_not_unique(self):
        """Check that ``make_ids_unique`` returns unique ids for a duplicated id column."""
        fields = {
            'item 0': {'type': 'id', 'subtype': 'integer'},
            'item 1': {'type': 'boolean'}
        }
        table = Table.from_dict({'fields': fields, 'primary_key': 'item 0'})

        columns = {
            'item 0': [0, 1, 1, 2, 3, 5, 5, 6],
            'item 1': [True, True, False, False, True, False, False, True]
        }
        frame = pd.DataFrame(columns)

        result = table.make_ids_unique(frame)

        # The non-id column is untouched; the id column no longer repeats.
        assert result['item 1'].equals(frame['item 1'])
        assert result['item 0'].is_unique
예제 #12
0
파일: base.py 프로젝트: sdv-dev/SDV
    def __init__(self,
                 field_names=None,
                 field_types=None,
                 field_transformers=None,
                 anonymize_fields=None,
                 primary_key=None,
                 constraints=None,
                 table_metadata=None,
                 rounding='auto',
                 min_value='auto',
                 max_value='auto'):
        """Set up the table metadata that backs this model.

        When ``table_metadata`` is ``None`` a new ``Table`` is built from all
        the other arguments and the model is flagged as not fitted. Otherwise
        a deep copy of the given metadata is adopted (a ``dict`` is first
        converted with ``Table.from_dict``), its ``_dtype_transformers`` are
        updated with ``self._DTYPE_TRANSFORMERS`` and the fitted flag is
        copied from its ``fitted`` attribute. In that case ``rounding``,
        ``min_value`` and ``max_value`` are not used by this method.

        Args:
            field_names:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            field_types:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            field_transformers:
                Forwarded to ``Table``.
            anonymize_fields:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            primary_key:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            constraints:
                Forwarded to ``Table``. Must be empty if ``table_metadata`` is given.
            table_metadata (dict or Table):
                Existing metadata to use instead of building a new ``Table``.
            rounding:
                Forwarded to ``Table``. Defaults to ``'auto'``.
            min_value:
                Forwarded to ``Table``. Defaults to ``'auto'``.
            max_value:
                Forwarded to ``Table``. Defaults to ``'auto'``.

        Raises:
            ValueError:
                If ``table_metadata`` is given together with a truthy value for
                ``field_names``, ``primary_key``, ``field_types``,
                ``anonymize_fields`` or ``constraints``.
        """
        if table_metadata is None:
            self._metadata = Table(field_names=field_names,
                                   primary_key=primary_key,
                                   field_types=field_types,
                                   field_transformers=field_transformers,
                                   anonymize_fields=anonymize_fields,
                                   constraints=constraints,
                                   dtype_transformers=self._DTYPE_TRANSFORMERS,
                                   rounding=rounding,
                                   min_value=min_value,
                                   max_value=max_value)
            self._metadata_fitted = False
        else:
            # Work on a copy so the update of ``_dtype_transformers`` below
            # does not modify the object passed in by the caller.
            table_metadata = deepcopy(table_metadata)

            # Keep each value paired with its argument name: the values are
            # plain lists/dicts/strings and have no ``__name__`` attribute,
            # so the name cannot be recovered from the value itself.
            exclusive_args = (
                ('field_names', field_names),
                ('primary_key', primary_key),
                ('field_types', field_types),
                ('anonymize_fields', anonymize_fields),
                ('constraints', constraints),
            )
            for arg_name, arg_value in exclusive_args:
                if arg_value:
                    raise ValueError(
                        'If table_metadata is given {} must be None'.format(arg_name))

            if isinstance(table_metadata, dict):
                table_metadata = Table.from_dict(table_metadata)

            table_metadata._dtype_transformers.update(self._DTYPE_TRANSFORMERS)

            self._metadata = table_metadata
            self._metadata_fitted = table_metadata.fitted
예제 #13
0
def load_tabular_demo(dataset_name=None,
                      table_name=None,
                      data_path=DATA_PATH,
                      metadata=False):
    """Load a single demo table, optionally with its metadata.

    When ``dataset_name`` is given the dataset is loaded through
    ``_load_demo_dataset`` and one of its tables is returned. Otherwise the
    dummy table from ``_load_tabular_dummy`` is returned, and ``table_name``
    and ``data_path`` are not used. In both cases the table is passed through
    ``_dtypes64`` before being returned.

    Args:
        dataset_name (str):
            Dataset name to be loaded, if ``None`` use default demo data.
            Defaults to ``None``.
        table_name (str):
            If a table name is given, return this table from the indicated dataset.
            Otherwise, return the first one.
        data_path (str):
            Data path handed to the dataset loader, only used if dataset_name
            is provided. Defaults to ``DATA_PATH``.
        metadata (bool):
            If ``True`` also return a ``Table`` object. Defaults to ``False``.

    Returns:
        pandas.DataFrame or tuple:
            If ``metadata`` is ``False`` return the table data only.
            If ``metadata`` is ``True`` return a ``tuple`` with a ``Table``
            and the table data, in that order.
    """
    if not dataset_name:
        dummy = _dtypes64(_load_tabular_dummy())
        if not metadata:
            return dummy

        # Every field gets its own dict object, exactly as a literal would.
        dummy_fields = {
            name: {'type': 'categorical'}
            for name in ('company', 'department', 'name', 'address')
        }
        for name in ('age', 'age_when_joined', 'years_in_the_company'):
            dummy_fields[name] = {
                'type': 'numerical',
                'subtype': 'integer'
            }

        dummy_constraints = [
            {
                'constraint': 'UniqueCombinations',
                'columns': ['company', 'department'],
            },
            {
                'constraint': 'GreaterThan',
                'low': 'age_when_joined',
                'high': 'age'
            },
        ]
        dummy_meta = Table.from_dict({
            'fields': dummy_fields,
            'constraints': dummy_constraints,
            'model_kwargs': {}
        })
        return dummy_meta, dummy

    meta, tables = _load_demo_dataset(dataset_name, data_path)
    selected = meta.get_tables()[0] if table_name is None else table_name
    data = _dtypes64(tables[selected])

    if not metadata:
        return data

    return Table.from_dict(meta.get_table_meta(selected)), data
예제 #14
0
def test_table():
    """Run a fit/transform/reverse_transform cycle on pii and non-pii columns."""
    num_values = 100

    def _pii_field(category, locales=None):
        # Build the metadata entry of one categorical pii field.
        field = {
            'type': 'categorical',
            'pii': True,
            'pii_category': category,
        }
        if locales is not None:
            field['pii_locales'] = locales

        return field

    def _string_ids():
        # A fresh list of stringified row numbers for each column.
        return [str(i) for i in range(num_values)]

    def _between(series, low, high):
        # Element-wise check that the values compare strictly between the bounds.
        return (series > low) & (series < high)

    table = Table.from_dict({
        'fields': {
            'years_employed': {
                'type': 'numerical',
                'subtype': 'integer',
            },
            'ssn': _pii_field('ssn'),
            'company_US': _pii_field('company', ['en_US']),
            'company_US_CN': _pii_field('company', ['en_US', 'zh_CN']),
        },
    })
    frame = pd.DataFrame({
        'years_employed': np.random.choice(20, num_values),
        'ssn': _string_ids(),
        'company_US': _string_ids(),
        'company_US_CN': _string_ids(),
    })

    table.fit(frame)

    transformed = table.transform(frame)
    integer_dtypes = [np.dtype('int32'), np.dtype('int64')]
    assert transformed.dtypes.isin(integer_dtypes).all()

    recovered = table.reverse_transform(transformed)

    years = recovered['years_employed']
    assert years.dtype == 'int'
    assert ((years >= 0) & (years < 20)).all()

    ssn = recovered['ssn']
    ssn_pattern = re.compile(r'^\d\d\d-\d\d-\d\d\d\d$')
    assert ssn.dtype == 'object'
    assert ssn.str.match(ssn_pattern).all()

    company_us = recovered['company_US']
    assert company_us.dtype == 'object'
    # Check that all companies are sampled from the `en_US` locale
    assert _between(company_us, u'\u0000', u'\u007F').all()

    company_us_cn = recovered['company_US_CN']
    assert company_us_cn.dtype == 'object'
    # Check that we have sampled companies from the `en_US` locale
    assert _between(company_us_cn, u'\u0000', u'\u007F').any()
    # Check that we have sampled companies from the `zh_CN` locale
    assert _between(company_us_cn, u'\u4e00', u'\u9fff').any()