Пример #1
0
def _score_dataset(dataset, datasets_path, output):
    start = datetime.now()

    try:
        if datasets_path is None:
            metadata, tables = load_demo(dataset, metadata=True)
        else:
            metadata = Metadata(
                os.path.join(datasets_path, dataset, 'metadata.json'))
            tables = metadata.load_tables()

        sdv = SDV()
        LOGGER.info('Modeling dataset %s', dataset)
        sdv.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        sampled = sdv.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(sampled, metadata=metadata)

        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score,
                    datetime.now() - start)
        output.update({
            'dataset': dataset,
            'score': score,
        })

    except Exception as ex:
        error = '{}: {}'.format(type(ex).__name__, str(ex))
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error,
                     datetime.now() - start)
        output.update({'dataset': dataset, 'error': error})
Пример #2
0
def test_ctgan():
    users = load_demo(metadata=False)['users']

    ctgan = CTGAN(primary_key='user_id', epochs=1)
    ctgan.fit(users)

    sampled = ctgan.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    assert ctgan.get_metadata().to_dict() == {
        'fields': {
            'user_id': {
                'type': 'id',
                'subtype': 'integer'
            },
            'country': {
                'type': 'categorical'
            },
            'gender': {
                'type': 'categorical'
            },
            'age': {
                'type': 'numerical',
                'subtype': 'integer'
            }
        },
        'constraints': [],
        'model_kwargs': {}
    }
Пример #3
0
def test_gaussian_copula():
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical'
        }
    }
    anonymize_fields = {'country': 'country_code'}

    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        categorical_transformer='one_hot_encoding',
    )
    gc.fit(users)

    parameters = gc.get_parameters()
    new_gc = GaussianCopula(
        table_metadata=gc.get_metadata(),
        categorical_transformer='one_hot_encoding',
    )
    new_gc.set_parameters(parameters)

    sampled = new_gc.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    # country codes have been replaced with new ones
    assert set(sampled.country.unique()) != set(users.country.unique())

    metadata = gc.get_metadata().to_dict()
    assert metadata['fields'] == {
        'user_id': {
            'type': 'id',
            'subtype': 'integer'
        },
        'country': {
            'type': 'categorical'
        },
        'gender': {
            'type': 'categorical'
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer'
        }
    }

    assert 'model_kwargs' in metadata
Пример #4
0
def test_recreate():
    data = load_demo(metadata=False)['users']

    # If distribution is non parametric, get_parameters fails
    model = CTGAN(epochs=1)
    model.fit(data)
    sampled = model.sample()

    assert sampled.shape == data.shape
    assert (sampled.dtypes == data.dtypes).all()
    assert (sampled.notnull().sum(axis=1) != 0).all()

    # Metadata
    model_meta = CTGAN(epochs=1, table_metadata=model.get_metadata())
    model_meta.fit(data)
    sampled = model_meta.sample()

    assert sampled.shape == data.shape
    assert (sampled.dtypes == data.dtypes).all()
    assert (sampled.notnull().sum(axis=1) != 0).all()

    # Metadata dict
    model_meta_dict = CTGAN(epochs=1,
                            table_metadata=model.get_metadata().to_dict())
    model_meta_dict.fit(data)
    sampled = model_meta_dict.sample()

    assert sampled.shape == data.shape
    assert (sampled.dtypes == data.dtypes).all()
    assert (sampled.notnull().sum(axis=1) != 0).all()
Пример #5
0
def test_copulagan():
    users = load_demo(metadata=False)['users']

    model = CopulaGAN(
        primary_key='user_id',
        epochs=1,
        field_distributions={
            'age': 'beta'
        },
        default_distribution='bounded'
    )
    model.fit(users)

    sampled = model.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    assert model.get_metadata().to_dict() == {
        'fields': {
            'user_id': {
                'type': 'id',
                'subtype': 'integer',
                'transformer': 'integer',
            },
            'country': {
                'type': 'categorical',
                'transformer': 'label_encoding',
            },
            'gender': {
                'type': 'categorical',
                'transformer': 'label_encoding',
            },
            'age': {
                'type': 'numerical',
                'subtype': 'integer',
                'transformer': 'integer',
            }
        },
        'primary_key': 'user_id',
        'constraints': [],
        'sequence_index': None,
        'context_columns': [],
        'entity_columns': [],
        'model_kwargs': {},
        'name': None
    }
Пример #6
0
def test_ctgan():
    users = load_demo(metadata=False)['users']

    ctgan = CTGAN(
        primary_key='user_id',
        epochs=1
    )
    ctgan.fit(users)

    sampled = ctgan.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    expected_metadata = {
        'fields': {
            'user_id': {
                'type': 'id',
                'subtype': 'integer',
                'transformer': 'integer',
            },
            'country': {
                'type': 'categorical',
                'transformer': None,
            },
            'gender': {
                'type': 'categorical',
                'transformer': None,
            },
            'age': {
                'type': 'numerical',
                'subtype': 'integer',
                'transformer': 'integer',
            }
        },
        'primary_key': 'user_id',
        'constraints': [],
        'sequence_index': None,
        'context_columns': [],
        'entity_columns': [],
        'model_kwargs': {},
        'name': None
    }
    assert ctgan.get_metadata().to_dict() == expected_metadata
Пример #7
0
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    The origin of this tests can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194
    """
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'categorical',
        },
    }
    gc = GaussianCopula(field_types=field_types, categorical_transformer='categorical')
    gc.fit(users)

    sampled = gc.sample()

    assert users['age'].dtype == np.int64
    assert sampled['age'].dtype == np.int64
Пример #8
0
def test_sdv_model_kwargs():
    metadata, tables = load_demo(metadata=True)
    tables = {'users': tables['users']}
    metadata = metadata.to_dict()
    del metadata['tables']['sessions']
    del metadata['tables']['transactions']

    hma = HMA1(metadata, model=GaussianCopula, model_kwargs={
        'default_distribution': 'beta',
        'categorical_transformer': 'label_encoding',
    })
    hma.fit(tables)

    model = hma._models['users']
    assert model._default_distribution == BetaUnivariate
    assert model._DTYPE_TRANSFORMERS['O'] == 'label_encoding'
    assert isinstance(
        model._metadata._hyper_transformer._transformers['gender'],
        rdt.transformers.categorical.LabelEncodingTransformer
    )
Пример #9
0
def test_gaussian_copula():
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical'
        }
    }
    anonymize_fields = {'country': 'country_code'}

    # If distribution is non parametric, get_parameters fails
    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        field_distributions={'age': 'gamma'},
        default_distribution='gaussian_kde',
    )
    gc.fit(users)
    with pytest.raises(NonParametricError):
        parameters = gc.get_parameters()

    # If distribution is parametric, copula can be recreated
    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        field_distributions={'age': 'gamma'},
        default_distribution='bounded',
    )
    gc.fit(users)

    parameters = gc.get_parameters()
    new_gc = GaussianCopula(table_metadata=gc.get_metadata(), )
    new_gc.set_parameters(parameters)

    # Validate sampled dat
    sampled = new_gc.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    # country codes have been replaced with new ones
    assert set(sampled.country.unique()) != set(users.country.unique())

    # Validate metadata
    metadata = gc.get_metadata().to_dict()
    assert metadata['fields'] == {
        'user_id': {
            'type': 'id',
            'subtype': 'integer',
            'transformer': 'integer',
        },
        'country': {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'country_code',
            'transformer': 'one_hot_encoding',
        },
        'gender': {
            'type': 'categorical',
            'transformer': 'one_hot_encoding',
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
            'transformer': 'integer',
        }
    }

    assert 'model_kwargs' in metadata
    assert 'GaussianCopula' in metadata['model_kwargs']