示例#1
0
def test_evaluate_tables_from_demo():
    """Evaluate SDV samples of the demo dataset, per table and as a whole.

    The metadata is rebuilt by hand from the raw demo tables instead of being
    loaded alongside them. The scores are computed but not asserted on: the
    test passes as long as nothing raises.
    """
    tables = load_demo(metadata=False)

    # Declare the tables from parent to child: users -> sessions -> transactions.
    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table(
        'sessions',
        data=tables['sessions'],
        primary_key='session_id',
        parent='users',
        foreign_key='user_id',
    )
    timestamp_field = {'type': 'datetime', 'format': '%Y-%m-%d'}
    new_meta.add_table(
        'transactions',
        tables['transactions'],
        fields_metadata={'timestamp': timestamp_field},
        primary_key='transaction_id',
        parent='sessions',
    )

    sdv = SDV()
    sdv.fit(new_meta, tables=tables)
    sampled = sdv.sample_all()

    # Score every table on its own ...
    table_scores = {
        name: evaluate(sampled[name], real=tables[name], metadata=new_meta, table_name=name)
        for name in new_meta.get_tables()
    }

    # ... and then the complete dataset in a single call.
    evaluate(sampled, real=tables, metadata=new_meta)
示例#2
0
文件: benchmark.py 项目: zyteka/SDV
def _score_dataset(dataset, datasets_path, output):
    """Fit, sample and evaluate one dataset, recording the outcome in ``output``.

    Args:
        dataset:
            Name of the dataset. Also used as the folder name looked up
            under ``datasets_path``.
        datasets_path:
            Directory containing one folder per dataset, each with a
            ``metadata.json`` file. If ``None``, the dataset is obtained
            through ``load_demo`` instead.
        output:
            Object updated in place, through its ``update`` method, with the
            ``dataset`` name and either its ``score`` or an ``error`` string.

    Any exception raised along the way is logged and stored under ``error``
    instead of being propagated.
    """
    start = datetime.now()

    try:
        if datasets_path is not None:
            metadata_path = os.path.join(datasets_path, dataset, 'metadata.json')
            metadata = Metadata(metadata_path)
            tables = metadata.load_tables()
        else:
            metadata, tables = load_demo(dataset, metadata=True)

        sdv = SDV()
        LOGGER.info('Modeling dataset %s', dataset)
        sdv.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        sampled = sdv.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(sampled, metadata=metadata)

        elapsed = datetime.now() - start
        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score, elapsed)

        result = {'dataset': dataset, 'score': score}
        output.update(result)

    except Exception as ex:
        # Report the failure as "<ExceptionType>: <message>" and carry on.
        error = '{}: {}'.format(type(ex).__name__, str(ex))
        elapsed = datetime.now() - start
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error, elapsed)

        failure = {'dataset': dataset, 'error': error}
        output.update(failure)
示例#3
0
def test_sdv_multiparent():
    """Fit SDV on the ``got_families`` demo and exercise each sampling mode."""
    metadata, tables = load_demo('got_families', metadata=True)

    sdv = SDV()
    sdv.fit(metadata, tables)

    # Sample all: every table is expected in the result.
    everything = sdv.sample()
    assert set(everything.keys()) == {'characters', 'families', 'character_families'}
    assert len(everything['characters']) == 7

    # Sample with children: the requested table plus its child table.
    with_children = sdv.sample('characters', reset_primary_keys=True)
    assert set(with_children.keys()) == {'characters', 'character_families'}
    assert len(with_children['characters']) == 7
    assert 'family_id' in with_children['character_families']

    # Sample without children: each table comes back alone and must match
    # the real table in shape and column names.
    for name in ('characters', 'families', 'character_families'):
        synthetic = sdv.sample(name, sample_children=False)
        real = tables[name]

        assert synthetic.shape == real.shape
        assert set(synthetic.columns) == set(real.columns)
示例#4
0
def test_sdv():
    """Fit SDV on the default demo dataset and exercise each sampling mode."""
    metadata, tables = load_demo(metadata=True)

    sdv = SDV()
    sdv.fit(metadata, tables)

    expected_tables = {'users', 'sessions', 'transactions'}

    # Sample all: every table is expected in the result.
    everything = sdv.sample()
    assert set(everything.keys()) == expected_tables
    assert len(everything['users']) == 10

    # Sample with children: sampling the root table yields all three tables.
    with_children = sdv.sample('users', reset_primary_keys=True)
    assert set(with_children.keys()) == expected_tables
    assert len(with_children['users']) == 10

    # Sample without children: each table comes back alone and must match
    # the real table in shape and column names.
    for name in ('users', 'sessions', 'transactions'):
        synthetic = sdv.sample(name, sample_children=False)
        real = tables[name]

        assert synthetic.shape == real.shape
        assert set(synthetic.columns) == set(real.columns)
示例#5
0
    def test_integration(self):
        """Run fit, sample and evaluate end to end on the demo dataset.

        No values are asserted: the test only checks that the three report
        methods of the evaluation result can be called without raising.
        """
        metadata, tables = load_demo(metadata=True)

        model = SDV()
        model.fit(metadata, tables)
        synthetic = model.sample_all(20)

        report = evaluate(metadata, tables, synthetic)
        report.overall()
        report.details()
        report.highlights()
示例#6
0
def run_example():
    """Show SDV usage for tables containing more than one foreign key.

    Builds an ``SDV`` instance from ``data/meta.json``, fits it, and prints
    the first five rows sampled for every table.
    """
    # Setup
    vault = SDV('data/meta.json')
    vault.fit()

    # Run
    samples = vault.sample_all()

    for table_name, rows in samples.items():
        preview = rows.head(5)
        print('Samples generated for table {}:\n{}\n'.format(table_name, preview))
示例#7
0
文件: test_sdv.py 项目: LilyX2021/SDV
    def test_sample_table_not_fitted(self):
        """``sample_table`` must raise ``NotFittedError`` when ``sampler`` is ``None``."""
        # Setup: a stand-in instance that has no sampler yet.
        instance = mock.Mock(sampler=None)

        # Run / Check
        with self.assertRaises(NotFittedError):
            SDV.sample_table(instance, 'DEMO')
示例#8
0
    def test__check_unsupported_raises(self):
        """A table with two parents must trigger a ``ValueError``."""
        # Setup: a navigator that reports tables A and B, with parents X and Y
        # returned for any table asked about.
        navigator = mock.MagicMock()
        navigator.tables.keys.return_value = ['A', 'B']
        navigator.get_parents.return_value = ['X', 'Y']

        instance = SDV(meta_file_name='meta.json')
        instance.dn = navigator

        # Run / Check
        with self.assertRaises(ValueError):
            instance._check_unsupported_dataset_structure()
示例#9
0
文件: test_sdv.py 项目: surajitdb/SDV
def test_sdv_multi_foreign_key():
    """Check that multi-foreign-key datasets can be fitted and sampled.

    A multi-foreign-key dataset is one where a single table holds
    2 foreign keys that point to the same parent.
    """
    metadata, tables = datasets.load_multi_foreign_key()

    model = SDV()
    model.fit(metadata, tables)

    # Sample all
    result = model.sample()

    expected_tables = {'parent', 'child'}
    assert set(result.keys()) == expected_tables
    assert len(result['parent']) == 10
示例#10
0
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    The origin of this test can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194
    """
    metadata, tables = load_demo(metadata=True)

    # Re-declare the integer ``age`` column as categorical. The override is
    # applied to the dict export of the metadata, so the model has to be
    # fitted on that dict: fitting on the original ``metadata`` object would
    # silently ignore the override whenever ``to_dict`` returns a copy.
    # NOTE(review): assumes ``SDV.fit`` accepts a plain metadata dict - confirm.
    metadata_dict = metadata.to_dict()
    metadata_dict['tables']['users']['fields']['age'] = {'type': 'categorical'}

    sdv = SDV()
    sdv.fit(metadata_dict, tables)
    sampled = sdv.sample()

    # Every sampled table must keep the dtypes of its real counterpart.
    for name, table in tables.items():
        assert (sampled[name].dtypes == table.dtypes).all()
示例#11
0
文件: test_sdv.py 项目: LilyX2021/SDV
    def test_sample_table_fitted(self):
        """``sample_table`` must delegate to the sampler when one is set."""
        # Setup: a stand-in instance carrying a mock sampler.
        sampler = mock.Mock()
        instance = mock.Mock()
        instance.sampler = sampler

        # Run
        SDV.sample_table(instance, 'DEMO')

        # Asserts
        sampler.sample_table.assert_called_once_with('DEMO', reset_primary_keys=False)
示例#12
0
文件: test_sdv.py 项目: LilyX2021/SDV
    def test_sample_rows_fitted(self):
        """``sample_rows`` must delegate to the sampler when one is set."""
        # Setup: a stand-in instance carrying a mock sampler.
        sampler = mock.Mock()
        instance = mock.Mock()
        instance.sampler = sampler

        # Run
        SDV.sample_rows(instance, 'DEMO', 5)

        # Asserts
        sampler.sample_rows.assert_called_once_with(
            'DEMO',
            5,
            sample_children=True,
            reset_primary_keys=False,
        )
示例#13
0
def fit_save_model(mfile):
    """Fit SDV on a small hard-coded ``users`` table and save the model.

    The metadata is read from ``./user_table_metadata.json``, a path relative
    to the current working directory.

    Args:
        mfile: Destination handed to ``sdv.save``.
    """
    sdv = SDV()

    # Original data: ten users; ``gender`` contains missing values.
    user_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    countries = ['USA', 'UK', 'ES', 'UK', 'USA', 'DE', 'BG', 'ES', 'FR', 'UK']
    genders = ['M', 'F', None, 'M', 'F', 'M', 'F', None, 'F', None]
    ages = [34, 23, 44, 22, 54, 57, 45, 41, 23, 30]

    users = pd.DataFrame({
        'user_id': user_ids,
        'country': countries,
        'gender': genders,
        'age': ages,
    })

    with open('./user_table_metadata.json') as metadata_file:
        metadata = json.load(metadata_file)
        sdv.fit(metadata, {'users': users})
        sdv.save(mfile)
示例#14
0
"""
Running the SDV basic tutorial using their example dataset.
"""
from sdv import SDV, load_demo

# Load the demo tables together with their metadata and show the metadata.
metadata, tables = load_demo(metadata=True)
print(metadata)

# Fit a model with the default settings, then save it to ``sdv.pkl``.
sdv = SDV()
sdv.fit(metadata, tables)
print("done fit")
sdv.save('sdv.pkl')


示例#15
0
def generateSyntheticData(mfile):
    """Load a saved SDV model and sample all of its tables.

    Args:
        mfile: Location of the saved model, handed to ``SDV.load``.

    Returns:
        Whatever ``sample_all(10)`` returns for the loaded model.
    """
    model = SDV.load(mfile)
    return model.sample_all(10)
示例#16
0
def createmodel(tables, mfile):
    """Fit SDV on ``tables`` and save the fitted model.

    The metadata is read from ``./join_table_metadata.json``, a path relative
    to the current working directory.

    Args:
        tables: Table data handed to ``sdv.fit`` next to the metadata.
        mfile: Destination handed to ``sdv.save``.
    """
    model = SDV()
    with open('./join_table_metadata.json') as handle:
        model.fit(json.load(handle), tables)
        model.save(mfile)
示例#17
0
def genSyntheticData(mfile, rowcount):
    """Load a saved SDV model and return its sampled ``users`` table.

    Args:
        mfile: Location of the saved model, handed to ``SDV.load``.
        rowcount: Argument forwarded to ``sample_all``.

    Returns:
        pandas.DataFrame: Built from the ``users`` entry of the sampled tables.
    """
    sampled_tables = SDV.load(mfile).sample_all(rowcount)
    return pd.DataFrame(sampled_tables['users'])