示例#1
0
文件: benchmark.py 项目: zyteka/SDV
def _score_dataset(dataset, datasets_path, output):
    """Fit, sample and evaluate SDV on one dataset, recording the outcome in ``output``.

    Any exception raised along the way is caught, logged and reported through
    ``output`` instead of being propagated to the caller.

    Args:
        dataset:
            Name of the dataset. Passed to ``load_demo`` when ``datasets_path``
            is ``None``; otherwise used as the sub-folder name inside
            ``datasets_path``.
        datasets_path:
            Folder that contains ``<dataset>/metadata.json``, or ``None`` to
            load the dataset through ``load_demo``.
        output:
            Mapping updated in place with the ``dataset`` name plus either a
            ``score`` entry on success or an ``error`` entry on failure.
    """
    started = datetime.now()

    try:
        if datasets_path is not None:
            metadata_path = os.path.join(datasets_path, dataset, 'metadata.json')
            metadata = Metadata(metadata_path)
            tables = metadata.load_tables()
        else:
            metadata, tables = load_demo(dataset, metadata=True)

        model = SDV()
        LOGGER.info('Modeling dataset %s', dataset)
        model.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        synthetic = model.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(synthetic, metadata=metadata)

        elapsed = datetime.now() - started
        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score, elapsed)

        result = {'dataset': dataset, 'score': score}
        output.update(result)

    except Exception as ex:
        # Deliberately broad: a failing dataset is reported through ``output``
        # as "<ExceptionName>: <message>" rather than raised.
        error = ': '.join((type(ex).__name__, str(ex)))
        elapsed = datetime.now() - started
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error, elapsed)

        failure = {'dataset': dataset, 'error': error}
        output.update(failure)
示例#2
0
def test_evaluate_tables_from_demo():
    """Smoke-test ``evaluate`` per table and on the whole dataset with hand-built metadata.

    The test makes no assertions on the scores; it only checks that the
    calls complete without raising.
    """
    real_tables = load_demo(metadata=False)

    # Describe the demo tables with a freshly built Metadata object.
    metadata = Metadata()
    metadata.add_table('users', data=real_tables['users'], primary_key='user_id')
    metadata.add_table(
        'sessions',
        data=real_tables['sessions'],
        primary_key='session_id',
        parent='users',
        foreign_key='user_id',
    )
    timestamp_field = {'type': 'datetime', 'format': '%Y-%m-%d'}
    metadata.add_table(
        'transactions',
        real_tables['transactions'],
        fields_metadata={'timestamp': timestamp_field},
        primary_key='transaction_id',
        parent='sessions',
    )

    model = SDV()
    model.fit(metadata, tables=real_tables)
    synthetic = model.sample_all()

    # Evaluate each table on its own first ...
    table_scores = {}
    for name in metadata.get_tables():
        table_scores[name] = evaluate(
            synthetic[name],
            real=real_tables[name],
            metadata=metadata,
            table_name=name,
        )

    # ... and then all the tables together.
    evaluate(synthetic, real=real_tables, metadata=metadata)
示例#3
0
def test_sdv():
    """Fit SDV on the demo data and check the three sampling modes.

    Covers sampling every table, sampling ``users`` together with its
    children, and sampling each table on its own.
    """
    metadata, tables = load_demo(metadata=True)
    expected_tables = {'users', 'sessions', 'transactions'}

    model = SDV()
    model.fit(metadata, tables)

    # Sample all
    everything = model.sample_all()
    assert set(everything.keys()) == expected_tables
    assert len(everything['users']) == 10

    # Sample with children
    with_children = model.sample('users', reset_primary_keys=True)
    assert set(with_children.keys()) == expected_tables
    assert len(with_children['users']) == 10

    # Sample without children: each synthetic table must match the shape and
    # the column names of its real counterpart.
    for name in ('users', 'sessions', 'transactions'):
        synthetic = model.sample(name, sample_children=False)
        real = tables[name]

        assert synthetic.shape == real.shape
        assert set(synthetic.columns) == set(real.columns)
示例#4
0
def run_example():
    """Example of usage of SDV for tables containing more than one foreign key."""
    # Setup: build SDV from the metadata file and fit it.
    sdv = SDV('data/meta.json')
    sdv.fit()

    # Run: sample every table and print the first rows of each one.
    samples = sdv.sample_all()

    template = 'Samples generated for table {}:\n{}\n'
    for table_name, frame in samples.items():
        preview = frame.head(5)
        print(template.format(table_name, preview))
示例#5
0
    def test_integration(self):
        """Run fit, sample and evaluate on the demo data and call every report method.

        There are no assertions: the test passes as long as nothing raises.
        """
        demo_metadata, real_tables = load_demo(metadata=True)

        model = SDV()
        model.fit(demo_metadata, real_tables)
        synthetic_tables = model.sample_all(20)

        report = evaluate(demo_metadata, real_tables, synthetic_tables)

        # Call the report methods in a fixed order, discarding their results.
        for method_name in ('overall', 'details', 'highlights'):
            getattr(report, method_name)()
示例#6
0
def test_sdv_multiparent():
    """Fit SDV on the ``got_families`` demo and check the three sampling modes.

    Covers sampling every table, sampling ``characters`` together with its
    children, and sampling each table on its own.
    """
    metadata, tables = load_demo('got_families', metadata=True)
    table_names = ('characters', 'families', 'character_families')

    model = SDV()
    model.fit(metadata, tables)

    # Sample all
    everything = model.sample_all()
    assert set(everything.keys()) == set(table_names)
    assert len(everything['characters']) == 7

    # Sample with children: only ``characters`` and ``character_families``
    # are expected back, and the latter must still carry ``family_id``.
    with_children = model.sample('characters', reset_primary_keys=True)
    assert set(with_children.keys()) == {'characters', 'character_families'}
    assert len(with_children['characters']) == 7
    assert 'family_id' in with_children['character_families']

    # Sample without children: each synthetic table must match the shape and
    # the column names of its real counterpart.
    for name in table_names:
        synthetic = model.sample(name, sample_children=False)
        real = tables[name]

        assert synthetic.shape == real.shape
        assert set(synthetic.columns) == set(real.columns)