Example #1
    def _load_metadata(self):
        dataset_path = os.path.join(DATA_DIR, self.name)
        metadata_path = os.path.join(dataset_path, 'metadata.json')

        try:
            # Reuse the cached metadata only if it was built with the current version.
            self.metadata = Metadata(metadata_path)
            version = self.metadata.get_table_meta(
                self.table_name)['deepecho_version']
            assert version == self.VERSION
        except Exception:
            # Missing or outdated local copy: download the dataset again.
            self._download()
            self.metadata = Metadata(metadata_path)
Example #2
File: benchmark.py Project: zyteka/SDV
def _score_dataset(dataset, datasets_path, output):
    start = datetime.now()

    try:
        if datasets_path is None:
            metadata, tables = load_demo(dataset, metadata=True)
        else:
            metadata = Metadata(
                os.path.join(datasets_path, dataset, 'metadata.json'))
            tables = metadata.load_tables()

        sdv = SDV()
        LOGGER.info('Modeling dataset %s', dataset)
        sdv.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        sampled = sdv.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(sampled, metadata=metadata)

        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score,
                    datetime.now() - start)
        output.update({
            'dataset': dataset,
            'score': score,
        })

    except Exception as ex:
        error = '{}: {}'.format(type(ex).__name__, str(ex))
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error,
                     datetime.now() - start)
        output.update({'dataset': dataset, 'error': error})
Example #3
def test_evaluate_tables_from_demo():
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions',
                       data=tables['sessions'],
                       primary_key='session_id',
                       parent='users',
                       foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%d'
        }
    }
    new_meta.add_table('transactions',
                       tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id',
                       parent='sessions')

    sdv = SDV()
    sdv.fit(new_meta, tables=tables)

    sampled = sdv.sample_all()

    table_scores = dict()
    for table in new_meta.get_tables():
        table_scores[table] = evaluate(sampled[table],
                                       real=tables[table],
                                       metadata=new_meta,
                                       table_name=table)

    evaluate(sampled, real=tables, metadata=new_meta)
Example #4
def test_build_demo_metadata_from_tables():
    """Build metadata from the demo tables.

    Then compare the built metadata with the demo one
    to make sure that they are the same.
    """
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions',
                       data=tables['sessions'],
                       primary_key='session_id',
                       parent='users',
                       foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%dT%H:%M'
        }
    }
    new_meta.add_table('transactions',
                       tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id',
                       parent='sessions')

    assert DEMO_METADATA == new_meta.to_dict()
Example #5
def test_build_demo_metadata_without_tables():
    metadata = Metadata()

    metadata.add_table('users')
    metadata.add_field('users', 'user_id', 'id', 'integer')
    metadata.add_field('users', 'country', 'categorical')
    metadata.add_field('users', 'gender', 'categorical')
    metadata.add_field('users', 'age', 'numerical', 'integer')
    metadata.set_primary_key('users', 'user_id')

    metadata.add_table('sessions')
    metadata.add_field('sessions', 'session_id', 'id', 'integer')
    metadata.add_field('sessions', 'user_id', 'id', 'integer')
    metadata.add_field('sessions', 'device', 'categorical')
    metadata.add_field('sessions', 'os', 'categorical')
    metadata.add_field('sessions', 'minutes', 'numerical', 'integer')
    metadata.set_primary_key('sessions', 'session_id')
    metadata.add_relationship('users', 'sessions')

    metadata.add_table('transactions')
    metadata.add_field('transactions', 'transaction_id', 'id', 'integer')
    metadata.add_field('transactions', 'session_id', 'id', 'integer')
    metadata.add_field('transactions',
                       'timestamp',
                       'datetime',
                       properties={'format': '%Y-%m-%dT%H:%M'})
    metadata.add_field('transactions', 'amount', 'numerical', 'float')
    metadata.add_field('transactions', 'cancelled', 'boolean')
    metadata.set_primary_key('transactions', 'transaction_id')
    metadata.add_relationship('sessions', 'transactions')

    assert DEMO_METADATA == metadata.to_dict()
Example #6
def make_dataset(name,
                 data,
                 table_name=None,
                 entity_columns=None,
                 sequence_index=None,
                 datasets_path='.'):
    """Make a Dataset from a DataFrame.

    Args:
        name (str):
            Name of this dataset.
        data (pandas.DataFrame or str):
            Data passed as a DataFrame or as a path to a CSV file.
        table_name (str or None):
            Optionally give the table a different name.
        entity_columns (list or None):
            (Optional) List of names of the columns that form the entity_id of this
            dataset. If ``None`` (default), no entity columns are set.
        sequence_index (str or None):
            (Optional) Name of the column that is the sequence index of this dataset.
        datasets_path (str):
            (Optional) Path to the folder in which a new folder will be created
            for this dataset. Defaults to the current working directory.
    """
    if isinstance(data, str):
        data = pd.read_csv(data)

    base_path = os.path.join(datasets_path, name)
    if os.path.exists(base_path):
        shutil.rmtree(base_path)

    os.makedirs(base_path, exist_ok=True)

    table_name = table_name or name

    cwd = os.getcwd()
    try:
        os.chdir(base_path)
        csv_name = table_name + '.csv'
        data.to_csv(csv_name, index=False)

        metadata = Metadata()
        metadata.add_table(name, csv_name)
        meta_dict = metadata.to_dict()
        table_meta = meta_dict['tables'][table_name]
        table_meta['entity_columns'] = entity_columns or []
        table_meta['sequence_index'] = sequence_index
        table_meta['deepecho_version'] = Dataset.VERSION

        with open('metadata.json', 'w') as metadata_file:
            json.dump(meta_dict, metadata_file, indent=4)

        LOGGER.info('Dataset %s generated in folder %s', name, base_path)

    finally:
        os.chdir(cwd)
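A minimal usage sketch for the function above, assuming the module-level imports that make_dataset relies on (Metadata, Dataset, LOGGER, os, shutil, json, pandas) are available; the DataFrame contents and column names here are hypothetical:

import pandas as pd

# Hypothetical sequence data: two stores, each with three daily observations.
dates = pd.date_range('2021-01-01', periods=3, freq='D')
data = pd.DataFrame({
    'store_id': [1, 1, 1, 2, 2, 2],
    'timestamp': list(dates) * 2,
    'sales': [10, 12, 9, 20, 22, 19],
})

# Writes ./demo_sales/demo_sales.csv plus a metadata.json describing the table.
make_dataset(
    'demo_sales',
    data,
    entity_columns=['store_id'],
    sequence_index='timestamp',
)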
Example #7
def load_multi_foreign_key():
    parent = pd.DataFrame({
        'parent_id': range(10),
        'value': range(10)
    })
    child = pd.DataFrame({
        'parent_1_id': range(10),
        'parent_2_id': range(10),
        'value': range(10)
    })

    metadata = Metadata()
    metadata.add_table('parent', parent, primary_key='parent_id')
    metadata.add_table('child', child, parent='parent', foreign_key='parent_1_id')
    metadata.add_relationship('parent', 'child', 'parent_2_id')

    return metadata, {'parent': parent, 'child': child}
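Following the pattern of Example #2 above, the returned metadata and tables could then be modelled with SDV; this is a sketch under that assumption, not part of the original file:

metadata, tables = load_multi_foreign_key()

# Fit a hierarchical model over both tables, then sample and score synthetic data.
sdv = SDV()
sdv.fit(metadata, tables)

sampled = sdv.sample_all(10)
score = evaluate(sampled, real=tables, metadata=metadata)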
Example #8
    def load(dataset, is_path=False):
        """This function loads a SDMetrics dataset which consists of a metadata
        object, a set of real tables, a set of low quality synthetic tables, and
        a set of high quality synthetic tables.

        Arguments:
            dataset (str): The name of the dataset (or the path to the dataset).

        Returns:
            (Dataset): An instance of the Dataset object.
        """
        if is_path:
            path_to_dataset = dataset
        else:
            path_to_dataset = os.path.join(_dir_, dataset)
        metadata = Metadata(os.path.join(path_to_dataset, "metadata.json"))
        tables = Dataset._load_tables(os.path.join(path_to_dataset))
        lq_synthetic = Dataset._load_tables(os.path.join(path_to_dataset, "low_quality"))
        hq_synthetic = Dataset._load_tables(os.path.join(path_to_dataset, "high_quality"))
        return Dataset(metadata, tables, lq_synthetic, hq_synthetic)
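Calling it would look roughly like the following; the dataset name and path are placeholders:

# Load a bundled dataset by name, or pass is_path=True to load from an arbitrary folder.
dataset = Dataset.load('some_dataset')
dataset_from_path = Dataset.load('/path/to/my_dataset', is_path=True)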
Example #9
def load_dataset(dataset, datasets_path=None, bucket=None):
    dataset_path = _get_dataset_path(dataset, datasets_path, bucket)
    metadata = Metadata(str(dataset_path / 'metadata.json'))
    tables = metadata.get_tables()
    if not hasattr(metadata, 'modality'):
        if len(tables) > 1:
            modality = 'multi-table'
        else:
            table = metadata.get_table_meta(tables[0])
            if any(table.get(field) for field in TIMESERIES_FIELDS):
                modality = 'timeseries'
            else:
                modality = 'single-table'

        metadata._metadata['modality'] = modality
        metadata.modality = modality

    if not hasattr(metadata, 'name'):
        metadata._metadata['name'] = dataset_path.name
        metadata.name = dataset_path.name

    return metadata
Example #10
def load_dataset(dataset,
                 datasets_path=None,
                 bucket=None,
                 aws_key=None,
                 aws_secret=None,
                 max_columns=None):
    dataset_path = _get_dataset_path(dataset, datasets_path, bucket, aws_key,
                                     aws_secret)
    with open(dataset_path / 'metadata.json') as metadata_file:
        metadata_content = json.load(metadata_file)

    if max_columns:
        if len(metadata_content['tables']) > 1:
            raise ValueError(
                'max_columns is not supported for multi-table datasets')

        _apply_max_columns_to_metadata(metadata_content, max_columns)

    metadata = Metadata(metadata_content, dataset_path)
    tables = metadata.get_tables()
    if not hasattr(metadata, 'modality'):
        if len(tables) > 1:
            modality = 'multi-table'
        else:
            table = metadata.get_table_meta(tables[0])
            if any(table.get(field) for field in TIMESERIES_FIELDS):
                modality = 'timeseries'
            else:
                modality = 'single-table'

        metadata._metadata['modality'] = modality
        metadata.modality = modality

    if not hasattr(metadata, 'name'):
        metadata._metadata['name'] = dataset_path.name
        metadata.name = dataset_path.name

    return metadata
Example #11
    def __init__(self,
                 dataset,
                 table_name=None,
                 max_entities=None,
                 segment_size=None):
        if os.path.isdir(dataset):
            self.name = os.path.basename(dataset)
            self.table_name = table_name or self.name
            self.metadata = Metadata(os.path.join(dataset, 'metadata.json'))
        else:
            self.name = dataset
            self.table_name = table_name or self.name
            self._load_metadata()

        self._load_table()

        table_meta = self.metadata.get_table_meta(self.table_name)
        self.entity_columns = table_meta.get('entity_columns') or []
        self.sequence_index = table_meta.get('sequence_index')
        if 'context_columns' in table_meta:
            self.context_columns = table_meta['context_columns']
        else:
            self.context_columns = self._get_context_columns()

        self.model_columns = [
            column for column in self.data.columns
            if column not in self.entity_columns + self.context_columns +
            [self.sequence_index]
        ]

        if max_entities:
            self._filter_entities(max_entities)

        if not segment_size:
            self.evaluation_data = self.data
        else:
            self.evaluation_data = self._get_evaluation_data(segment_size)
Example #12
size = 1000  # assumed sample size; the original snippet is truncated before this point
tables = {
    "table1":
    pd.DataFrame({
        "x": np.random.random(size=size),
        "y": np.random.normal(size=size, loc=10.0)
    })
}
lq_synthetic = {
    "table1":
    pd.DataFrame({
        "x":
        np.random.random(size=size) + np.random.normal(size=size),
        "y":
        np.random.normal(size=size, loc=10.0) + np.random.normal(size=size)
    })
}
hq_synthetic = {
    "table1":
    pd.DataFrame({
        "x":
        np.random.random(size=size) + np.random.normal(size=size) / 10.0,
        "y":
        np.random.normal(size=size, loc=10.0) +
        np.random.normal(size=size) / 10.0
    })
}

metadata = Metadata()
for table_name, df in tables.items():
    metadata.add_table(table_name, data=df)
dataset = Dataset(metadata, tables, lq_synthetic, hq_synthetic)
dataset.save(os.path.dirname(__file__))