def _load_metadata(self):
    dataset_path = os.path.join(DATA_DIR, self.name)
    metadata_path = os.path.join(dataset_path, 'metadata.json')
    try:
        # If the metadata exists locally and matches this version, use it.
        self.metadata = Metadata(metadata_path)
        version = self.metadata.get_table_meta(self.table_name)['deepecho_version']
        assert version == self.VERSION
    except Exception:
        # Missing, unreadable or outdated: download a fresh copy.
        self._download()
        self.metadata = Metadata(metadata_path)

def _score_dataset(dataset, datasets_path, output):
    start = datetime.now()
    try:
        if datasets_path is None:
            metadata, tables = load_demo(dataset, metadata=True)
        else:
            metadata = Metadata(os.path.join(datasets_path, dataset, 'metadata.json'))
            tables = metadata.load_tables()

        sdv = SDV()

        LOGGER.info('Modeling dataset %s', dataset)
        sdv.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        sampled = sdv.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(sampled, metadata=metadata)

        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score, datetime.now() - start)
        output.update({
            'dataset': dataset,
            'score': score,
        })

    except Exception as ex:
        error = '{}: {}'.format(type(ex).__name__, str(ex))
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error, datetime.now() - start)
        output.update({'dataset': dataset, 'error': error})

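# Usage sketch for `_score_dataset` (hedged: 'census' is an assumed demo
# dataset name, and a plain dict stands in for the shared dict a benchmark
# runner would normally pass, e.g. a multiprocessing.Manager().dict(), so
# that partial results survive if the worker process is killed on timeout).
output = {}
_score_dataset('census', None, output)
print(output)  # {'dataset': ..., 'score': ...} on success, else {'dataset': ..., 'error': ...}
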
def test_evaluate_tables_from_demo():
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions', data=tables['sessions'], primary_key='session_id',
                       parent='users', foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%d'
        }
    }
    new_meta.add_table('transactions', tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id', parent='sessions')

    sdv = SDV()
    sdv.fit(new_meta, tables=tables)
    sampled = sdv.sample_all()

    # Evaluate each table individually, then the whole dataset at once.
    table_scores = dict()
    for table in new_meta.get_tables():
        table_scores[table] = evaluate(
            sampled[table], real=tables[table], metadata=new_meta, table_name=table)

    evaluate(sampled, real=tables, metadata=new_meta)

def test_build_demo_metadata_from_tables():
    """Build metadata from the demo tables.

    Then compare the built metadata with the demo one
    to make sure that they are the same.
    """
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions', data=tables['sessions'], primary_key='session_id',
                       parent='users', foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%dT%H:%M'
        }
    }
    new_meta.add_table('transactions', tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id', parent='sessions')

    assert DEMO_METADATA == new_meta.to_dict()

def test_build_demo_metadata_without_tables():
    metadata = Metadata()

    metadata.add_table('users')
    metadata.add_field('users', 'user_id', 'id', 'integer')
    metadata.add_field('users', 'country', 'categorical')
    metadata.add_field('users', 'gender', 'categorical')
    metadata.add_field('users', 'age', 'numerical', 'integer')
    metadata.set_primary_key('users', 'user_id')

    metadata.add_table('sessions')
    metadata.add_field('sessions', 'session_id', 'id', 'integer')
    metadata.add_field('sessions', 'user_id', 'id', 'integer')
    metadata.add_field('sessions', 'device', 'categorical')
    metadata.add_field('sessions', 'os', 'categorical')
    metadata.add_field('sessions', 'minutes', 'numerical', 'integer')
    metadata.set_primary_key('sessions', 'session_id')
    metadata.add_relationship('users', 'sessions')

    metadata.add_table('transactions')
    metadata.add_field('transactions', 'transaction_id', 'id', 'integer')
    metadata.add_field('transactions', 'session_id', 'id', 'integer')
    metadata.add_field('transactions', 'timestamp', 'datetime',
                       properties={'format': '%Y-%m-%dT%H:%M'})
    metadata.add_field('transactions', 'amount', 'numerical', 'float')
    metadata.add_field('transactions', 'cancelled', 'boolean')
    metadata.set_primary_key('transactions', 'transaction_id')
    metadata.add_relationship('sessions', 'transactions')

    assert DEMO_METADATA == metadata.to_dict()

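# For orientation, a hedged sketch of roughly what the 'users' entry of
# DEMO_METADATA should look like after the calls above. The exact dict
# layout is an assumption inferred from the add_field arguments, not
# copied from the source:
USERS_META_SKETCH = {
    'fields': {
        'user_id': {'type': 'id', 'subtype': 'integer'},
        'country': {'type': 'categorical'},
        'gender': {'type': 'categorical'},
        'age': {'type': 'numerical', 'subtype': 'integer'},
    },
    'primary_key': 'user_id',
}
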
def make_dataset(name, data, table_name=None, entity_columns=None,
                 sequence_index=None, datasets_path='.'):
    """Make a Dataset from a DataFrame.

    Args:
        name (str):
            Name of this dataset.
        data (pandas.DataFrame or str):
            Data passed as a DataFrame or as a path to a CSV file.
        table_name (str or None):
            (Optional) Give the table a different name than the dataset.
        entity_columns (list or None):
            (Optional) List of names of the columns that form the entity_id of
            this dataset. If ``None`` (default), no entity columns are set.
        sequence_index (str or None):
            (Optional) Name of the column that is the sequence index of this dataset.
        datasets_path (str):
            (Optional) Path to the folder in which a new folder will be created
            for this dataset. Defaults to the current working directory.
    """
    if isinstance(data, str):
        data = pd.read_csv(data)

    base_path = os.path.join(datasets_path, name)
    if os.path.exists(base_path):
        shutil.rmtree(base_path)

    os.makedirs(base_path, exist_ok=True)

    table_name = table_name or name
    cwd = os.getcwd()
    try:
        os.chdir(base_path)
        csv_name = table_name + '.csv'
        data.to_csv(csv_name, index=False)
        metadata = Metadata()
        # Register the table under table_name so the lookup below matches
        # even when a custom table_name was given.
        metadata.add_table(table_name, csv_name)
        meta_dict = metadata.to_dict()
        table_meta = meta_dict['tables'][table_name]
        table_meta['entity_columns'] = entity_columns or []
        table_meta['sequence_index'] = sequence_index
        table_meta['deepecho_version'] = Dataset.VERSION
        with open('metadata.json', 'w') as metadata_file:
            json.dump(meta_dict, metadata_file, indent=4)

        LOGGER.info('Dataset %s generated in folder %s', name, base_path)
    finally:
        os.chdir(cwd)

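# Usage sketch for make_dataset. The DataFrame, column names, and path
# below are illustrative assumptions, not taken from the source:
example = pd.DataFrame({
    'store': [0, 0, 1, 1],
    'day': ['2020-01-01', '2020-01-02', '2020-01-01', '2020-01-02'],
    'sales': [10.0, 12.5, 7.0, 8.25],
})
make_dataset(
    'stores',
    example,
    entity_columns=['store'],
    sequence_index='day',
    datasets_path='/tmp/datasets',
)
# Expected result: /tmp/datasets/stores/ containing stores.csv and metadata.json.
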
def load_multi_foreign_key():
    parent = pd.DataFrame({
        'parent_id': range(10),
        'value': range(10)
    })
    child = pd.DataFrame({
        'parent_1_id': range(10),
        'parent_2_id': range(10),
        'value': range(10)
    })

    metadata = Metadata()
    metadata.add_table('parent', parent, primary_key='parent_id')
    metadata.add_table('child', child, parent='parent', foreign_key='parent_1_id')
    metadata.add_relationship('parent', 'child', 'parent_2_id')

    return metadata, {'parent': parent, 'child': child}

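# Usage sketch: the helper returns the metadata plus the raw tables, with
# 'child' carrying two foreign keys into 'parent' (parent_1_id registered
# via add_table, parent_2_id via the explicit add_relationship call).
metadata, tables = load_multi_foreign_key()
print(metadata.get_tables())             # e.g. ['parent', 'child']
print(tables['child'].columns.tolist())  # ['parent_1_id', 'parent_2_id', 'value']
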
def load(dataset, is_path=False):
    """Load an SDMetrics dataset.

    A dataset consists of a metadata object, a set of real tables, a set of
    low quality synthetic tables, and a set of high quality synthetic tables.

    Arguments:
        dataset (str):
            The name of the dataset (or the path to the dataset).
        is_path (bool):
            Whether ``dataset`` is a direct path instead of a name.

    Returns:
        (Dataset): An instance of the Dataset object.
    """
    if is_path:
        path_to_dataset = dataset
    else:
        path_to_dataset = os.path.join(_dir_, dataset)

    metadata = Metadata(os.path.join(path_to_dataset, "metadata.json"))
    tables = Dataset._load_tables(path_to_dataset)
    lq_synthetic = Dataset._load_tables(os.path.join(path_to_dataset, "low_quality"))
    hq_synthetic = Dataset._load_tables(os.path.join(path_to_dataset, "high_quality"))
    return Dataset(metadata, tables, lq_synthetic, hq_synthetic)

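# Usage sketch (hypothetical path): loading a dataset folder directly.
# The folder is expected to contain metadata.json plus low_quality/ and
# high_quality/ subfolders holding the synthetic tables.
dataset = load('/path/to/my_dataset', is_path=True)
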
def load_dataset(dataset, datasets_path=None, bucket=None):
    dataset_path = _get_dataset_path(dataset, datasets_path, bucket)
    metadata = Metadata(str(dataset_path / 'metadata.json'))
    tables = metadata.get_tables()

    # Older metadata files may not declare a modality; infer one.
    if not hasattr(metadata, 'modality'):
        if len(tables) > 1:
            modality = 'multi-table'
        else:
            table = metadata.get_table_meta(tables[0])
            if any(table.get(field) for field in TIMESERIES_FIELDS):
                modality = 'timeseries'
            else:
                modality = 'single-table'

        metadata._metadata['modality'] = modality
        metadata.modality = modality

    if not hasattr(metadata, 'name'):
        metadata._metadata['name'] = dataset_path.name
        metadata.name = dataset_path.name

    return metadata

def load_dataset(dataset, datasets_path=None, bucket=None, aws_key=None,
                 aws_secret=None, max_columns=None):
    dataset_path = _get_dataset_path(dataset, datasets_path, bucket, aws_key, aws_secret)
    with open(dataset_path / 'metadata.json') as metadata_file:
        metadata_content = json.load(metadata_file)

    if max_columns:
        if len(metadata_content['tables']) > 1:
            raise ValueError('max_columns is not supported for multi-table datasets')

        _apply_max_columns_to_metadata(metadata_content, max_columns)

    metadata = Metadata(metadata_content, dataset_path)
    tables = metadata.get_tables()

    # Older metadata files may not declare a modality; infer one.
    if not hasattr(metadata, 'modality'):
        if len(tables) > 1:
            modality = 'multi-table'
        else:
            table = metadata.get_table_meta(tables[0])
            if any(table.get(field) for field in TIMESERIES_FIELDS):
                modality = 'timeseries'
            else:
                modality = 'single-table'

        metadata._metadata['modality'] = modality
        metadata.modality = modality

    if not hasattr(metadata, 'name'):
        metadata._metadata['name'] = dataset_path.name
        metadata.name = dataset_path.name

    return metadata

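# Usage sketch (hypothetical name and path): trimming a wide single-table
# dataset before benchmarking. Passing max_columns for a multi-table
# dataset raises the ValueError above.
metadata = load_dataset('my_dataset', datasets_path='/tmp/datasets', max_columns=20)
print(metadata.name, metadata.modality)  # attributes filled in by the fallback logic
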
def __init__(self, dataset, table_name=None, max_entities=None, segment_size=None):
    if os.path.isdir(dataset):
        # Local folder: read the metadata straight from disk.
        self.name = os.path.basename(dataset)
        self.table_name = table_name or self.name
        self.metadata = Metadata(os.path.join(dataset, 'metadata.json'))
    else:
        # Dataset name: fetch (and cache) the metadata.
        self.name = dataset
        self.table_name = table_name or self.name
        self._load_metadata()

    self._load_table()

    table_meta = self.metadata.get_table_meta(self.table_name)
    self.entity_columns = table_meta.get('entity_columns') or []
    self.sequence_index = table_meta.get('sequence_index')
    if 'context_columns' in table_meta:
        self.context_columns = table_meta['context_columns']
    else:
        self.context_columns = self._get_context_columns()

    self.model_columns = [
        column for column in self.data.columns
        if column not in self.entity_columns + self.context_columns + [self.sequence_index]
    ]

    if max_entities:
        self._filter_entities(max_entities)

    if not segment_size:
        self.evaluation_data = self.data
    else:
        self.evaluation_data = self._get_evaluation_data(segment_size)

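# Usage sketch: a Dataset can be built from a local folder (which must
# contain metadata.json) or by name, in which case _load_metadata fetches
# and verifies it. The path and keyword values here are illustrative:
dataset = Dataset('/path/to/dataset_folder', max_entities=100, segment_size=50)
print(dataset.entity_columns, dataset.context_columns, dataset.model_columns)
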
# `size` is assumed to be defined earlier in the surrounding script.
tables = {
    "table1": pd.DataFrame({
        "x": np.random.random(size=size),
        "y": np.random.normal(size=size, loc=10.0)
    })
}
lq_synthetic = {
    "table1": pd.DataFrame({
        # Low quality: add full-scale Gaussian noise to both columns.
        "x": np.random.random(size=size) + np.random.normal(size=size),
        "y": np.random.normal(size=size, loc=10.0) + np.random.normal(size=size)
    })
}
hq_synthetic = {
    "table1": pd.DataFrame({
        # High quality: same construction, but the noise is scaled down 10x.
        "x": np.random.random(size=size) + np.random.normal(size=size) / 10.0,
        "y": np.random.normal(size=size, loc=10.0) + np.random.normal(size=size) / 10.0
    })
}

metadata = Metadata()
for table_name, df in tables.items():
    metadata.add_table(table_name, data=df)

dataset = Dataset(metadata, tables, lq_synthetic, hq_synthetic)
dataset.save(os.path.dirname(__file__))

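# The fixture above samples np.random without a seed, so every run writes
# different tables to disk. If reproducible fixtures are wanted, seeding
# NumPy's global generator before building the tables is a minimal fix
# (the seed value is arbitrary):
np.random.seed(42)
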